From 00bcafe82e8a850b993120cd04fc37ce68a9a04b Mon Sep 17 00:00:00 2001
From: Fedor Dikarev <fedor@neon.tech>
Date: Mon, 31 Mar 2025 16:21:07 +0200
Subject: [PATCH 001/140] chore(ci): upgrade stats action with docker images
 from ghcr.io (#11378)

## Problem
Current version of GitHub Workflow Stats action pull docker images from
DockerHub, that could be an issue with the new pull limits on DockerHub
side.

## Summary of changes
Switch to version `v0.2.2`, with docker images hosted on `ghcr.io`
---
 .github/workflows/report-workflow-stats-batch.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/report-workflow-stats-batch.yml b/.github/workflows/report-workflow-stats-batch.yml
index 0f52d24787..6e5093ebd6 100644
--- a/.github/workflows/report-workflow-stats-batch.yml
+++ b/.github/workflows/report-workflow-stats-batch.yml
@@ -23,7 +23,7 @@ jobs:
         egress-policy: audit
 
     - name: Export Workflow Run for the past 2 hours
-      uses: neondatabase/gh-workflow-stats-action@4c998b25ab5cc6588b52a610b749531f6a566b6b # v0.2.1
+      uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2
       with:
         db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
         db_table: "gh_workflow_stats_neon"
@@ -43,7 +43,7 @@ jobs:
         egress-policy: audit
 
     - name: Export Workflow Run for the past 48 hours
-      uses: neondatabase/gh-workflow-stats-action@4c998b25ab5cc6588b52a610b749531f6a566b6b # v0.2.1
+      uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2
       with:
         db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
         db_table: "gh_workflow_stats_neon"
@@ -63,7 +63,7 @@ jobs:
         egress-policy: audit
 
     - name: Export Workflow Run for the past 30 days
-      uses: neondatabase/gh-workflow-stats-action@4c998b25ab5cc6588b52a610b749531f6a566b6b # v0.2.1
+      uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2
       with:
         db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
         db_table: "gh_workflow_stats_neon"

From 0ee5bfa2fc01372737fe0e108ae40b3e6f5801a7 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Mon, 31 Mar 2025 12:32:55 -0400
Subject: [PATCH 002/140] fix(pageserver): allow sibling archived branch for
 detaching (#11383)

## Problem

close https://github.com/neondatabase/neon/issues/11379

## Summary of changes

Remove checks around archived branches for detach v2. I also updated the
comments `ancestor_retain_lsn`.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 .../src/tenant/timeline/detach_ancestor.rs    | 65 ++++++++++++-------
 .../regress/test_timeline_detach_ancestor.py  | 33 ++++++++--
 2 files changed, 70 insertions(+), 28 deletions(-)

diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index ac9d9a4579..ca1d81c691 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -235,7 +235,7 @@ pub(super) async fn prepare(
         return Err(NoAncestor);
     }
 
-    check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;
+    check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn, behavior)?;
 
     if let DetachBehavior::MultiLevelAndNoReparent = behavior {
         // If the ancestor has an ancestor, we might be able to fast-path detach it if the current ancestor does not have any data written/used by the detaching timeline.
@@ -249,7 +249,13 @@ pub(super) async fn prepare(
             ancestor_lsn = ancestor.ancestor_lsn; // Get the LSN first before resetting the `ancestor` variable
             ancestor = ancestor_of_ancestor;
             // TODO: do we still need to check if we don't want to reparent?
-            check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;
+            check_no_archived_children_of_ancestor(
+                tenant,
+                detached,
+                &ancestor,
+                ancestor_lsn,
+                behavior,
+            )?;
         }
     } else if ancestor.ancestor_timeline.is_some() {
         // non-technical requirement; we could flatten N ancestors just as easily but we chose
@@ -1156,31 +1162,44 @@ fn check_no_archived_children_of_ancestor(
     detached: &Arc<Timeline>,
     ancestor: &Arc<Timeline>,
     ancestor_lsn: Lsn,
+    detach_behavior: DetachBehavior,
 ) -> Result<(), Error> {
-    let timelines = tenant.timelines.lock().unwrap();
-    let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
-    for timeline in reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) {
-        if timeline.is_archived() == Some(true) {
-            return Err(Error::Archived(timeline.timeline_id));
-        }
-    }
-    for timeline_offloaded in timelines_offloaded.values() {
-        if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) {
-            continue;
-        }
-        // This forbids the detach ancestor feature if flattened timelines are present,
-        // even if the ancestor_lsn is from after the branchpoint of the detached timeline.
-        // But as per current design, we don't record the ancestor_lsn of flattened timelines.
-        // This is a bit unfortunate, but as of writing this we don't support flattening
-        // anyway. Maybe we can evolve the data model in the future.
-        if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn {
-            let is_earlier = retain_lsn <= ancestor_lsn;
-            if !is_earlier {
-                continue;
+    match detach_behavior {
+        DetachBehavior::NoAncestorAndReparent => {
+            let timelines = tenant.timelines.lock().unwrap();
+            let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
+
+            for timeline in
+                reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn)
+            {
+                if timeline.is_archived() == Some(true) {
+                    return Err(Error::Archived(timeline.timeline_id));
+                }
+            }
+
+            for timeline_offloaded in timelines_offloaded.values() {
+                if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) {
+                    continue;
+                }
+                // This forbids the detach ancestor feature if flattened timelines are present,
+                // even if the ancestor_lsn is from after the branchpoint of the detached timeline.
+                // But as per current design, we don't record the ancestor_lsn of flattened timelines.
+                // This is a bit unfortunate, but as of writing this we don't support flattening
+                // anyway. Maybe we can evolve the data model in the future.
+                if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn {
+                    let is_earlier = retain_lsn <= ancestor_lsn;
+                    if !is_earlier {
+                        continue;
+                    }
+                }
+                return Err(Error::Archived(timeline_offloaded.timeline_id));
             }
         }
-        return Err(Error::Archived(timeline_offloaded.timeline_id));
+        DetachBehavior::MultiLevelAndNoReparent => {
+            // We don't need to check anything if the user requested to not reparent.
+        }
     }
+
     Ok(())
 }
 
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 2a916438e5..34c251285f 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -343,7 +343,8 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
     wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline)
 
 
-def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("snapshots_archived", ["archived", "normal"])
+def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots_archived: str):
     """
     Test the v2 behavior of ancestor detach.
 
@@ -385,6 +386,11 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder):
 
         ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);")
 
+        branchpoint_y = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
+        client.timeline_checkpoint(env.initial_tenant, env.initial_timeline)
+
+        ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);")
+
         branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
         client.timeline_checkpoint(env.initial_tenant, env.initial_timeline)
 
@@ -395,6 +401,10 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder):
         "earlier", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_pipe
     )
 
+    snapshot_branchpoint_old = env.create_branch(
+        "snapshot_branchpoint_old", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_y
+    )
+
     snapshot_branchpoint = env.create_branch(
         "snapshot_branchpoint", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_x
     )
@@ -407,19 +417,32 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder):
 
     after = env.create_branch("after", ancestor_branch_name="main", ancestor_start_lsn=None)
 
+    if snapshots_archived == "archived":
+        # archive the previous snapshot branchpoint
+        client.timeline_archival_config(
+            env.initial_tenant, snapshot_branchpoint_old, TimelineArchivalState.ARCHIVED
+        )
+
     all_reparented = client.detach_ancestor(
         env.initial_tenant, branch_to_detach, detach_behavior="v2"
     )
     assert set(all_reparented) == set()
 
+    if snapshots_archived == "archived":
+        # restore the branchpoint so that we can query from the endpoint
+        client.timeline_archival_config(
+            env.initial_tenant, snapshot_branchpoint_old, TimelineArchivalState.UNARCHIVED
+        )
+
     env.pageserver.quiesce_tenants()
 
     # checking the ancestor after is much faster than waiting for the endpoint not start
     expected_result = [
-        ("main", env.initial_timeline, None, 16384, 1),
-        ("after", after, env.initial_timeline, 16384, 1),
-        ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 8192, 1),
-        ("branch_to_detach", branch_to_detach, None, 8192, 1),
+        ("main", env.initial_timeline, None, 24576, 1),
+        ("after", after, env.initial_timeline, 24576, 1),
+        ("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1),
+        ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1),
+        ("branch_to_detach", branch_to_detach, None, 16384, 1),
         ("earlier", earlier, env.initial_timeline, 0, 1),
     ]
 

From e5b95bc9dc20b288a8dfe463d2a9e1d919dc68cb Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Mon, 31 Mar 2025 19:04:00 +0200
Subject: [PATCH 003/140] Neon LFC/prefetch: Improve page read handling
 (#11380)

Previously we had different meanings for the bitmask of vector IOps.
That has now been unified to "bit set = final result, no more
scribbling".

Furthermore, the LFC read path scribbled on pages that were already
read; that's probably not a good thing so that's been fixed too. In
passing, the read path of LFC has been updated to read only the
requested pages into the provided buffers, thus reducing the IO size of
vectorized IOs.

## Problem

## Summary of changes
---
 pgxn/neon/file_cache.c       | 88 +++++++++++++++++++++++++++++-------
 pgxn/neon/pagestore_client.h |  2 +-
 pgxn/neon/pagestore_smgr.c   | 49 +++++++++-----------
 3 files changed, 94 insertions(+), 45 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index e555e069d0..91f5eb272a 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -647,18 +647,25 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	return found;
 }
 
+#if PG_MAJORVERSION_NUM >= 16
+static PGIOAlignedBlock voidblock = {0};
+#else
+static PGAlignedBlock voidblock = {0};
+#endif
+#define SCRIBBLEPAGE (&voidblock.data)
+
 /*
  * Try to read pages from local cache.
  * Returns the number of pages read from the local cache, and sets bits in
- * 'read' for the pages which were read. This may scribble over buffers not
- * marked in 'read', so be careful with operation ordering.
+ * 'mask' for the pages which were read. This may scribble over buffers not
+ * marked in 'mask', so be careful with operation ordering.
  *
  * In case of error local file cache is disabled (lfc->limit is set to zero),
- * and -1 is returned. Note that 'read' and the buffers may be touched and in
- * an otherwise invalid state.
+ * and -1 is returned.
  *
- * If the mask argument is supplied, bits will be set at the offsets of pages
- * that were present and read from the LFC.
+ * If the mask argument is supplied, we'll only try to read those pages which
+ * don't have their bits set on entry. At exit, pages which were successfully
+ * read from LFC will have their bits set.
  */
 int
 lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
@@ -693,23 +700,43 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	while (nblocks > 0)
 	{
 		struct iovec iov[PG_IOV_MAX];
-		int		chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+		int8	chunk_mask[BLOCKS_PER_CHUNK / 8] = {0};
+		int		chunk_offs = (blkno & (BLOCKS_PER_CHUNK - 1));
 		int		blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK));
 		int		iteration_hits = 0;
 		int		iteration_misses = 0;
 		uint64	io_time_us = 0;
-		int     n_blocks_to_read = 0;
+		int		n_blocks_to_read = 0;
+		int		iov_last_used = 0;
+		int		first_block_in_chunk_read = -1;
 		ConditionVariable* cv;
 
 		Assert(blocks_in_chunk > 0);
 
 		for (int i = 0; i < blocks_in_chunk; i++)
 		{
-			n_blocks_to_read += (BITMAP_ISSET(mask, buf_offset + i) != 0);
-			iov[i].iov_base = buffers[buf_offset + i];
 			iov[i].iov_len = BLCKSZ;
-			BITMAP_CLR(mask,  buf_offset + i);
+			/* mask not set = we must do work */
+			if (!BITMAP_ISSET(mask, buf_offset + i))
+			{
+				iov[i].iov_base = buffers[buf_offset + i];
+				n_blocks_to_read++;
+				iov_last_used = i + 1;
+
+				if (first_block_in_chunk_read == -1)
+				{
+					first_block_in_chunk_read = i;
+				}
+			}
+			/* mask set = we must do no work */
+			else
+			{
+				/* don't scribble on pages we weren't requested to write to */
+				iov[i].iov_base = SCRIBBLEPAGE;
+			}
 		}
+
+		/* shortcut IO */
 		if (n_blocks_to_read == 0)
 		{
 			buf_offset += blocks_in_chunk;
@@ -718,6 +745,12 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			continue;
 		}
 
+		/*
+		 * The effective iov size must be >= the number of blocks we're about
+		 * to read.
+		 */
+		Assert(iov_last_used - first_block_in_chunk_read >= n_blocks_to_read);
+
 		tag.blockNum = blkno - chunk_offs;
 		hash = get_hash_value(lfc_hash, &tag);
 		cv = &lfc_ctl->cv[hash % N_COND_VARS];
@@ -762,10 +795,15 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		generation = lfc_ctl->generation;
 		entry_offset = entry->offset;
 
-		for (int i = 0; i < blocks_in_chunk; i++)
+		for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
 		{
 			FileCacheBlockState state = UNAVAILABLE;
 			bool sleeping = false;
+
+			/* no need to work on something we're not interested in */
+			if (BITMAP_ISSET(mask, buf_offset + i))
+				continue;
+
 			while (lfc_ctl->generation == generation)
 			{
 				state = GET_STATE(entry, chunk_offs + i);
@@ -789,7 +827,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			}
 			if (state == AVAILABLE)
 			{
-				BITMAP_SET(mask, buf_offset + i);
+				BITMAP_SET(chunk_mask, i);
 				iteration_hits++;
 			}
 			else
@@ -801,16 +839,34 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 
 		if (iteration_hits != 0)
 		{
+			/* chunk offset (# of pages) into the LFC file */
+			off_t	first_read_offset = (off_t) entry_offset * BLOCKS_PER_CHUNK;
+			int		nwrite = iov_last_used - first_block_in_chunk_read;
+			/* offset of first IOV */
+			first_read_offset += chunk_offs + first_block_in_chunk_read;
+
 			pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
-			rc = preadv(lfc_desc, iov, blocks_in_chunk,
-						((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+
+			/* Read only the blocks we're interested in, limiting */
+			rc = preadv(lfc_desc, &iov[first_block_in_chunk_read],
+						nwrite, first_read_offset * BLCKSZ);
 			pgstat_report_wait_end();
 
-			if (rc != (BLCKSZ * blocks_in_chunk))
+			if (rc != (BLCKSZ * nwrite))
 			{
 				lfc_disable("read");
 				return -1;
 			}
+
+			/*
+			 * We successfully read the pages we know were valid when we
+			 * started reading; now mark those pages as read
+			 */
+			for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
+			{
+				if (BITMAP_ISSET(chunk_mask, i))
+					BITMAP_SET(mask, buf_offset + i);
+			}
 		}
 
 		/* Place entry to the head of LRU list */
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 475697f9c0..68f7430343 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -315,7 +315,7 @@ static inline bool
 lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		 void *buffer)
 {
-	bits8		rv = 1;
+	bits8		rv = 0;
 	return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
 }
 
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index ddcee74ff3..2424a5fcb6 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1081,6 +1081,9 @@ prefetch_lookup(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkn, neon_r
  * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
  * to calculate the LSNs to send.
  *
+ * Bits set in *mask (if present) indicate pages already read; i.e. pages we
+ * can skip in this process.
+ *
  * When performing a prefetch rather than a synchronous request,
  * is_prefetch==true. Currently, it only affects how the request is accounted
  * in the perf counters.
@@ -1126,7 +1129,7 @@ Retry:
 		uint64		ring_index;
 		neon_request_lsns *lsns;
 
-		if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
+		if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
 			continue;
 
 		if (frlsns)
@@ -3026,9 +3029,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 		tag.blockNum = blocknum;
 
-		for (int i = 0; i < PG_IOV_MAX / 8; i++)
-			lfc_present[i] = ~(lfc_present[i]);
-
 		ring_index = prefetch_register_bufferv(tag, NULL, iterblocks,
 											   lfc_present, true);
 
@@ -3134,6 +3134,15 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 #endif
 }
 
+/*
+ * Read N pages at a specific LSN.
+ *
+ * *mask is set for pages read at a previous point in time, and which we
+ * should not touch, nor overwrite.
+ * New bits should be set in *mask for the pages we'successfully read.
+ *
+ * The offsets in request_lsns, buffers, and mask are linked.
+ */
 static void
 #if PG_MAJORVERSION_NUM < 16
 neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns,
@@ -3186,7 +3195,7 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
 		neon_request_lsns *reqlsns = &request_lsns[i];
 		TimestampTz		start_ts, end_ts;
 
-		if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
+		if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
 			continue;
 
 		start_ts = GetCurrentTimestamp();
@@ -3485,9 +3494,7 @@ static void
 neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		   void **buffers, BlockNumber nblocks)
 {
-	bits8		prefetch_hits[PG_IOV_MAX / 8] = {0};
-	bits8		lfc_hits[PG_IOV_MAX / 8];
-	bits8		read[PG_IOV_MAX / 8];
+	bits8		read_pages[PG_IOV_MAX / 8];
 	neon_request_lsns request_lsns[PG_IOV_MAX];
 	int			lfc_result;
 	int			prefetch_result;
@@ -3519,19 +3526,18 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
 						  request_lsns, nblocks);
 
+	memset(read_pages, 0, sizeof(read_pages));
 
-	prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks, buffers, prefetch_hits);
+	prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
+									   blocknum, request_lsns, nblocks,
+									   buffers, read_pages);
 
 	if (prefetch_result == nblocks)
 		return;
 
-	/* invert the result: exclude prefetched blocks */
-	for (int i = 0; i < PG_IOV_MAX / 8; i++)
-		lfc_hits[i] = ~prefetch_hits[i];
-
 	/* Try to read from local file cache */
 	lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
-								  nblocks, lfc_hits);
+								  nblocks, read_pages);
 
 	if (lfc_result > 0)
 		MyNeonCounters->file_cache_hits_total += lfc_result;
@@ -3540,21 +3546,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	if (prefetch_result + lfc_result == nblocks)
 		return;
 
-	if (lfc_result <= 0)
-	{
-		/* can't use the LFC result, so read all blocks from PS */
-		for (int i = 0; i < PG_IOV_MAX / 8; i++)
-			read[i] = ~prefetch_hits[i];
-	}
-	else
-	{
-		/* invert the result: exclude blocks read from lfc */
-		for (int i = 0; i < PG_IOV_MAX / 8; i++)
-			read[i] = ~(prefetch_hits[i] | lfc_hits[i]);
-	}
-
 	neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
-					  buffers, nblocks, read);
+					  buffers, nblocks, read_pages);
 
 	/*
 	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.

From 47d47000dfd5cdbe4425eaae43f955974f1595d7 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Mon, 31 Mar 2025 15:16:42 -0400
Subject: [PATCH 004/140] fix(pageserver): passthrough lsn lease in storcon API
 (#11386)

## Problem

part of https://github.com/neondatabase/cloud/issues/23667

## Summary of changes

lsn_lease API can only be used on pageservers. This patch enables
storcon passthrough.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_controller/src/http.rs              | 43 +++++++++++-
 storage_controller/src/pageserver_client.rs | 19 +++++-
 storage_controller/src/service.rs           | 75 ++++++++++++++++++++-
 test_runner/regress/test_tenant_size.py     | 25 +++++++
 4 files changed, 156 insertions(+), 6 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index a5e00e18e8..79332ea304 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -24,9 +24,9 @@ use pageserver_api::controller_api::{
     ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest,
 };
 use pageserver_api::models::{
-    DetachBehavior, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
-    TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest,
-    TimelineCreateRequest,
+    DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest,
+    TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest,
+    TimelineArchivalConfigRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
@@ -582,6 +582,32 @@ async fn handle_tenant_timeline_download_heatmap_layers(
     json_response(StatusCode::OK, ())
 }
 
+async fn handle_tenant_timeline_lsn_lease(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
+    check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
+
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let lsn_lease_request = json_request::<LsnLeaseRequest>(&mut req).await?;
+
+    service
+        .tenant_timeline_lsn_lease(tenant_id, timeline_id, lsn_lease_request.lsn)
+        .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 // For metric labels where we would like to include the approximate path, but exclude high-cardinality fields like query parameters
 // and tenant/timeline IDs.  Since we are proxying to arbitrary paths, we don't have routing templates to
 // compare to, so we can just filter out our well known ID format with regexes.
@@ -2192,6 +2218,17 @@ pub fn make_router(
                 )
             },
         )
+        // LSN lease passthrough to all shards
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease",
+            |r| {
+                tenant_service_handler(
+                    r,
+                    handle_tenant_timeline_lsn_lease,
+                    RequestName("v1_tenant_timeline_lsn_lease"),
+                )
+            },
+        )
         // Tenant detail GET passthrough to shard zero:
         .get("/v1/tenant/:tenant_id", |r| {
             tenant_service_handler(
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
index c6c21107f1..d14fc35b39 100644
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -1,6 +1,6 @@
 use pageserver_api::models::detach_ancestor::AncestorDetached;
 use pageserver_api::models::{
-    DetachBehavior, LocationConfig, LocationConfigListResponse, PageserverUtilization,
+    DetachBehavior, LocationConfig, LocationConfigListResponse, LsnLease, PageserverUtilization,
     SecondaryProgress, TenantScanRemoteStorageResponse, TenantShardSplitRequest,
     TenantShardSplitResponse, TenantWaitLsnRequest, TimelineArchivalConfigRequest,
     TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse,
@@ -10,6 +10,7 @@ use pageserver_client::BlockUnblock;
 use pageserver_client::mgmt_api::{Client, Result};
 use reqwest::StatusCode;
 use utils::id::{NodeId, TenantId, TimelineId};
+use utils::lsn::Lsn;
 
 /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
 /// controller to collect metrics in a non-intrusive manner.
@@ -195,6 +196,22 @@ impl PageserverClient {
         )
     }
 
+    pub(crate) async fn timeline_lease_lsn(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Result<LsnLease> {
+        measured_request!(
+            "timeline_lease_lsn",
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner
+                .timeline_init_lsn_lease(tenant_shard_id, timeline_id, lsn)
+                .await
+        )
+    }
+
     pub(crate) async fn tenant_shard_split(
         &self,
         tenant_shard_id: TenantShardId,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index d3c8cad0bd..9f308d9a0b 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -12,7 +12,7 @@ use std::ops::{Deref, DerefMut};
 use std::path::PathBuf;
 use std::str::FromStr;
 use std::sync::Arc;
-use std::time::{Duration, Instant};
+use std::time::{Duration, Instant, SystemTime};
 
 use anyhow::Context;
 use context_iterator::TenantShardContextIterator;
@@ -34,7 +34,7 @@ use pageserver_api::controller_api::{
     TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
 use pageserver_api::models::{
-    self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode,
+    self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease,
     PageserverUtilization, SecondaryProgress, ShardParameters, TenantConfig,
     TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
     TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest,
@@ -60,6 +60,7 @@ use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
 use utils::completion::Barrier;
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TimelineId};
+use utils::lsn::Lsn;
 use utils::sync::gate::Gate;
 use utils::{failpoint_support, pausable_failpoint};
 
@@ -152,6 +153,7 @@ enum TenantOperations {
     TimelineGcBlockUnblock,
     DropDetached,
     DownloadHeatmapLayers,
+    TimelineLsnLease,
 }
 
 #[derive(Clone, strum_macros::Display)]
@@ -3987,6 +3989,75 @@ impl Service {
         Ok(())
     }
 
+    pub(crate) async fn tenant_timeline_lsn_lease(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Result<LsnLease, ApiError> {
+        let _tenant_lock = trace_shared_lock(
+            &self.tenant_op_locks,
+            tenant_id,
+            TenantOperations::TimelineLsnLease,
+        )
+        .await;
+
+        let targets = {
+            let locked = self.inner.read().unwrap();
+            let mut targets = Vec::new();
+
+            // If the request got an unsharded tenant id, then apply
+            // the operation to all shards. Otherwise, apply it to a specific shard.
+            let shards_range = TenantShardId::tenant_range(tenant_id);
+
+            for (tenant_shard_id, shard) in locked.tenants.range(shards_range) {
+                if let Some(node_id) = shard.intent.get_attached() {
+                    let node = locked
+                        .nodes
+                        .get(node_id)
+                        .expect("Pageservers may not be deleted while referenced");
+
+                    targets.push((*tenant_shard_id, node.clone()));
+                }
+            }
+            targets
+        };
+
+        let res = self
+            .tenant_for_shards_api(
+                targets,
+                |tenant_shard_id, client| async move {
+                    client
+                        .timeline_lease_lsn(tenant_shard_id, timeline_id, lsn)
+                        .await
+                },
+                1,
+                1,
+                SHORT_RECONCILE_TIMEOUT,
+                &self.cancel,
+            )
+            .await;
+
+        let mut valid_until = None;
+        for r in res {
+            match r {
+                Ok(lease) => {
+                    if let Some(ref mut valid_until) = valid_until {
+                        *valid_until = std::cmp::min(*valid_until, lease.valid_until);
+                    } else {
+                        valid_until = Some(lease.valid_until);
+                    }
+                }
+                Err(e) => {
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
+                }
+            }
+        }
+        Ok(LsnLease {
+            valid_until: valid_until.unwrap_or_else(SystemTime::now),
+        })
+    }
+
     pub(crate) async fn tenant_timeline_download_heatmap_layers(
         &self,
         tenant_shard_id: TenantShardId,
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index a50a1beed6..a9df5f2d49 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -757,6 +757,31 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path,
     env.stop(immediate=True)
 
 
+def test_lsn_lease_storcon(neon_env_builder: NeonEnvBuilder):
+    conf = {
+        "pitr_interval": "0s",
+        "gc_period": "0s",
+        "compaction_period": "0s",
+    }
+    env = neon_env_builder.init_start(initial_tenant_conf=conf)
+    with env.endpoints.create_start(
+        "main",
+    ) as ep:
+        with ep.cursor() as cur:
+            cur.execute(
+                "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
+            )
+        last_flush_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
+    env.storage_controller.pageserver_api().timeline_lsn_lease(
+        env.initial_tenant, env.initial_timeline, last_flush_lsn
+    )
+    env.storage_controller.tenant_shard_split(env.initial_tenant, 8)
+    # TODO: do we preserve LSN leases across shard splits?
+    env.storage_controller.pageserver_api().timeline_lsn_lease(
+        env.initial_tenant, env.initial_timeline, last_flush_lsn
+    )
+
+
 def insert_with_action(
     env: NeonEnv,
     tenant: TenantId,

From cfe3e6d4e16d1e06c3376919d1380b0f3a3ca713 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Mon, 31 Mar 2025 22:49:32 +0300
Subject: [PATCH 005/140] Remove loop from pageserver_try_receive (#11387)

## Problem

Commit
https://github.com/neondatabase/neon/commit/3da70abfa5092bacdebed03d6f771dd28334d479
cause noticeable performance regression (40% in update-with-prefetch in
test_bulk_update):
https://neondb.slack.com/archives/C04BLQ4LW7K/p1742633167580879

## Summary of changes

Remove loop from pageserver_try_receive to make it fetch not more than
one response. There is still loop in `pump_prefetch_state` which can
fetch as many responses as available.

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/libpagestore.c                 | 26 ++++++------------------
 test_runner/regress/test_lfc_prefetch.py |  2 +-
 2 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 20f4d462c0..11ef9af36b 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -1142,37 +1142,23 @@ pageserver_try_receive(shardno_t shard_no)
 	NeonResponse *resp;
 	PageServer *shard = &page_servers[shard_no];
 	PGconn	   *pageserver_conn = shard->conn;
-	/* read response */
-	int			rc;
+	int	rc;
 
 	if (shard->state != PS_Connected)
 		return NULL;
 
 	Assert(pageserver_conn);
 
-	while (true)
+	rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async */);
+	if (rc == 0)
 	{
-		if (PQisBusy(shard->conn))
+		if (!PQconsumeInput(shard->conn))
 		{
-			WaitEvent	event;
-			if (WaitEventSetWait(shard->wes_read, 0, &event, 1,
-								 WAIT_EVENT_NEON_PS_READ) != 1
-				|| (event.events & WL_SOCKET_READABLE) == 0)
-			{
-				return NULL;
-			}
+			return NULL;
 		}
 		rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async */);
-		if (rc == 0)
-		{
-			if (!PQconsumeInput(shard->conn))
-			{
-				return NULL;
-			}
-		}
-		else
-			break;
 	}
+
 	if (rc == 0)
 		return NULL;
 	else if (rc > 0)
diff --git a/test_runner/regress/test_lfc_prefetch.py b/test_runner/regress/test_lfc_prefetch.py
index 27a5416eff..2885c0e17b 100644
--- a/test_runner/regress/test_lfc_prefetch.py
+++ b/test_runner/regress/test_lfc_prefetch.py
@@ -100,5 +100,5 @@ def test_lfc_prefetch(neon_simple_env: NeonEnv):
     prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"]
     log.info(f"Unused prefetches: {prefetch_expired}")
 
-    # No redundant prefethc requrests if prefetch results are stored in LFC
+    # No redundant prefetch requests if prefetch results are stored in LFC
     assert prefetch_expired == 0

From 557127550c291ebab338d5c8c723e5ae5c2bf7ff Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Tue, 1 Apr 2025 10:51:17 +0200
Subject: [PATCH 006/140] feat(compute): Add compute_ctl_up metric (#11376)

## Problem

For computes running inside NeonVM, the actual compute image tag is
buried inside the NeonVM spec, and we cannot get it as part of standard
k8s container metrics (it's always an image and a tag of the NeonVM
runner container). The workaround we currently use is to extract the
running computes info from the control plane database with SQL. It has
several drawbacks: i) it's complicated, separate DB per region; ii) it's
slow; iii) it's still an indirect source of info, i.e. k8s state could
be different from what the control plane expects.

## Summary of changes

Add a new `compute_ctl_up` gauge metric with `build_tag` and `status`
labels. It will help us to both overview what are the tags/versions of
all running computes; and to break them down by current status (`empty`,
`running`, `failed`, etc.)

Later, we could introduce low cardinality (no endpoint or compute ids)
streaming aggregates for such metrics, so they will be blazingly fast
and usable for monitoring the fleet-wide state.
---
 compute_tools/src/bin/compute_ctl.rs          | 21 ++++-------
 compute_tools/src/compute.rs                  | 37 ++++++++++++++-----
 .../src/http/routes/extension_server.rs       |  4 +-
 compute_tools/src/metrics.rs                  | 16 +++++++-
 test_runner/fixtures/endpoint/http.py         |  3 ++
 test_runner/fixtures/neon_fixtures.py         |  6 +--
 .../regress/test_compute_reconfigure.py       | 19 ++++++++++
 7 files changed, 76 insertions(+), 30 deletions(-)

diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index fc7a3e2827..da11ac2860 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -45,7 +45,9 @@ use anyhow::{Context, Result};
 use clap::Parser;
 use compute_api::responses::ComputeCtlConfig;
 use compute_api::spec::ComputeSpec;
-use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal};
+use compute_tools::compute::{
+    BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal,
+};
 use compute_tools::extension_server::get_pg_version_string;
 use compute_tools::logger::*;
 use compute_tools::params::*;
@@ -57,10 +59,6 @@ use tracing::{error, info};
 use url::Url;
 use utils::failpoint_support;
 
-// this is an arbitrary build tag. Fine as a default / for testing purposes
-// in-case of not-set environment var
-const BUILD_TAG_DEFAULT: &str = "latest";
-
 // Compatibility hack: if the control plane specified any remote-ext-config
 // use the default value for extension storage proxy gateway.
 // Remove this once the control plane is updated to pass the gateway URL
@@ -147,7 +145,7 @@ fn main() -> Result<()> {
         .build()?;
     let _rt_guard = runtime.enter();
 
-    let build_tag = runtime.block_on(init())?;
+    runtime.block_on(init())?;
 
     // enable core dumping for all child processes
     setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
@@ -174,8 +172,6 @@ fn main() -> Result<()> {
             cgroup: cli.cgroup,
             #[cfg(target_os = "linux")]
             vm_monitor_addr: cli.vm_monitor_addr,
-            build_tag,
-
             live_config_allowed: cli_spec.live_config_allowed,
         },
         cli_spec.spec,
@@ -189,7 +185,7 @@ fn main() -> Result<()> {
     deinit_and_exit(exit_code);
 }
 
-async fn init() -> Result<String> {
+async fn init() -> Result<()> {
     init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?;
 
     let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -199,12 +195,9 @@ async fn init() -> Result<String> {
         }
     });
 
-    let build_tag = option_env!("BUILD_TAG")
-        .unwrap_or(BUILD_TAG_DEFAULT)
-        .to_string();
-    info!("build_tag: {build_tag}");
+    info!("compute build_tag: {}", &BUILD_TAG.to_string());
 
-    Ok(build_tag)
+    Ok(())
 }
 
 fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 4126835c1a..f27bf164ae 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -20,6 +20,7 @@ use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use nix::sys::signal::{Signal, kill};
 use nix::unistd::Pid;
+use once_cell::sync::Lazy;
 use postgres;
 use postgres::NoTls;
 use postgres::error::SqlState;
@@ -35,6 +36,7 @@ use crate::disk_quota::set_disk_quota;
 use crate::installed_extensions::get_installed_extensions;
 use crate::logger::startup_context_from_env;
 use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
+use crate::metrics::COMPUTE_CTL_UP;
 use crate::monitor::launch_monitor;
 use crate::pg_helpers::*;
 use crate::rsyslog::{
@@ -49,6 +51,17 @@ use crate::{config, extension_server, local_proxy};
 
 pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
 pub static PG_PID: AtomicU32 = AtomicU32::new(0);
+// This is an arbitrary build tag. Fine as a default / for testing purposes
+// in-case of not-set environment var
+const BUILD_TAG_DEFAULT: &str = "latest";
+/// Build tag/version of the compute node binaries/image. It's tricky and ugly
+/// to pass it everywhere as a part of `ComputeNodeParams`, so we use a
+/// global static variable.
+pub static BUILD_TAG: Lazy<String> = Lazy::new(|| {
+    option_env!("BUILD_TAG")
+        .unwrap_or(BUILD_TAG_DEFAULT)
+        .to_string()
+});
 
 /// Static configuration params that don't change after startup. These mostly
 /// come from the CLI args, or are derived from them.
@@ -72,7 +85,6 @@ pub struct ComputeNodeParams {
     pub pgdata: String,
     pub pgbin: String,
     pub pgversion: String,
-    pub build_tag: String,
 
     /// The port that the compute's external HTTP server listens on
     pub external_http_port: u16,
@@ -173,6 +185,11 @@ impl ComputeState {
         info!("Changing compute status from {} to {}", prev, status);
         self.status = status;
         state_changed.notify_all();
+
+        COMPUTE_CTL_UP.reset();
+        COMPUTE_CTL_UP
+            .with_label_values(&[&BUILD_TAG, format!("{}", status).as_str()])
+            .set(1);
     }
 
     pub fn set_failed_status(&mut self, err: anyhow::Error, state_changed: &Condvar) {
@@ -352,13 +369,19 @@ impl ComputeNode {
         }
         .launch(&this);
 
-        // The internal HTTP server could be launched later, but there isn't much
-        // sense in waiting.
+        // The internal HTTP server is needed for a further activation by control plane
+        // if compute was started for a pool, so we have to start server before hanging
+        // waiting for a spec.
         crate::http::server::Server::Internal {
             port: this.params.internal_http_port,
         }
         .launch(&this);
 
+        // HTTP server is running, so we can officially declare compute_ctl as 'up'
+        COMPUTE_CTL_UP
+            .with_label_values(&[&BUILD_TAG, ComputeStatus::Empty.to_string().as_str()])
+            .set(1);
+
         // If we got a spec from the CLI already, use that. Otherwise wait for the
         // control plane to pass it to us with a /configure HTTP request
         let pspec = if let Some(cli_spec) = cli_spec {
@@ -2032,12 +2055,8 @@ LIMIT 100",
 
         let mut download_tasks = Vec::new();
         for library in &libs_vec {
-            let (ext_name, ext_path) = remote_extensions.get_ext(
-                library,
-                true,
-                &self.params.build_tag,
-                &self.params.pgversion,
-            )?;
+            let (ext_name, ext_path) =
+                remote_extensions.get_ext(library, true, &BUILD_TAG, &self.params.pgversion)?;
             download_tasks.push(self.download_extension(ext_name, ext_path));
         }
         let results = join_all(download_tasks).await;
diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs
index 563b73ae65..6508de6eee 100644
--- a/compute_tools/src/http/routes/extension_server.rs
+++ b/compute_tools/src/http/routes/extension_server.rs
@@ -5,7 +5,7 @@ use axum::response::{IntoResponse, Response};
 use http::StatusCode;
 use serde::Deserialize;
 
-use crate::compute::ComputeNode;
+use crate::compute::{BUILD_TAG, ComputeNode};
 use crate::http::JsonResponse;
 use crate::http::extract::{Path, Query};
 
@@ -47,7 +47,7 @@ pub(in crate::http) async fn download_extension(
         remote_extensions.get_ext(
             &filename,
             ext_server_params.is_library,
-            &compute.params.build_tag,
+            &BUILD_TAG,
             &compute.params.pgversion,
         )
     };
diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs
index 4caa48307e..52f1795703 100644
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -1,7 +1,8 @@
 use metrics::core::{AtomicF64, Collector, GenericGauge};
 use metrics::proto::MetricFamily;
 use metrics::{
-    IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec,
+    IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter_vec,
+    register_int_gauge_vec, register_uint_gauge_vec,
 };
 use once_cell::sync::Lazy;
 
@@ -70,8 +71,19 @@ pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy<GenericGauge<AtomicF64>> = Lazy::new(
     .expect("failed to define a metric")
 });
 
+// Report that `compute_ctl` is up and what's the current compute status.
+pub(crate) static COMPUTE_CTL_UP: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "compute_ctl_up",
+        "Whether compute_ctl is running",
+        &["build_tag", "status"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub fn collect() -> Vec<MetricFamily> {
-    let mut metrics = INSTALLED_EXTENSIONS.collect();
+    let mut metrics = COMPUTE_CTL_UP.collect();
+    metrics.extend(INSTALLED_EXTENSIONS.collect());
     metrics.extend(CPLANE_REQUESTS_TOTAL.collect());
     metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
     metrics.extend(DB_MIGRATION_FAILED.collect());
diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py
index 9b28246f58..4073ebc3b9 100644
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -5,6 +5,8 @@ import urllib.parse
 import requests
 from requests.adapters import HTTPAdapter
 
+from fixtures.log_helper import log
+
 
 class EndpointHttpClient(requests.Session):
     def __init__(
@@ -51,6 +53,7 @@ class EndpointHttpClient(requests.Session):
     def metrics(self) -> str:
         res = self.get(f"http://localhost:{self.external_port}/metrics")
         res.raise_for_status()
+        log.debug("raw compute metrics: %s", res.text)
         return res.text
 
     # Current compute status.
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d3cb35fe49..34a841f59f 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4209,7 +4209,7 @@ class Endpoint(PgProtocol, LogUtils):
 
         # Write it back updated
         with open(config_path, "w") as file:
-            log.info(json.dumps(dict(data_dict, **kwargs)))
+            log.debug(json.dumps(dict(data_dict, **kwargs)))
             json.dump(dict(data_dict, **kwargs), file, indent=4)
 
     def respec_deep(self, **kwargs: Any) -> None:
@@ -4226,7 +4226,7 @@ class Endpoint(PgProtocol, LogUtils):
         with open(config_path) as f:
             data_dict: dict[str, Any] = json.load(f)
 
-        log.info("Current compute spec: %s", json.dumps(data_dict, indent=4))
+        log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4))
 
         for key, value in kwargs.items():
             if isinstance(value, dict):
@@ -4238,7 +4238,7 @@ class Endpoint(PgProtocol, LogUtils):
                 data_dict[key] = value
 
         with open(config_path, "w") as file:
-            log.info("Updating compute spec to: %s", json.dumps(data_dict, indent=4))
+            log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4))
             json.dump(data_dict, file, indent=4)
 
     def wait_for_migrations(self, wait_for: int = NUM_COMPUTE_MIGRATIONS) -> None:
diff --git a/test_runner/regress/test_compute_reconfigure.py b/test_runner/regress/test_compute_reconfigure.py
index ed453f3f8d..6396ba67a1 100644
--- a/test_runner/regress/test_compute_reconfigure.py
+++ b/test_runner/regress/test_compute_reconfigure.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
+import os
 from typing import TYPE_CHECKING
 
+from fixtures.metrics import parse_metrics
 from fixtures.utils import wait_until
 
 if TYPE_CHECKING:
@@ -64,3 +66,20 @@ def test_compute_reconfigure(neon_simple_env: NeonEnv):
         row = cursor.fetchone()
         assert row is not None
         assert row[0] == TEST_LOG_LINE_PREFIX
+
+    # Check that even after reconfigure and state transitions we still report
+    # only the current status.
+    client = endpoint.http_client()
+    raw_metrics = client.metrics()
+    metrics = parse_metrics(raw_metrics)
+    samples = metrics.query_all("compute_ctl_up")
+    assert len(samples) == 1
+    assert samples[0].value == 1
+    samples = metrics.query_all("compute_ctl_up", {"status": "running"})
+    assert len(samples) == 1
+    assert samples[0].value == 1
+    # Check that build tag is reported
+    build_tag = os.environ.get("BUILD_TAG", "latest")
+    samples = metrics.query_all("compute_ctl_up", {"build_tag": build_tag})
+    assert len(samples) == 1
+    assert samples[0].value == 1

From 225cabd84d51e57dc10bcd3867bc76b8525d6ff6 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 1 Apr 2025 13:38:12 +0200
Subject: [PATCH 007/140] pageserver: update upload queue TODOs (#11377)

Update some upload queue TODOs, particularly to track
https://github.com/neondatabase/neon/issues/10283, which I won't get
around to.
---
 pageserver/src/tenant/remote_timeline_client.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 32c0571b97..579dbeb322 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1968,9 +1968,7 @@ impl RemoteTimelineClient {
     /// Pick next tasks from the queue, and start as many of them as possible without violating
     /// the ordering constraints.
     ///
-    /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does.
-    /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has
-    /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks.
+    /// The number of inprogress tasks is limited by `Self::inprogress_tasks`, see `next_ready`.
     fn launch_queued_tasks(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
         while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() {
             debug!("starting op: {next_op}");
@@ -2218,6 +2216,11 @@ impl RemoteTimelineClient {
                     }
                     res
                 }
+                // TODO: this should wait for the deletion to be executed by the deletion queue.
+                // Otherwise, the deletion may race with an upload and wrongfully delete a newer
+                // file. Some of the above logic attempts to work around this, it should be replaced
+                // by the upload queue ordering guarantees (see `can_bypass`). See:
+                // <https://github.com/neondatabase/neon/issues/10283>.
                 UploadOp::Delete(delete) => {
                     if self.config.read().unwrap().block_deletions {
                         let mut queue_locked = self.upload_queue.lock().unwrap();

From 80596feeaa5fc50b2639b7e9a15d5196bad31d1b Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 1 Apr 2025 13:43:58 +0200
Subject: [PATCH 008/140] pageserver: invert `CompactFlags::NoYield` as
 `YieldForL0` (#11382)

## Problem

`CompactFlags::NoYield` was a bit inconvenient, since every caller
except for the background compaction loop should generally set it (e.g.
HTTP API calls, tests, etc). It was also inconsistent with
`CompactionOutcome::YieldForL0`.

## Summary of changes

Invert `CompactFlags::NoYield` as `CompactFlags::YieldForL0`. There
should be no behavioral changes.
---
 pageserver/src/http/routes.rs                |  2 -
 pageserver/src/tenant.rs                     | 61 ++++++--------------
 pageserver/src/tenant/timeline.rs            | 26 +++++----
 pageserver/src/tenant/timeline/compaction.rs | 56 ++++++++----------
 4 files changed, 59 insertions(+), 86 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 5a13fb1387..2bedf9e11a 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2256,7 +2256,6 @@ async fn timeline_compact_handler(
     let state = get_state(&request);
 
     let mut flags = EnumSet::empty();
-    flags |= CompactFlags::NoYield; // run compaction to completion
 
     if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
         flags |= CompactFlags::ForceL0Compaction;
@@ -2417,7 +2416,6 @@ async fn timeline_checkpoint_handler(
     let state = get_state(&request);
 
     let mut flags = EnumSet::empty();
-    flags |= CompactFlags::NoYield; // run compaction to completion
     if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
         flags |= CompactFlags::ForceL0Compaction;
     }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e7d8ed75ed..3ed4103792 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3080,6 +3080,7 @@ impl Tenant {
             let mut has_pending_l0 = false;
             for timeline in compact_l0 {
                 let ctx = &ctx.with_scope_timeline(&timeline);
+                // NB: don't set CompactFlags::YieldForL0, since this is an L0-only compaction pass.
                 let outcome = timeline
                     .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx)
                     .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
@@ -3097,14 +3098,9 @@ impl Tenant {
             }
         }
 
-        // Pass 2: image compaction and timeline offloading. If any timelines have accumulated
-        // more L0 layers, they may also be compacted here.
-        //
-        // NB: image compaction may yield if there is pending L0 compaction.
-        //
-        // TODO: it will only yield if there is pending L0 compaction on the same timeline. If a
-        // different timeline needs compaction, it won't. It should check `l0_compaction_trigger`.
-        // We leave this for a later PR.
+        // Pass 2: image compaction and timeline offloading. If any timelines have accumulated more
+        // L0 layers, they may also be compacted here. Image compaction will yield if there is
+        // pending L0 compaction on any tenant timeline.
         //
         // TODO: consider ordering timelines by some priority, e.g. time since last full compaction,
         // amount of L1 delta debt or garbage, offload-eligible timelines first, etc.
@@ -3115,8 +3111,14 @@ impl Tenant {
             }
             let ctx = &ctx.with_scope_timeline(&timeline);
 
+            // Yield for L0 if the separate L0 pass is enabled (otherwise there's no point).
+            let mut flags = EnumSet::default();
+            if self.get_compaction_l0_first() {
+                flags |= CompactFlags::YieldForL0;
+            }
+
             let mut outcome = timeline
-                .compact(cancel, EnumSet::default(), ctx)
+                .compact(cancel, flags, ctx)
                 .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
                 .await
                 .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
@@ -6516,11 +6518,7 @@ mod tests {
 
         tline.freeze_and_flush().await?;
         tline
-            .compact(
-                &CancellationToken::new(),
-                CompactFlags::NoYield.into(),
-                &ctx,
-            )
+            .compact(&CancellationToken::new(), EnumSet::default(), &ctx)
             .await?;
 
         let mut writer = tline.writer().await;
@@ -6537,11 +6535,7 @@ mod tests {
 
         tline.freeze_and_flush().await?;
         tline
-            .compact(
-                &CancellationToken::new(),
-                CompactFlags::NoYield.into(),
-                &ctx,
-            )
+            .compact(&CancellationToken::new(), EnumSet::default(), &ctx)
             .await?;
 
         let mut writer = tline.writer().await;
@@ -6558,11 +6552,7 @@ mod tests {
 
         tline.freeze_and_flush().await?;
         tline
-            .compact(
-                &CancellationToken::new(),
-                CompactFlags::NoYield.into(),
-                &ctx,
-            )
+            .compact(&CancellationToken::new(), EnumSet::default(), &ctx)
             .await?;
 
         let mut writer = tline.writer().await;
@@ -6579,11 +6569,7 @@ mod tests {
 
         tline.freeze_and_flush().await?;
         tline
-            .compact(
-                &CancellationToken::new(),
-                CompactFlags::NoYield.into(),
-                &ctx,
-            )
+            .compact(&CancellationToken::new(), EnumSet::default(), &ctx)
             .await?;
 
         assert_eq!(
@@ -6666,9 +6652,7 @@ mod tests {
             timeline.freeze_and_flush().await?;
             if compact {
                 // this requires timeline to be &Arc<Timeline>
-                timeline
-                    .compact(&cancel, CompactFlags::NoYield.into(), ctx)
-                    .await?;
+                timeline.compact(&cancel, EnumSet::default(), ctx).await?;
             }
 
             // this doesn't really need to use the timeline_id target, but it is closer to what it
@@ -6995,7 +6979,6 @@ mod tests {
         child_timeline.freeze_and_flush().await?;
         let mut flags = EnumSet::new();
         flags.insert(CompactFlags::ForceRepartition);
-        flags.insert(CompactFlags::NoYield);
         child_timeline
             .compact(&CancellationToken::new(), flags, &ctx)
             .await?;
@@ -7374,9 +7357,7 @@ mod tests {
 
             // Perform a cycle of flush, compact, and GC
             tline.freeze_and_flush().await?;
-            tline
-                .compact(&cancel, CompactFlags::NoYield.into(), &ctx)
-                .await?;
+            tline.compact(&cancel, EnumSet::default(), &ctx).await?;
             tenant
                 .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
                 .await?;
@@ -7705,7 +7686,6 @@ mod tests {
                             let mut flags = EnumSet::new();
                             flags.insert(CompactFlags::ForceImageLayerCreation);
                             flags.insert(CompactFlags::ForceRepartition);
-                            flags.insert(CompactFlags::NoYield);
                             flags
                         } else {
                             EnumSet::empty()
@@ -7756,9 +7736,7 @@ mod tests {
         let before_num_l0_delta_files =
             tline.layers.read().await.layer_map()?.level0_deltas().len();
 
-        tline
-            .compact(&cancel, CompactFlags::NoYield.into(), &ctx)
-            .await?;
+        tline.compact(&cancel, EnumSet::default(), &ctx).await?;
 
         let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();
 
@@ -7923,7 +7901,6 @@ mod tests {
                             let mut flags = EnumSet::new();
                             flags.insert(CompactFlags::ForceImageLayerCreation);
                             flags.insert(CompactFlags::ForceRepartition);
-                            flags.insert(CompactFlags::NoYield);
                             flags
                         },
                         &ctx,
@@ -8386,7 +8363,6 @@ mod tests {
                     let mut flags = EnumSet::new();
                     flags.insert(CompactFlags::ForceImageLayerCreation);
                     flags.insert(CompactFlags::ForceRepartition);
-                    flags.insert(CompactFlags::NoYield);
                     flags
                 },
                 &ctx,
@@ -8454,7 +8430,6 @@ mod tests {
                     let mut flags = EnumSet::new();
                     flags.insert(CompactFlags::ForceImageLayerCreation);
                     flags.insert(CompactFlags::ForceRepartition);
-                    flags.insert(CompactFlags::NoYield);
                     flags
                 },
                 &ctx,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 75f9225302..7c9c9a45d4 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -870,9 +870,14 @@ pub(crate) enum CompactFlags {
     OnlyL0Compaction,
     EnhancedGcBottomMostCompaction,
     DryRun,
-    /// Disables compaction yielding e.g. due to high L0 count. This is set e.g. when requesting
-    /// compaction via HTTP API.
-    NoYield,
+    /// Makes image compaction yield if there's pending L0 compaction. This should always be used in
+    /// the background compaction task, since we want to aggressively compact down L0 to bound
+    /// read amplification.
+    ///
+    /// It only makes sense to use this when `compaction_l0_first` is enabled (such that we yield to
+    /// an L0 compaction pass), and without `OnlyL0Compaction` (L0 compaction shouldn't yield for L0
+    /// compaction).
+    YieldForL0,
 }
 
 #[serde_with::serde_as]
@@ -1891,18 +1896,19 @@ impl Timeline {
         // out by other background tasks (including image compaction). We request this via
         // `BackgroundLoopKind::L0Compaction`.
         //
-        // If this is a regular compaction pass, and L0-only compaction is enabled in the config,
-        // then we should yield for immediate L0 compaction if necessary while we're waiting for the
-        // background task semaphore. There's no point yielding otherwise, since we'd just end up
-        // right back here.
+        // Yield for pending L0 compaction while waiting for the semaphore.
         let is_l0_only = options.flags.contains(CompactFlags::OnlyL0Compaction);
         let semaphore_kind = match is_l0_only && self.get_compaction_l0_semaphore() {
             true => BackgroundLoopKind::L0Compaction,
             false => BackgroundLoopKind::Compaction,
         };
-        let yield_for_l0 = !is_l0_only
-            && self.get_compaction_l0_first()
-            && !options.flags.contains(CompactFlags::NoYield);
+        let yield_for_l0 = options.flags.contains(CompactFlags::YieldForL0);
+        if yield_for_l0 {
+            // If this is an L0 pass, it doesn't make sense to yield for L0.
+            debug_assert!(!is_l0_only, "YieldForL0 during L0 pass");
+            // If `compaction_l0_first` is disabled, there's no point yielding.
+            debug_assert!(self.get_compaction_l0_first(), "YieldForL0 without L0 pass");
+        }
 
         let acquire = async move {
             let guard = self.compaction_lock.lock().await;
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 711501caa9..2276ed428b 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -394,8 +394,8 @@ impl GcCompactionQueue {
                 if job.dry_run {
                     flags |= CompactFlags::DryRun;
                 }
-                if options.flags.contains(CompactFlags::NoYield) {
-                    flags |= CompactFlags::NoYield;
+                if options.flags.contains(CompactFlags::YieldForL0) {
+                    flags |= CompactFlags::YieldForL0;
                 }
                 let options = CompactOptions {
                     flags,
@@ -983,7 +983,7 @@ impl Timeline {
 
         // Yield if we have pending L0 compaction. The scheduler will do another pass.
         if (l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0)
-            && !options.flags.contains(CompactFlags::NoYield)
+            && options.flags.contains(CompactFlags::YieldForL0)
         {
             info!("image/ancestor compaction yielding for L0 compaction");
             return Ok(CompactionOutcome::YieldForL0);
@@ -1028,7 +1028,7 @@ impl Timeline {
                             .load()
                             .as_ref()
                             .clone(),
-                        !options.flags.contains(CompactFlags::NoYield),
+                        options.flags.contains(CompactFlags::YieldForL0),
                     )
                     .await
                     .inspect_err(|err| {
@@ -2635,7 +2635,7 @@ impl Timeline {
     ) -> Result<CompactionOutcome, CompactionError> {
         let sub_compaction = options.sub_compaction;
         let job = GcCompactJob::from_compact_options(options.clone());
-        let no_yield = options.flags.contains(CompactFlags::NoYield);
+        let yield_for_l0 = options.flags.contains(CompactFlags::YieldForL0);
         if sub_compaction {
             info!(
                 "running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"
@@ -2650,7 +2650,7 @@ impl Timeline {
                     idx + 1,
                     jobs_len
                 );
-                self.compact_with_gc_inner(cancel, job, ctx, no_yield)
+                self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0)
                     .await?;
             }
             if jobs_len == 0 {
@@ -2658,7 +2658,8 @@ impl Timeline {
             }
             return Ok(CompactionOutcome::Done);
         }
-        self.compact_with_gc_inner(cancel, job, ctx, no_yield).await
+        self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0)
+            .await
     }
 
     async fn compact_with_gc_inner(
@@ -2666,7 +2667,7 @@ impl Timeline {
         cancel: &CancellationToken,
         job: GcCompactJob,
         ctx: &RequestContext,
-        no_yield: bool,
+        yield_for_l0: bool,
     ) -> Result<CompactionOutcome, CompactionError> {
         // Block other compaction/GC tasks from running for now. GC-compaction could run along
         // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
@@ -2936,18 +2937,15 @@ impl Timeline {
             if cancel.is_cancelled() {
                 return Err(CompactionError::ShuttingDown);
             }
-            if !no_yield {
-                let should_yield = self
+            let should_yield = yield_for_l0
+                && self
                     .l0_compaction_trigger
                     .notified()
                     .now_or_never()
                     .is_some();
-                if should_yield {
-                    tracing::info!(
-                        "preempt gc-compaction when downloading layers: too many L0 layers"
-                    );
-                    return Ok(CompactionOutcome::YieldForL0);
-                }
+            if should_yield {
+                tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers");
+                return Ok(CompactionOutcome::YieldForL0);
             }
             let resident_layer = layer
                 .download_and_keep_resident(ctx)
@@ -3081,21 +3079,17 @@ impl Timeline {
                 return Err(CompactionError::ShuttingDown);
             }
 
-            if !no_yield {
-                keys_processed += 1;
-                if keys_processed % 1000 == 0 {
-                    let should_yield = self
-                        .l0_compaction_trigger
-                        .notified()
-                        .now_or_never()
-                        .is_some();
-                    if should_yield {
-                        tracing::info!(
-                            "preempt gc-compaction in the main loop: too many L0 layers"
-                        );
-                        return Ok(CompactionOutcome::YieldForL0);
-                    }
-                }
+            keys_processed += 1;
+            let should_yield = yield_for_l0
+                && keys_processed % 1000 == 0
+                && self
+                    .l0_compaction_trigger
+                    .notified()
+                    .now_or_never()
+                    .is_some();
+            if should_yield {
+                tracing::info!("preempt gc-compaction in the main loop: too many L0 layers");
+                return Ok(CompactionOutcome::YieldForL0);
             }
             if self.shard_identity.is_key_disposable(&key) {
                 // If this shard does not need to store this key, simply skip it.

From 016068b966a5e45a24c18abafd7aa748775aab1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 1 Apr 2025 14:39:10 +0200
Subject: [PATCH 009/140] API spec: add safekeepers field returned by storcon
 (#11385)

Add an optional `safekeepers` field to `TimelineInfo` which is returned
by the storcon upon timeline creation if the
`--timelines-onto-safekeepers` flag is enabled. It contains the list of
safekeepers chosen.

Other contexts where we return `TimelineInfo` do not contain the
`safekeepers` field, sadly I couldn't make this more type safe like done
in Rust via `TimelineCreateResponseStorcon`, as there is no way of
flattening or inheritance (and I don't that duplicating the entire type
for some minor type safety improvements is worth it).

The storcon side has been done in #11058.

Part of https://github.com/neondatabase/cloud/issues/16176
cc https://github.com/neondatabase/cloud/issues/16796
---
 pageserver/src/http/openapi_spec.yml | 34 ++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 8b839b454a..566086c527 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1133,6 +1133,40 @@ components:
         applied_gc_cutoff_lsn:
           type: string
           format: hex
+        safekeepers:
+          $ref: "#/components/schemas/TimelineSafekeepersInfo"
+
+    TimelineSafekeepersInfo:
+      type: object
+      required:
+        - tenant_id
+        - timeline_id
+        - generation
+        - safekeepers
+      properties:
+        tenant_id:
+          type: string
+          format: hex
+        timeline_id:
+          type: string
+          format: hex
+        generation:
+          type: integer
+        safekeepers:
+          type: array
+          items:
+            $ref: "#/components/schemas/TimelineSafekeeperInfo"
+
+    TimelineSafekeeperInfo:
+      type: object
+      required:
+        - id
+        - hostname
+      properties:
+        id:
+          type: integer
+        hostname:
+          type: string
 
     SyntheticSizeResponse:
       type: object

From 1fad1abb24bd77eda8aba7c137728a7c7e4c8205 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 1 Apr 2025 17:16:33 +0200
Subject: [PATCH 010/140] storcon: timeline deletion improvements and fixes
 (#11334)

This PR contains a bunch of smaller followups and fixes of the original
PR #11058. most of these implement suggestions from Arseny:

* remove `Queryable, Selectable` from `TimelinePersistence`: they are
not needed.
* no `Arc` around `CancellationToken`: it itself is an arc wrapper
* only schedule deletes instead of scheduling excludes and deletes
* persist and delete deletion ops
* delete rows in timelines table upon tenant and timeline deletion
* set `deleted_at` for timelines we are deleting before we start any
reconciles: this flag will help us later to recognize half-executed
deletions, or when we crashed before we could remove the timeline row
but after we removed the last pending op (handling these situations are
left for later).

Part of #9011
---
 storage_controller/src/persistence.rs         |  89 ++++++++++++++-
 .../src/service/safekeeper_reconciler.rs      | 103 ++++++++++++++----
 .../src/service/safekeeper_service.rs         |  67 ++++++++----
 3 files changed, 215 insertions(+), 44 deletions(-)

diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index c927b7c366..99b1a1e887 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -1369,6 +1369,65 @@ impl Persistence {
         Ok(timeline_from_db)
     }
 
+    /// Set `delete_at` for the given timeline
+    pub(crate) async fn timeline_set_deleted_at(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> DatabaseResult<()> {
+        use crate::schema::timelines;
+
+        let deletion_time = chrono::Local::now().to_utc();
+        self.with_measured_conn(DatabaseOperation::InsertTimeline, move |conn| {
+            Box::pin(async move {
+                let updated = diesel::update(timelines::table)
+                    .filter(timelines::tenant_id.eq(tenant_id.to_string()))
+                    .filter(timelines::timeline_id.eq(timeline_id.to_string()))
+                    .set(timelines::deleted_at.eq(Some(deletion_time)))
+                    .execute(conn)
+                    .await?;
+
+                match updated {
+                    0 => Ok(()),
+                    1 => Ok(()),
+                    _ => Err(DatabaseError::Logical(format!(
+                        "unexpected number of rows ({})",
+                        updated
+                    ))),
+                }
+            })
+        })
+        .await
+    }
+
+    /// Load timeline from db. Returns `None` if not present.
+    ///
+    /// Only works if `deleted_at` is set, so you should call [`Self::timeline_set_deleted_at`] before.
+    pub(crate) async fn delete_timeline(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> DatabaseResult<()> {
+        use crate::schema::timelines::dsl;
+
+        let tenant_id = &tenant_id;
+        let timeline_id = &timeline_id;
+        self.with_measured_conn(DatabaseOperation::GetTimeline, move |conn| {
+            Box::pin(async move {
+                diesel::delete(dsl::timelines)
+                    .filter(dsl::tenant_id.eq(&tenant_id.to_string()))
+                    .filter(dsl::timeline_id.eq(&timeline_id.to_string()))
+                    .filter(dsl::deleted_at.is_not_null())
+                    .execute(conn)
+                    .await?;
+                Ok(())
+            })
+        })
+        .await?;
+
+        Ok(())
+    }
+
     /// Loads a list of all timelines from database.
     pub(crate) async fn list_timelines_for_tenant(
         &self,
@@ -1491,6 +1550,34 @@ impl Persistence {
 
         Ok(timeline_from_db)
     }
+    /// List pending operations for a given timeline (including tenant-global ones)
+    pub(crate) async fn list_pending_ops_for_timeline(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> DatabaseResult<Vec<TimelinePendingOpPersistence>> {
+        use crate::schema::safekeeper_timeline_pending_ops::dsl;
+
+        let timelines_from_db = self
+            .with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| {
+                Box::pin(async move {
+                    let from_db: Vec<TimelinePendingOpPersistence> =
+                        dsl::safekeeper_timeline_pending_ops
+                            .filter(dsl::tenant_id.eq(tenant_id.to_string()))
+                            .filter(
+                                dsl::timeline_id
+                                    .eq(timeline_id.to_string())
+                                    .or(dsl::timeline_id.eq("")),
+                            )
+                            .load(conn)
+                            .await?;
+                    Ok(from_db)
+                })
+            })
+            .await?;
+
+        Ok(timelines_from_db)
+    }
 
     /// Delete all pending ops for the given timeline.
     ///
@@ -1974,7 +2061,7 @@ impl ToSql<crate::schema::sql_types::PgLsn, Pg> for LsnWrapper {
     }
 }
 
-#[derive(Insertable, AsChangeset, Queryable, Selectable, Clone)]
+#[derive(Insertable, AsChangeset, Clone)]
 #[diesel(table_name = crate::schema::timelines)]
 pub(crate) struct TimelinePersistence {
     pub(crate) tenant_id: String,
diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs
index a60aa6ca53..8e752a8ff1 100644
--- a/storage_controller/src/service/safekeeper_reconciler.rs
+++ b/storage_controller/src/service/safekeeper_reconciler.rs
@@ -160,9 +160,8 @@ pub(crate) struct ScheduleRequest {
 }
 
 struct ReconcilerHandle {
-    tx: UnboundedSender<(ScheduleRequest, Arc<CancellationToken>)>,
-    #[allow(clippy::type_complexity)]
-    ongoing_tokens: Arc<ClashMap<(TenantId, Option<TimelineId>), Arc<CancellationToken>>>,
+    tx: UnboundedSender<(ScheduleRequest, CancellationToken)>,
+    ongoing_tokens: Arc<ClashMap<(TenantId, Option<TimelineId>), CancellationToken>>,
     cancel: CancellationToken,
 }
 
@@ -172,13 +171,13 @@ impl ReconcilerHandle {
         &self,
         tenant_id: TenantId,
         timeline_id: Option<TimelineId>,
-    ) -> Arc<CancellationToken> {
+    ) -> CancellationToken {
         let entry = self.ongoing_tokens.entry((tenant_id, timeline_id));
         if let Entry::Occupied(entry) = &entry {
             let cancel: &CancellationToken = entry.get();
             cancel.cancel();
         }
-        entry.insert(Arc::new(self.cancel.child_token())).clone()
+        entry.insert(self.cancel.child_token()).clone()
     }
     /// Cancel an ongoing reconciliation
     fn cancel_reconciliation(&self, tenant_id: TenantId, timeline_id: Option<TimelineId>) {
@@ -197,7 +196,7 @@ impl ReconcilerHandle {
 
 pub(crate) struct SafekeeperReconciler {
     service: Arc<Service>,
-    rx: UnboundedReceiver<(ScheduleRequest, Arc<CancellationToken>)>,
+    rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>,
     cancel: CancellationToken,
 }
 
@@ -243,7 +242,7 @@ impl SafekeeperReconciler {
                 .await;
         }
     }
-    async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: Arc<CancellationToken>) {
+    async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) {
         let req_host = req.safekeeper.skp.host.clone();
         match req.kind {
             SafekeeperTimelineOpKind::Pull => {
@@ -300,36 +299,96 @@ impl SafekeeperReconciler {
             SafekeeperTimelineOpKind::Delete => {
                 let tenant_id = req.tenant_id;
                 if let Some(timeline_id) = req.timeline_id {
-                    self.reconcile_inner(
+                    let deleted = self.reconcile_inner(
                         req,
                         async |client| client.delete_timeline(tenant_id, timeline_id).await,
                         |_resp| {
-                            tracing::info!("deleted timeline from {req_host}");
+                            tracing::info!(%tenant_id, %timeline_id, "deleted timeline from {req_host}");
                         },
                         req_cancel,
                     )
                     .await;
+                    if deleted {
+                        self.delete_timeline_from_db(tenant_id, timeline_id).await;
+                    }
                 } else {
-                    self.reconcile_inner(
-                        req,
-                        async |client| client.delete_tenant(tenant_id).await,
-                        |_resp| {
-                            tracing::info!("deleted tenant from {req_host}");
-                        },
-                        req_cancel,
-                    )
-                    .await;
+                    let deleted = self
+                        .reconcile_inner(
+                            req,
+                            async |client| client.delete_tenant(tenant_id).await,
+                            |_resp| {
+                                tracing::info!(%tenant_id, "deleted tenant from {req_host}");
+                            },
+                            req_cancel,
+                        )
+                        .await;
+                    if deleted {
+                        self.delete_tenant_timelines_from_db(tenant_id).await;
+                    }
                 }
             }
         }
     }
+    async fn delete_timeline_from_db(&self, tenant_id: TenantId, timeline_id: TimelineId) {
+        match self
+            .service
+            .persistence
+            .list_pending_ops_for_timeline(tenant_id, timeline_id)
+            .await
+        {
+            Ok(list) => {
+                if !list.is_empty() {
+                    tracing::info!(%tenant_id, %timeline_id, "not deleting timeline from db as there is {} open reconciles", list.len());
+                    return;
+                }
+            }
+            Err(e) => {
+                tracing::warn!(%tenant_id, %timeline_id, "couldn't query pending ops: {e}");
+                return;
+            }
+        }
+        tracing::info!(%tenant_id, %timeline_id, "deleting timeline from db after all reconciles succeeded");
+        // In theory we could crash right after deleting the op from the db and right before reaching this,
+        // but then we'll boot up with a timeline that has deleted_at set, so hopefully we'll issue deletion ops for it again.
+        if let Err(err) = self
+            .service
+            .persistence
+            .delete_timeline(tenant_id, timeline_id)
+            .await
+        {
+            tracing::warn!(%tenant_id, %timeline_id, "couldn't delete timeline from db: {err}");
+        }
+    }
+    async fn delete_tenant_timelines_from_db(&self, tenant_id: TenantId) {
+        let timeline_list = match self
+            .service
+            .persistence
+            .list_timelines_for_tenant(tenant_id)
+            .await
+        {
+            Ok(timeline_list) => timeline_list,
+            Err(e) => {
+                tracing::warn!(%tenant_id, "couldn't query timelines: {e}");
+                return;
+            }
+        };
+        for timeline in timeline_list {
+            let Ok(timeline_id) = TimelineId::from_str(&timeline.timeline_id) else {
+                tracing::warn!("Invalid timeline ID in database {}", timeline.timeline_id);
+                continue;
+            };
+            self.delete_timeline_from_db(tenant_id, timeline_id).await;
+        }
+    }
+    /// Returns whether the reconciliation happened successfully
     async fn reconcile_inner<T, F, U>(
         &self,
         req: ScheduleRequest,
         closure: impl Fn(SafekeeperClient) -> F,
         log_success: impl FnOnce(T) -> U,
-        req_cancel: Arc<CancellationToken>,
-    ) where
+        req_cancel: CancellationToken,
+    ) -> bool
+    where
         F: Future<Output = Result<T, safekeeper_client::mgmt_api::Error>>,
     {
         let jwt = self
@@ -373,11 +432,11 @@ impl SafekeeperReconciler {
                             req.safekeeper.skp.host
                         );
                     }
-                    return;
+                    return true;
                 }
                 Err(mgmt_api::Error::Cancelled) => {
                     // On cancellation, the code that issued it will take care of removing db entries (if needed)
-                    return;
+                    return false;
                 }
                 Err(e) => {
                     tracing::info!(
diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index 557c684f6b..7f2c63b9af 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -1,4 +1,4 @@
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
@@ -313,25 +313,32 @@ impl Service {
             );
             return Ok(());
         };
+        self.persistence
+            .timeline_set_deleted_at(tenant_id, timeline_id)
+            .await?;
         let all_sks = tl
             .new_sk_set
             .iter()
-            .flat_map(|sks| {
-                sks.iter()
-                    .map(|sk| (*sk, SafekeeperTimelineOpKind::Exclude))
-            })
-            .chain(
-                tl.sk_set
-                    .iter()
-                    .map(|v| (*v, SafekeeperTimelineOpKind::Delete)),
-            )
-            .collect::<HashMap<_, _>>();
+            .flatten()
+            .chain(tl.sk_set.iter())
+            .collect::<HashSet<_>>();
 
         // Schedule reconciliations
+        for &sk_id in all_sks.iter() {
+            let pending_op = TimelinePendingOpPersistence {
+                tenant_id: tenant_id.to_string(),
+                timeline_id: timeline_id.to_string(),
+                generation: tl.generation,
+                op_kind: SafekeeperTimelineOpKind::Delete,
+                sk_id: *sk_id,
+            };
+            tracing::info!("writing pending op for sk id {sk_id}");
+            self.persistence.insert_pending_op(pending_op).await?;
+        }
         {
             let mut locked = self.inner.write().unwrap();
-            for (sk_id, kind) in all_sks {
-                let sk_id = NodeId(sk_id as u64);
+            for sk_id in all_sks {
+                let sk_id = NodeId(*sk_id as u64);
                 let Some(sk) = locked.safekeepers.get(&sk_id) else {
                     return Err(ApiError::InternalServerError(anyhow::anyhow!(
                         "Couldn't find safekeeper with id {sk_id}"
@@ -345,7 +352,7 @@ impl Service {
                     tenant_id,
                     timeline_id: Some(timeline_id),
                     generation: tl.generation as u32,
-                    kind,
+                    kind: SafekeeperTimelineOpKind::Delete,
                 };
                 locked.safekeeper_reconcilers.schedule_request(self, req);
             }
@@ -379,32 +386,50 @@ impl Service {
             })
             .collect::<Result<Vec<_>, ApiError>>()?;
 
-        // Remove pending ops from db.
+        // Remove pending ops from db, and set `deleted_at`.
         // We cancel them in a later iteration once we hold the state lock.
         for (timeline_id, _timeline) in timeline_list.iter() {
             self.persistence
                 .remove_pending_ops_for_timeline(tenant_id, Some(*timeline_id))
                 .await?;
+            self.persistence
+                .timeline_set_deleted_at(tenant_id, *timeline_id)
+                .await?;
         }
 
-        let mut locked = self.inner.write().unwrap();
-
         // The list of safekeepers that have any of the timelines
         let mut sk_list = HashSet::new();
 
         // List all pending ops for all timelines, cancel them
-        for (timeline_id, timeline) in timeline_list.iter() {
+        for (_timeline_id, timeline) in timeline_list.iter() {
             let sk_iter = timeline
                 .sk_set
                 .iter()
                 .chain(timeline.new_sk_set.iter().flatten())
                 .map(|id| NodeId(*id as u64));
-            for sk_id in sk_iter.clone() {
+            sk_list.extend(sk_iter);
+        }
+
+        for &sk_id in sk_list.iter() {
+            let pending_op = TimelinePendingOpPersistence {
+                tenant_id: tenant_id.to_string(),
+                timeline_id: String::new(),
+                generation: i32::MAX,
+                op_kind: SafekeeperTimelineOpKind::Delete,
+                sk_id: sk_id.0 as i64,
+            };
+            tracing::info!("writing pending op for sk id {sk_id}");
+            self.persistence.insert_pending_op(pending_op).await?;
+        }
+
+        let mut locked = self.inner.write().unwrap();
+
+        for (timeline_id, _timeline) in timeline_list.iter() {
+            for sk_id in sk_list.iter() {
                 locked
                     .safekeeper_reconcilers
-                    .cancel_reconciles_for_timeline(sk_id, tenant_id, Some(*timeline_id));
+                    .cancel_reconciles_for_timeline(*sk_id, tenant_id, Some(*timeline_id));
             }
-            sk_list.extend(sk_iter);
         }
 
         // unwrap is safe: we return above for an empty timeline list

From 02936b82c5c342c07be7f1573da4d404bbe4fa90 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 1 Apr 2025 18:48:02 +0300
Subject: [PATCH 011/140] Fix effective_lsn calculation for prefetch (#11219)

## Problem

See  https://neondb.slack.com/archives/C04DGM6SMTM/p1741594233757489

Consider the following scenario:

1. Backend A wants to prefetch some block B
2. Backend A checks that block B is not present in shared buffer
3. Backend A registers new prefetch request and calls
prefetch_do_request
4. prefetch_do_request calls neon_get_request_lsns
5. neon_get_request_lsns obtains LwLSN for block B
6. Backend B downloads B, updates and wallogs it (let say to Lsn1)
7. Block B is once again thrown from shared buffers, its LwLSN is set to
Lsn1
8. Backend A obtains current flush LSN, let's say that it is Lsn1
9. Backend A stores Lsn1 as effective_lsn in prefetch slot.
10. Backend A reads page B with LwLSN=Lsn1
11. Backend A finds in prefetch ring response for prefetch request for
block B with effective_lsn=Lsn1, so that it satisfies
neon_prefetch_response_usable condition
12. Backend A uses deteriorated version of the page!

## Summary of changes

Use `not_modified_since` as `effective_lsn`.
It should not cause some degrade of performance because we store LwLSN
when it was not found in LwLSN hash, so if page is not changed till
prefetch response is arrived, then LwLSN should not be changed.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/pagestore_smgr.c | 43 +++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 2424a5fcb6..0baa23cc30 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -2384,7 +2384,6 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 						 LSN_FORMAT_ARGS(last_written_lsn),
 						 LSN_FORMAT_ARGS(flushlsn));
 				XLogFlush(last_written_lsn);
-				flushlsn = last_written_lsn;
 			}
 
 			/*
@@ -2400,18 +2399,35 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 			 * requesting the latest page, by setting request LSN to
 			 * UINT64_MAX.
 			 *
-			 * Remember the current LSN, however, so that we can later
-			 * correctly determine if the response to the request is still
-			 * valid. The most up-to-date LSN we could use for that purpose
-			 * would be the current insert LSN, but to avoid the overhead of
-			 * looking it up, use 'flushlsn' instead. This relies on the
-			 * assumption that if the page was modified since the last WAL
-			 * flush, it should still be in the buffer cache, and we
-			 * wouldn't be requesting it.
+			 * effective_request_lsn is used to check that received response is still valid.
+			 * In case of primary node it is last written LSN. Originally we used flush_lsn here,
+			 * but it is not correct. Consider the following scenario:
+			 * 1. Backend A wants to prefetch block X
+			 * 2. Backend A checks that block X is not present in the shared buffer cache
+			 * 3. Backend A calls prefetch_do_request, which calls neon_get_request_lsns
+			 * 4. neon_get_request_lsns obtains LwLSN=11 for the block
+			 * 5. Backend B downloads block X, updates and wallogs it with LSN=13
+			 * 6. Block X is once again evicted from shared buffers, its LwLSN is set to LSN=13
+			 * 7. Backend A is still executing in neon_get_request_lsns(). It calls 'flushlsn = GetFlushRecPtr();'.
+			 *    Let's say that it is LSN=14
+			 * 8. Backend A uses LSN=14 as effective_lsn in the prefetch slot. The request stored in the slot is
+			 *    [not_modified_since=11, effective_request_lsn=14]
+			 * 9. Backend A sends the prefetch request, pageserver processes it, and sends response.
+			 *    The last LSN that the pageserver had processed was LSN=12, so the page image in the response is valid at LSN=12.
+			 * 10. Backend A calls smgrread() for page X with LwLSN=13
+			 * 11. Backend A finds in prefetch ring the response for the prefetch request with [not_modified_since=11, effective_lsn=Lsn14],
+			 * so it satisfies neon_prefetch_response_usable condition.
+			 *
+			 * Things go wrong in step 7-8, when [not_modified_since=11, effective_request_lsn=14] is determined for the request.
+			 * That is incorrect, because the page has in fact been modified at LSN=13. The invariant is that for any request,
+			 * there should not be any modifications to a page between its not_modified_since and (effective_)request_lsn values.
+			 *
+			 * The problem can be fixed by callingGetFlushRecPtr() before checking if the page is in the buffer cache.
+			 * But you can't do that within smgrprefetch(), would need to modify the caller.
 			 */
 			result->request_lsn = UINT64_MAX;
 			result->not_modified_since = last_written_lsn;
-			result->effective_request_lsn = flushlsn;
+			result->effective_request_lsn = last_written_lsn;
 		}
 	}
 }
@@ -2470,11 +2486,8 @@ neon_prefetch_response_usable(neon_request_lsns *request_lsns,
 	 * `not_modified_since` and `request_lsn` are sent to the pageserver, but
 	 * in the primary node, we always use UINT64_MAX as the `request_lsn`, so
 	 * we remember `effective_request_lsn` separately. In a primary,
-	 * `effective_request_lsn` is the last flush WAL position when the request
-	 * was sent to the pageserver. That's logically the LSN that we are
-	 * requesting the page at, but we send UINT64_MAX to the pageserver so
-	 * that if the GC horizon advances past that position, we still get a
-	 * valid response instead of an error.
+	 * `effective_request_lsn` is the same as  `not_modified_since`.
+	 * See comments in neon_get_request_lsns why we can not use last flush WAL position here.
 	 *
 	 * To determine whether a response to a GetPage request issued earlier is
 	 * still valid to satisfy a new page read, we look at the

From c4fc602115b15fcd2a684ba911fab8bb92035afe Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 1 Apr 2025 14:50:58 -0400
Subject: [PATCH 012/140] feat(pageserver): support synthetic size calculation
 for invisible branches (#11335)

## Problem

ref https://github.com/neondatabase/neon/issues/11279


Imagine we have a branch with 3 snapshots A, B, and C:
```
base---+---+---+---main
        \-A \-B \-C
base=100G, base-A=1G, A-B=1G, B-C=1G, C-main=1G
```
at this point, the synthetic size should be 100+1+1+1+1=104G.

after the deletion, the structure looks like:
```
base---+---+---+
       \-A \-B \-C
```
If we simply assume main never exists, the size will be calculated as
size(A) + size(B) + size(C)=300GB, which obviously is not what the user
would expect.

The correct way to do this is to assume part of main still exists, that
is to say, set C-main=1G:
```
base---+---+---+main
       \-A \-B \-C
```
And we will get the correct synthetic size of 100G+1+1+1=103G.


## Summary of changes

* Do not generate gc cutoff point for invisible branches.
* Use the same LSN as the last branchpoint for branch end.
* Remove test_api_handler for mark_invisible.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/http/routes.rs     |   2 +-
 pageserver/src/tenant.rs          | 251 ++++++++++++++++++++++++++++++
 pageserver/src/tenant/size.rs     |  25 ++-
 pageserver/src/tenant/timeline.rs |   4 +
 4 files changed, 277 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 2bedf9e11a..3f36ff9904 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3774,7 +3774,7 @@ pub fn make_router(
         )
         .put(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/mark_invisible",
-            |r| testing_api_handler("mark timeline invisible", r, timeline_mark_invisible_handler),
+            |r| api_handler( r, timeline_mark_invisible_handler),
         )
         .put(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 3ed4103792..f1dbb274b9 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -11526,4 +11526,255 @@ mod tests {
 
         Ok(())
     }
+
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {
+        use pageserver_api::models::TimelineVisibilityState;
+
+        use crate::tenant::size::gather_inputs;
+
+        let tenant_conf = pageserver_api::models::TenantConfig {
+            // Ensure that we don't compute gc_cutoffs (which needs reading the layer files)
+            pitr_interval: Some(Duration::ZERO),
+            ..Default::default()
+        };
+        let harness = TenantHarness::create_custom(
+            "test_synthetic_size_calculation_with_invisible_branches",
+            tenant_conf,
+            TenantId::generate(),
+            ShardIdentity::unsharded(),
+            Generation::new(0xdeadbeef),
+        )
+        .await?;
+        let (tenant, ctx) = harness.load().await;
+        let main_tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![],
+                vec![],
+                vec![],
+                Lsn(0x100),
+            )
+            .await?;
+
+        let snapshot1 = TimelineId::from_array(hex!("11223344556677881122334455667790"));
+        tenant
+            .branch_timeline_test_with_layers(
+                &main_tline,
+                snapshot1,
+                Some(Lsn(0x20)),
+                &ctx,
+                vec![],
+                vec![],
+                Lsn(0x50),
+            )
+            .await?;
+        let snapshot2 = TimelineId::from_array(hex!("11223344556677881122334455667791"));
+        tenant
+            .branch_timeline_test_with_layers(
+                &main_tline,
+                snapshot2,
+                Some(Lsn(0x30)),
+                &ctx,
+                vec![],
+                vec![],
+                Lsn(0x50),
+            )
+            .await?;
+        let snapshot3 = TimelineId::from_array(hex!("11223344556677881122334455667792"));
+        tenant
+            .branch_timeline_test_with_layers(
+                &main_tline,
+                snapshot3,
+                Some(Lsn(0x40)),
+                &ctx,
+                vec![],
+                vec![],
+                Lsn(0x50),
+            )
+            .await?;
+        let limit = Arc::new(Semaphore::new(1));
+        let max_retention_period = None;
+        let mut logical_size_cache = HashMap::new();
+        let cause = LogicalSizeCalculationCause::EvictionTaskImitation;
+        let cancel = CancellationToken::new();
+
+        let inputs = gather_inputs(
+            &tenant,
+            &limit,
+            max_retention_period,
+            &mut logical_size_cache,
+            cause,
+            &cancel,
+            &ctx,
+        )
+        .instrument(info_span!(
+            "gather_inputs",
+            tenant_id = "unknown",
+            shard_id = "unknown",
+        ))
+        .await?;
+        use crate::tenant::size::{LsnKind, ModelInputs, SegmentMeta};
+        use LsnKind::*;
+        use tenant_size_model::Segment;
+        let ModelInputs { mut segments, .. } = inputs;
+        segments.retain(|s| s.timeline_id == TIMELINE_ID);
+        for segment in segments.iter_mut() {
+            segment.segment.parent = None; // We don't care about the parent for the test
+            segment.segment.size = None; // We don't care about the size for the test
+        }
+        assert_eq!(
+            segments,
+            [
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x10,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchStart,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x20,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x30,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x40,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x100,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: GcCutOff,
+                }, // we need to retain everything above the last branch point
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x100,
+                        size: None,
+                        needed: true,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchEnd,
+                },
+            ]
+        );
+
+        main_tline
+            .remote_client
+            .schedule_index_upload_for_timeline_invisible_state(
+                TimelineVisibilityState::Invisible,
+            )?;
+        main_tline.remote_client.wait_completion().await?;
+        let inputs = gather_inputs(
+            &tenant,
+            &limit,
+            max_retention_period,
+            &mut logical_size_cache,
+            cause,
+            &cancel,
+            &ctx,
+        )
+        .instrument(info_span!(
+            "gather_inputs",
+            tenant_id = "unknown",
+            shard_id = "unknown",
+        ))
+        .await?;
+        let ModelInputs { mut segments, .. } = inputs;
+        segments.retain(|s| s.timeline_id == TIMELINE_ID);
+        for segment in segments.iter_mut() {
+            segment.segment.parent = None; // We don't care about the parent for the test
+            segment.segment.size = None; // We don't care about the size for the test
+        }
+        assert_eq!(
+            segments,
+            [
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x10,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchStart,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x20,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x30,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x40,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x40, // Branch end LSN == last branch point LSN
+                        size: None,
+                        needed: true,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchEnd,
+                },
+            ]
+        );
+        Ok(())
+    }
 }
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index 8cc94b4e4d..c7ac50ca6a 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -33,7 +33,7 @@ pub struct ModelInputs {
 }
 
 /// A [`Segment`], with some extra information for display purposes
-#[derive(Debug, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq, Eq)]
 pub struct SegmentMeta {
     pub segment: Segment,
     pub timeline_id: TimelineId,
@@ -248,6 +248,8 @@ pub(super) async fn gather_inputs(
             None
         };
 
+        let branch_is_invisible = timeline.is_invisible() == Some(true);
+
         let lease_points = gc_info
             .leases
             .keys()
@@ -271,7 +273,10 @@ pub(super) async fn gather_inputs(
             .map(|(lsn, _child_id, _is_offloaded)| (lsn, LsnKind::BranchPoint))
             .collect::<Vec<_>>();
 
-        lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
+        if !branch_is_invisible {
+            // Do not count lease points for invisible branches.
+            lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
+        }
 
         drop(gc_info);
 
@@ -287,7 +292,9 @@ pub(super) async fn gather_inputs(
 
         // Add a point for the PITR cutoff
         let branch_start_needed = next_pitr_cutoff <= branch_start_lsn;
-        if !branch_start_needed {
+        if !branch_start_needed && !branch_is_invisible {
+            // Only add the GcCutOff point when the timeline is visible; otherwise, do not compute the size for the LSN
+            // range from the last branch point to the latest data.
             lsns.push((next_pitr_cutoff, LsnKind::GcCutOff));
         }
 
@@ -373,11 +380,19 @@ pub(super) async fn gather_inputs(
             }
         }
 
+        let branch_end_lsn = if branch_is_invisible {
+            // If the branch is invisible, the branch end is the last requested LSN (likely a branch cutoff point).
+            segments.last().unwrap().segment.lsn
+        } else {
+            // Otherwise, the branch end is the last record LSN.
+            last_record_lsn.0
+        };
+
         // Current end of the timeline
         segments.push(SegmentMeta {
             segment: Segment {
                 parent: Some(parent),
-                lsn: last_record_lsn.0,
+                lsn: branch_end_lsn,
                 size: None, // Filled in later, if necessary
                 needed: true,
             },
@@ -609,6 +624,7 @@ async fn calculate_logical_size(
     Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
 }
 
+#[cfg(test)]
 #[test]
 fn verify_size_for_multiple_branches() {
     // this is generated from integration test test_tenant_size_with_multiple_branches, but this way
@@ -766,6 +782,7 @@ fn verify_size_for_multiple_branches() {
     assert_eq!(inputs.calculate(), 37_851_408);
 }
 
+#[cfg(test)]
 #[test]
 fn verify_size_for_one_branch() {
     let doc = r#"
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 7c9c9a45d4..751a8acb89 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2215,6 +2215,10 @@ impl Timeline {
         self.remote_client.is_archived()
     }
 
+    pub(crate) fn is_invisible(&self) -> Option<bool> {
+        self.remote_client.is_invisible()
+    }
+
     pub(crate) fn is_stopping(&self) -> bool {
         self.current_state() == TimelineState::Stopping
     }

From 7dc83708488db45b533723deef1d5635caf9ee0e Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 2 Apr 2025 11:08:05 +0100
Subject: [PATCH 013/140] storcon: do graceful migrations from chaos injector
 (#11028)

## Problem

Followup to https://github.com/neondatabase/neon/pull/10913

Existing chaos injection just does simple cutovers to secondary
locations. Let's also exercise code for doing graceful migrations. This
should implicitly test how such migrations cope with overlapping with
service restarts.

## Summary of changes
---
 .../src/service/chaos_injector.rs             | 114 +++++++++++++++---
 1 file changed, 98 insertions(+), 16 deletions(-)

diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs
index a0419e0205..9c7a9e3798 100644
--- a/storage_controller/src/service/chaos_injector.rs
+++ b/storage_controller/src/service/chaos_injector.rs
@@ -4,7 +4,7 @@ use std::time::Duration;
 
 use pageserver_api::controller_api::ShardSchedulingPolicy;
 use rand::seq::SliceRandom;
-use rand::thread_rng;
+use rand::{Rng, thread_rng};
 use tokio_util::sync::CancellationToken;
 use utils::id::NodeId;
 use utils::shard::TenantShardId;
@@ -64,17 +64,22 @@ impl ChaosInjector {
         let mut interval = tokio::time::interval(self.interval);
         #[derive(Debug)]
         enum ChaosEvent {
-            ShuffleTenant,
-            ForceKill,
+            MigrationsToSecondary,
+            ForceKillController,
+            GracefulMigrationsAnywhere,
         }
         loop {
             let cron_interval = self.get_cron_interval_sleep_future();
             let chaos_type = tokio::select! {
                 _ = interval.tick() => {
-                    ChaosEvent::ShuffleTenant
+                    if thread_rng().gen_bool(0.5) {
+                        ChaosEvent::MigrationsToSecondary
+                    } else {
+                        ChaosEvent::GracefulMigrationsAnywhere
+                    }
                 }
                 Some(_) = maybe_sleep(cron_interval) => {
-                    ChaosEvent::ForceKill
+                    ChaosEvent::ForceKillController
                 }
                 _ = cancel.cancelled() => {
                     tracing::info!("Shutting down");
@@ -83,16 +88,29 @@ impl ChaosInjector {
             };
             tracing::info!("Chaos iteration: {chaos_type:?}...");
             match chaos_type {
-                ChaosEvent::ShuffleTenant => {
-                    self.inject_chaos().await;
+                ChaosEvent::MigrationsToSecondary => {
+                    self.inject_migrations_to_secondary();
                 }
-                ChaosEvent::ForceKill => {
+                ChaosEvent::GracefulMigrationsAnywhere => {
+                    self.inject_graceful_migrations_anywhere();
+                }
+                ChaosEvent::ForceKillController => {
                     self.force_kill().await;
                 }
             }
         }
     }
 
+    fn is_shard_eligible_for_chaos(&self, shard: &TenantShard) -> bool {
+        // - Skip non-active scheduling policies, so that a shard with a policy like Pause can
+        //   be pinned without being disrupted by us.
+        // - Skip shards doing a graceful migration already, so that we allow these to run to
+        //   completion rather than only exercising the first part and then cancelling with
+        //   some other chaos.
+        !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active)
+            && shard.get_preferred_node().is_none()
+    }
+
     /// If a shard has a secondary and attached location, then re-assign the secondary to be
     /// attached and the attached to be secondary.
     ///
@@ -108,13 +126,7 @@ impl ChaosInjector {
             .get_mut(&tenant_shard_id)
             .expect("Held lock between choosing ID and this get");
 
-        if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) {
-            // Skip non-active scheduling policies, so that a shard with a policy like Pause can
-            // be pinned without being disrupted by us.
-            tracing::info!(
-                "Skipping shard {tenant_shard_id}: scheduling policy is {:?}",
-                shard.get_scheduling_policy()
-            );
+        if !self.is_shard_eligible_for_chaos(shard) {
             return;
         }
 
@@ -152,7 +164,77 @@ impl ChaosInjector {
         std::process::exit(1);
     }
 
-    async fn inject_chaos(&mut self) {
+    // Unlike [`Self::inject_migrations_to_secondary`], this function will not only cut over to secondary, it
+    // will migrate a tenant to a random node in its home AZ using a graceful migration of the same type
+    // that my be initiated by an API caller using prewarm=true.
+    //
+    // This is a much more expensive operation in terms of I/O and time, as we will fully warm up
+    // some new location in order to migrate the tenant there.  For that reason we do far fewer of these.
+    fn inject_graceful_migrations_anywhere(&mut self) {
+        let batch_size = 1;
+        let mut inner = self.service.inner.write().unwrap();
+        let (nodes, tenants, _scheduler) = inner.parts_mut();
+
+        let mut candidates = tenants
+            .values_mut()
+            .filter(|shard| self.is_shard_eligible_for_chaos(shard))
+            .collect::<Vec<_>>();
+
+        tracing::info!(
+            "Injecting chaos: found {} candidates for graceful migrations anywhere",
+            candidates.len()
+        );
+
+        let mut victims: Vec<&mut TenantShard> = Vec::new();
+
+        // Pick our victims: use a hand-rolled loop rather than choose_multiple() because we want
+        // to take the mutable refs from our candidates rather than ref'ing them.
+        while !candidates.is_empty() && victims.len() < batch_size {
+            let i = thread_rng().gen_range(0..candidates.len());
+            victims.push(candidates.swap_remove(i));
+        }
+
+        for victim in victims.into_iter() {
+            // Find a node in the same AZ as the shard, or if the shard has no AZ preference, which
+            // is not where they are currently attached.
+            let candidate_nodes = nodes
+                .values()
+                .filter(|node| {
+                    if let Some(preferred_az) = victim.preferred_az() {
+                        node.get_availability_zone_id() == preferred_az
+                    } else if let Some(attached) = *victim.intent.get_attached() {
+                        node.get_id() != attached
+                    } else {
+                        true
+                    }
+                })
+                .collect::<Vec<_>>();
+
+            let Some(victim_node) = candidate_nodes.choose(&mut thread_rng()) else {
+                // This can happen if e.g. we are in a small region with only one pageserver per AZ.
+                tracing::info!(
+                    "no candidate nodes found for migrating shard {tenant_shard_id} within its home AZ",
+                    tenant_shard_id = victim.tenant_shard_id
+                );
+                continue;
+            };
+
+            // This doesn't change intent immediately: next iteration of Service::optimize_all should do that.  We avoid
+            // doing it here because applying optimizations requires dropping lock to do some async work to check the optimisation
+            // is valid given remote state, and it would be a shame to duplicate that dance here.
+            tracing::info!(
+                "Injecting chaos: migrate {} to {}",
+                victim.tenant_shard_id,
+                victim_node
+            );
+            victim.set_preferred_node(Some(victim_node.get_id()));
+        }
+    }
+
+    /// Migrations of attached locations to their secondary location.  This exercises reconciliation in general,
+    /// live migration in particular, and the pageserver code for cleanly shutting down and starting up tenants
+    /// during such migrations.
+    fn inject_migrations_to_secondary(&mut self) {
         // Pick some shards to interfere with
         let batch_size = 128;
         let mut inner = self.service.inner.write().unwrap();

From 4bc6dbdd5f72a015d89c642368dffff5f8d4c8e6 Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Wed, 2 Apr 2025 12:43:05 +0200
Subject: [PATCH 014/140] use a prod-like shared_buffers size for some perf
 unit tests (#11373)

## Problem

In Neon DBaaS we adjust the shared_buffers to the size of the compute,
or better described we adjust the max number of connections to the
compute size and we adjust the shared_buffers size to the number of max
connections according to about the following sizes
`2 CU: 225mb; 4 CU: 450mb; 8 CU: 900mb`

[see](https://github.com/neondatabase/cloud/blob/877e33b4289a471b8f0a35c84009846358f3e5a3/goapp/controlplane/internal/pkg/compute/computespec/pg_settings.go#L405)

## Summary of changes

We should run perf unit tests with settings that is realistic for a
paying customer and select 8 CU as the reference for those tests.
---
 test_runner/fixtures/utils.py                   | 17 +++++++++++++++++
 test_runner/performance/test_bulk_update.py     |  6 +++++-
 .../performance/test_ingest_insert_bulk.py      |  5 ++++-
 .../performance/test_ingest_logical_message.py  |  2 +-
 test_runner/performance/test_parallel_copy.py   |  7 ++++++-
 .../performance/test_perf_many_relations.py     |  4 +++-
 test_runner/regress/test_import_pgdata.py       |  4 +++-
 .../regress/test_pageserver_getpage_throttle.py |  1 +
 .../regress/test_physical_replication.py        |  4 +++-
 9 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 4ece6e89a8..13c2d320d1 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -724,3 +724,20 @@ def skip_on_ci(reason: str):
         os.getenv("CI", "false") == "true",
         reason=reason,
     )
+
+
+def shared_buffers_for_max_cu(max_cu: float) -> str:
+    """
+    Returns the string value of shared_buffers for the given max CU.
+    Use shared_buffers size like in production for max CU compute.
+    See https://github.com/neondatabase/cloud/blob/877e33b4289a471b8f0a35c84009846358f3e5a3/goapp/controlplane/internal/pkg/compute/computespec/pg_settings.go#L405
+
+    e.g. // 2 CU: 225mb; 4 CU: 450mb; 8 CU: 900mb
+    """
+    ramBytes = int(4096 * max_cu * 1024 * 1024)
+    maxConnections = max(100, min(int(ramBytes / 9531392), 4000))
+    maxWorkerProcesses = 12 + int(max_cu * 2)
+    maxBackends = 1 + maxConnections + maxWorkerProcesses
+    sharedBuffersMb = int(max(128, (1023 + maxBackends * 256) / 1024))
+    sharedBuffers = int(sharedBuffersMb * 1024 / 8)
+    return str(sharedBuffers)
diff --git a/test_runner/performance/test_bulk_update.py b/test_runner/performance/test_bulk_update.py
index 6946bc66f2..16606268f4 100644
--- a/test_runner/performance/test_bulk_update.py
+++ b/test_runner/performance/test_bulk_update.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
+from fixtures.utils import shared_buffers_for_max_cu
 
 
 #
@@ -20,7 +21,10 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor)
 
     timeline_id = env.create_branch("test_bulk_update")
     tenant_id = env.initial_tenant
-    endpoint = env.endpoints.create_start("test_bulk_update")
+    # use shared_buffers size like in production for 8 CU compute
+    endpoint = env.endpoints.create_start(
+        "test_bulk_update", config_lines=[f"shared_buffers={shared_buffers_for_max_cu(8.0)}"]
+    )
     cur = endpoint.connect().cursor()
     cur.execute("set statement_timeout=0")
 
diff --git a/test_runner/performance/test_ingest_insert_bulk.py b/test_runner/performance/test_ingest_insert_bulk.py
index 283bcada31..01836b82e9 100644
--- a/test_runner/performance/test_ingest_insert_bulk.py
+++ b/test_runner/performance/test_ingest_insert_bulk.py
@@ -17,9 +17,10 @@ from fixtures.pageserver.utils import (
     wait_for_upload_queue_empty,
 )
 from fixtures.remote_storage import s3_storage
+from fixtures.utils import shared_buffers_for_max_cu
 
 
-@pytest.mark.timeout(900)
+@pytest.mark.timeout(1800)
 @pytest.mark.parametrize("size", [8, 1024, 8192])
 @pytest.mark.parametrize("s3", [True, False], ids=["s3", "local"])
 @pytest.mark.parametrize("backpressure", [True, False], ids=["backpressure", "nobackpressure"])
@@ -60,6 +61,8 @@ def test_ingest_insert_bulk(
             f"fsync = {fsync}",
             "max_replication_apply_lag = 0",
             f"max_replication_flush_lag = {'10GB' if backpressure else '0'}",
+            # use shared_buffers size like in production for 8 CU compute
+            f"shared_buffers={shared_buffers_for_max_cu(8.0)}",
             # NB: neon_local defaults to 15MB, which is too slow -- production uses 500MB.
             f"max_replication_write_lag = {'500MB' if backpressure else '0'}",
         ],
diff --git a/test_runner/performance/test_ingest_logical_message.py b/test_runner/performance/test_ingest_logical_message.py
index b55cb68b64..bc16e3964d 100644
--- a/test_runner/performance/test_ingest_logical_message.py
+++ b/test_runner/performance/test_ingest_logical_message.py
@@ -12,7 +12,7 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.utils import wait_for_last_record_lsn
 
 
-@pytest.mark.timeout(600)
+@pytest.mark.timeout(1200)
 @pytest.mark.parametrize("size", [1024, 8192, 131072])
 @pytest.mark.parametrize("fsync", [True, False], ids=["fsync", "nofsync"])
 def test_ingest_logical_message(
diff --git a/test_runner/performance/test_parallel_copy.py b/test_runner/performance/test_parallel_copy.py
index f1d1c1904b..f7f20bd33e 100644
--- a/test_runner/performance/test_parallel_copy.py
+++ b/test_runner/performance/test_parallel_copy.py
@@ -7,6 +7,8 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from fixtures.neon_fixtures import Endpoint, NeonEnv
 
+from fixtures.utils import shared_buffers_for_max_cu
+
 
 async def repeat_bytes(buf, repetitions: int):
     for _ in range(repetitions):
@@ -45,7 +47,10 @@ async def parallel_load_same_table(endpoint: Endpoint, n_parallel: int):
 # Load data into one table with COPY TO from 5 parallel connections
 def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5):
     env = neon_simple_env
-    endpoint = env.endpoints.create_start("main")
+    # use shared_buffers size like in production for 8 CU compute
+    endpoint = env.endpoints.create_start(
+        "main", config_lines=[f"shared_buffers={shared_buffers_for_max_cu(8.0)}"]
+    )
 
     # Create test table
     conn = endpoint.connect()
diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py
index e2f0a79018..81dae53759 100644
--- a/test_runner/performance/test_perf_many_relations.py
+++ b/test_runner/performance/test_perf_many_relations.py
@@ -6,6 +6,7 @@ from fixtures.benchmark_fixture import NeonBenchmarker
 from fixtures.compare_fixtures import RemoteCompare
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.utils import shared_buffers_for_max_cu
 
 
 def get_num_relations(default: int = 1000) -> list[int]:
@@ -78,7 +79,8 @@ def test_perf_simple_many_relations_reldir_v2(
     ep = env.endpoints.create_start(
         "main",
         config_lines=[
-            "shared_buffers=1000MB",
+            # use shared_buffers size like in production for 8 CU compute
+            f"shared_buffers={shared_buffers_for_max_cu(8.0)}",
             "max_locks_per_transaction=16384",
         ],
     )
diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py
index ca794f6685..6b3b71f29c 100644
--- a/test_runner/regress/test_import_pgdata.py
+++ b/test_runner/regress/test_import_pgdata.py
@@ -19,6 +19,7 @@ from fixtures.pageserver.http import (
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import MockS3Server, RemoteStorageKind
+from fixtures.utils import shared_buffers_for_max_cu
 from mypy_boto3_kms import KMSClient
 from mypy_boto3_kms.type_defs import EncryptResponseTypeDef
 from mypy_boto3_s3 import S3Client
@@ -80,7 +81,8 @@ def test_pgdata_import_smoke(
     # doesn't allow any prefetching on v17 and above, where the new streaming
     # read machinery keeps buffers pinned while prefetching them.  Use a higher
     # setting to enable prefetching and speed up the tests
-    ep_config = ["shared_buffers=64MB"]
+    # use shared_buffers size like in production for 8 CU compute
+    ep_config = [f"shared_buffers={shared_buffers_for_max_cu(8.0)}"]
 
     #
     # Put data in vanilla pg
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index c5d6650ca8..e84876651c 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -16,6 +16,7 @@ if TYPE_CHECKING:
     from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
 
 
+@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/11395")
 def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     env = neon_env_builder.init_start()
 
diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py
index 17819fd367..1ebf70dbf2 100644
--- a/test_runner/regress/test_physical_replication.py
+++ b/test_runner/regress/test_physical_replication.py
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import wait_replica_caughtup
+from fixtures.utils import shared_buffers_for_max_cu
 
 if TYPE_CHECKING:
     from fixtures.neon_fixtures import NeonEnv
@@ -180,7 +181,8 @@ def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_en
         endpoint_id="primary",
         config_lines=[
             "max_connections=1000",
-            "shared_buffers=128MB",  # prevent "no unpinned buffers available" error
+            # use shared_buffers size like in production for 2 CU compute
+            f"shared_buffers={shared_buffers_for_max_cu(2.0)}",  # prevent "no unpinned buffers available" error
         ],
     )
     secondary = env.endpoints.new_replica_start(

From c179d098efb9521498a00001d16b8cc1555b1eff Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Wed, 2 Apr 2025 13:18:37 +0200
Subject: [PATCH 015/140] Increase the timeout for extensions upgrade tests (on
 schedule). (#11406)

## Problem
Sometimes the forced extension upgrade test fails (on schedule) due to a
timeout.
## Summary of changes
The timeout is increased to 60 mins.
---
 .github/workflows/force-test-extensions-upgrade.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml
index 896ec4a0c1..9c9357055d 100644
--- a/.github/workflows/force-test-extensions-upgrade.yml
+++ b/.github/workflows/force-test-extensions-upgrade.yml
@@ -55,7 +55,7 @@ jobs:
           echo tag=${tag} >> ${GITHUB_OUTPUT}
 
       - name: Test extension upgrade
-        timeout-minutes: 20
+        timeout-minutes: 60
         env:
           NEW_COMPUTE_TAG: latest
           OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}

From 47f5bcf2bcef94e6cdc9a9ec1b07f8bf94bf1c11 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 2 Apr 2025 14:55:15 +0200
Subject: [PATCH 016/140] pageserver: don't periodically flush layers for stale
 attachments (#11317)

## Problem

Tenants in attachment state `Stale` can't upload layers, and don't run
compaction, but still do periodic L0 layer flushes in the tenant
housekeeping loop. If the tenant remains stuck in stale mode, this
causes a large buildup of L0 layers, causing logging, metrics increases,
and possibly alerts.

Resolves #11245.

## Summary of changes

Don't perform periodic layer flushes in stale attachment state.
---
 pageserver/src/tenant.rs | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f1dbb274b9..15853133d6 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3248,17 +3248,23 @@ impl Tenant {
     async fn housekeeping(&self) {
         // Call through to all timelines to freeze ephemeral layers as needed. This usually happens
         // during ingest, but we don't want idle timelines to hold open layers for too long.
-        let timelines = self
-            .timelines
-            .lock()
-            .unwrap()
-            .values()
-            .filter(|tli| tli.is_active())
-            .cloned()
-            .collect_vec();
+        //
+        // We don't do this if the tenant can't upload layers (i.e. it's in stale attachment mode).
+        // We don't run compaction in this case either, and don't want to keep flushing tiny L0
+        // layers that won't be compacted down.
+        if self.tenant_conf.load().location.may_upload_layers_hint() {
+            let timelines = self
+                .timelines
+                .lock()
+                .unwrap()
+                .values()
+                .filter(|tli| tli.is_active())
+                .cloned()
+                .collect_vec();
 
-        for timeline in timelines {
-            timeline.maybe_freeze_ephemeral_layer().await;
+            for timeline in timelines {
+                timeline.maybe_freeze_ephemeral_layer().await;
+            }
         }
 
         // Shut down walredo if idle.

From 66678108002e50445c4222ce019e543e4a313dcd Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Wed, 2 Apr 2025 16:20:52 +0200
Subject: [PATCH 017/140] chore(compute_ctl): Minor code and comment fixes
 (#11411)

## Problem

In #11376 I mistakenly reworded one comment and also forgot to commit
one of the suggestions.

## Summary of changes

Fix it here.
---
 compute_tools/src/compute.rs | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index f27bf164ae..70b91c781a 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -188,7 +188,7 @@ impl ComputeState {
 
         COMPUTE_CTL_UP.reset();
         COMPUTE_CTL_UP
-            .with_label_values(&[&BUILD_TAG, format!("{}", status).as_str()])
+            .with_label_values(&[&BUILD_TAG, status.to_string().as_str()])
             .set(1);
     }
 
@@ -360,6 +360,14 @@ impl ComputeNode {
             this.prewarm_postgres()?;
         }
 
+        // Set the up metric with Empty status before starting the HTTP server.
+        // That way on the first metric scrape, an external observer will see us
+        // as 'up' and 'empty' (unless the compute was started with a spec or
+        // already configured by control plane).
+        COMPUTE_CTL_UP
+            .with_label_values(&[&BUILD_TAG, ComputeStatus::Empty.to_string().as_str()])
+            .set(1);
+
         // Launch the external HTTP server first, so that we can serve control plane
         // requests while configuration is still in progress.
         crate::http::server::Server::External {
@@ -369,19 +377,13 @@ impl ComputeNode {
         }
         .launch(&this);
 
-        // The internal HTTP server is needed for a further activation by control plane
-        // if compute was started for a pool, so we have to start server before hanging
-        // waiting for a spec.
+        // The internal HTTP server could be launched later, but there isn't much
+        // sense in waiting.
         crate::http::server::Server::Internal {
             port: this.params.internal_http_port,
         }
         .launch(&this);
 
-        // HTTP server is running, so we can officially declare compute_ctl as 'up'
-        COMPUTE_CTL_UP
-            .with_label_values(&[&BUILD_TAG, ComputeStatus::Empty.to_string().as_str()])
-            .set(1);
-
         // If we got a spec from the CLI already, use that. Otherwise wait for the
         // control plane to pass it to us with a /configure HTTP request
         let pspec = if let Some(cli_spec) = cli_spec {

From 3c2bc5baba3c61c49abb46fe3fc91e0a174f9df6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= <jc@neon.tech>
Date: Wed, 2 Apr 2025 16:32:53 +0200
Subject: [PATCH 018/140] fix(ci): run checks on release PRs (#11375)

## Problem
Hotfix releases mean that sometimes changes in release PRs haven't been
tested and linted yet. Disabling tests and lints is therefore not
necessarily safe. In the future we will check whether tests have run on
the same git tree already to speed things up, but for now we need to
turn tests back on fully. This partially reverts:
https://github.com/neondatabase/neon/pull/11272

## Summary of changes
Run checks on `.*-rc-pr` runs.
---
 .github/workflows/build_and_test.yml | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 7d35066616..0ef9baf2f7 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -89,8 +89,8 @@ jobs:
 
   check-codestyle-python:
     needs: [ meta, check-permissions, build-build-tools-image ]
-    # No need to run on `main` because we this in the merge queue
-    if: ${{ needs.meta.outputs.run-kind == 'pr' }}
+    # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
     uses: ./.github/workflows/_check-codestyle-python.yml
     with:
       build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
@@ -98,7 +98,8 @@ jobs:
 
   check-codestyle-jsonnet:
     needs: [ meta, check-permissions, build-build-tools-image ]
-    if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }}
+    # We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
     runs-on: [ self-hosted, small ]
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -181,8 +182,8 @@ jobs:
 
   check-codestyle-rust:
     needs: [ meta, check-permissions, build-build-tools-image ]
-    # No need to run on `main` because we this in the merge queue
-    if: ${{ needs.meta.outputs.run-kind == 'pr' }}
+    # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
     uses: ./.github/workflows/_check-codestyle-rust.yml
     with:
       build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
@@ -191,7 +192,8 @@ jobs:
 
   check-dependencies-rust:
     needs: [ meta, files-changed, build-build-tools-image ]
-    if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr' }}
+    # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
     uses: ./.github/workflows/cargo-deny.yml
     with:
       build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
@@ -199,7 +201,8 @@ jobs:
 
   build-and-test-locally:
     needs: [ meta, build-build-tools-image ]
-    if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }}
+    # We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
     strategy:
       fail-fast: false
       matrix:
@@ -1565,10 +1568,10 @@ jobs:
         if: |
           contains(needs.*.result, 'failure')
           || contains(needs.*.result, 'cancelled')
-          || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr')
-          || (needs.build-and-test-locally.result == 'skipped' && needs.meta.outputs.run-kind == 'pr')
-          || (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.run-kind == 'pr')
-          || (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.run-kind == 'pr')
+          || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
+          || (needs.build-and-test-locally.result == 'skipped' && contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
+          || (needs.check-codestyle-python.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
+          || (needs.check-codestyle-rust.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
           || needs.files-changed.result == 'skipped'
           || (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
           || (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind))

From 9df230c837691d526c3d37d8e167fc492a936baf Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 2 Apr 2025 17:11:52 +0200
Subject: [PATCH 019/140] storcon: improve autosplit defaults (#11332)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

In #11122, we changed the autosplit behavior to allow repeated and
initial splits. The defaults were set such that they retain the current
production settings (8 shards at 64 GB). However, these defaults don't
really make sense by themselves.

Once we deploy new settings to production, we should change the defaults
to something more reasonable.

## Summary of changes

Changes the following default settings:

* `max_split_shards`: 8 → 16
* `initial_split_threshold`: 64 GB → disabled
* `initial_split_shards`: 8 → 2
---
 storage_controller/src/main.rs | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 1a7f9a2366..8c834f9acb 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -115,19 +115,17 @@ struct Cli {
     #[arg(long)]
     split_threshold: Option<u64>,
 
-    /// Maximum number of shards during autosplits. 0 disables autosplits.
-    // TODO: defaults to 8 for backwards compatibility, should default to 255.
-    #[arg(long, default_value = "8")]
+    /// Maximum number of shards during autosplits. 0 disables autosplits. Defaults
+    /// to 16 as a safety to avoid too many shards by accident.
+    #[arg(long, default_value = "16")]
     max_split_shards: u8,
 
     /// Size threshold for initial shard splits of unsharded tenants. 0 disables initial splits.
-    // TODO: defaults to 64 GB for backwards compatibility. Should default to None.
-    #[arg(long, default_value = "68719476736")]
-    initial_split_threshold: u64,
+    #[arg(long)]
+    initial_split_threshold: Option<u64>,
 
-    /// Number of target shards for initial splits. 0 or 1 disables initial splits.
-    // TODO: defaults to 8 for backwards compatibility. Should default to 2.
-    #[arg(long, default_value = "8")]
+    /// Number of target shards for initial splits. 0 or 1 disables initial splits. Defaults to 2.
+    #[arg(long, default_value = "2")]
     initial_split_shards: u8,
 
     /// Maximum number of normal-priority reconcilers that may run in parallel
@@ -417,7 +415,7 @@ async fn async_main() -> anyhow::Result<()> {
         tenant_rate_limit: args.tenant_rate_limit,
         split_threshold: args.split_threshold,
         max_split_shards: args.max_split_shards,
-        initial_split_threshold: Some(args.initial_split_threshold),
+        initial_split_threshold: args.initial_split_threshold,
         initial_split_shards: args.initial_split_shards,
         neon_local_repo_dir: args.neon_local_repo_dir,
         max_secondary_lag_bytes: args.max_secondary_lag_bytes,

From cb19e4e05dfa7cf95b5ff7a04ad5b1d1f1a891ec Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 2 Apr 2025 16:21:58 +0100
Subject: [PATCH 020/140] pageserver: remove legacy
 TimelineInfo::latest_gc_cutoff field (2/2) (#11136)

## Problem

This field was retained for backward compat only in #10707.

Once https://github.com/neondatabase/cloud/pull/25233 is released,
nothing will be reading this field.

Related: https://github.com/neondatabase/cloud/issues/24250

## Summary of changes

- Remove TimelineInfo::latest_gc_cutoff_lsn
---
 libs/pageserver_api/src/models.rs | 5 -----
 pageserver/src/http/routes.rs     | 4 +++-
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 0a7c9717ca..f2dd3a0ebf 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1418,11 +1418,6 @@ pub struct TimelineInfo {
     pub last_record_lsn: Lsn,
     pub prev_record_lsn: Option<Lsn>,
 
-    /// Legacy field, retained for one version to enable old storage controller to
-    /// decode (it was a mandatory field).
-    #[serde(default, rename = "latest_gc_cutoff_lsn")]
-    pub _unused: Lsn,
-
     /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
     /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
     /// as it is easier to reason about.
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 3f36ff9904..3a3b4202dd 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -445,6 +445,9 @@ async fn build_timeline_info_common(
 
     let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();
 
+    // Externally, expose the lowest LSN that can be used to create a branch.
+    // Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we
+    // actually trimmed data to), which can pass each other when PITR is changed.
     let min_readable_lsn = std::cmp::max(
         timeline.get_gc_cutoff_lsn(),
         *timeline.get_applied_gc_cutoff_lsn(),
@@ -461,7 +464,6 @@ async fn build_timeline_info_common(
         initdb_lsn,
         last_record_lsn,
         prev_record_lsn: Some(timeline.get_prev_record_lsn()),
-        _unused: Default::default(), // Unused, for legacy decode only
         min_readable_lsn,
         applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(),
         current_logical_size: current_logical_size.size_dont_care_about_accuracy(),

From dd1299f33799974df9deb66edb88193289b4b731 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 2 Apr 2025 13:11:49 -0400
Subject: [PATCH 021/140] feat(storcon): passthrough mark invisible and add
 tests (#11401)

## Problem

close https://github.com/neondatabase/neon/issues/11279

## Summary of changes

* Allow passthrough of other methods in tenant timeline shard0
passthrough of storcon.
* Passthrough mark invisible API in storcon.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/client/src/mgmt_api.rs       | 10 +++++-----
 pageserver/src/http/routes.rs           | 18 ++++++++++++++----
 pageserver/src/tenant/timeline.rs       |  6 ++++++
 storage_controller/src/http.rs          | 22 +++++++++++++++++++++-
 test_runner/fixtures/pageserver/http.py | 19 +++++++++++++++++++
 test_runner/regress/test_tenant_size.py | 15 +++++++++++++++
 6 files changed, 80 insertions(+), 10 deletions(-)

diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 224208034b..e0cd19817d 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -86,17 +86,17 @@ impl Client {
         resp.json().await.map_err(Error::ReceiveBody)
     }
 
-    /// Get an arbitrary path and returning a streaming Response.  This function is suitable
-    /// for pass-through/proxy use cases where we don't care what the response content looks
-    /// like.
+    /// Send an HTTP request to an arbitrary path with a desired HTTP method and returning a streaming
+    /// Response.  This function is suitable for pass-through/proxy use cases where we don't care
+    /// what the response content looks like.
     ///
     /// Use/add one of the properly typed methods below if you know aren't proxying, and
     /// know what kind of response you expect.
-    pub async fn get_raw(&self, path: String) -> Result<reqwest::Response> {
+    pub async fn op_raw(&self, method: Method, path: String) -> Result<reqwest::Response> {
         debug_assert!(path.starts_with('/'));
         let uri = format!("{}{}", self.mgmt_api_endpoint, path);
 
-        let mut req = self.client.request(Method::GET, uri);
+        let mut req = self.client.request(method, uri);
         if let Some(value) = &self.authorization_header {
             req = req.header(reqwest::header::AUTHORIZATION, value);
         }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 3a3b4202dd..adc38e32e8 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -74,8 +74,8 @@ use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
 use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
 use crate::tenant::timeline::{
-    CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout,
-    WaitLsnWaiter, import_pgdata,
+    CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
+    WaitLsnTimeout, WaitLsnWaiter, import_pgdata,
 };
 use crate::tenant::{
     GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError,
@@ -2337,21 +2337,31 @@ async fn timeline_compact_handler(
 }
 
 async fn timeline_mark_invisible_handler(
-    request: Request<Body>,
+    mut request: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
+    let compact_request = json_request_maybe::<Option<MarkInvisibleRequest>>(&mut request).await?;
+
     let state = get_state(&request);
 
+    let visibility = match compact_request {
+        Some(req) => match req.is_visible {
+            Some(true) => TimelineVisibilityState::Visible,
+            Some(false) | None => TimelineVisibilityState::Invisible,
+        },
+        None => TimelineVisibilityState::Invisible,
+    };
+
     async {
         let tenant = state
             .tenant_manager
             .get_attached_tenant_shard(tenant_shard_id)?;
         let timeline = tenant.get_timeline(timeline_id, true)?;
-        timeline.remote_client.schedule_index_upload_for_timeline_invisible_state(TimelineVisibilityState::Invisible).map_err(ApiError::InternalServerError)?;
+        timeline.remote_client.schedule_index_upload_for_timeline_invisible_state(visibility).map_err(ApiError::InternalServerError)?;
         json_response(StatusCode::OK, ())
     }
     .instrument(info_span!("manual_timeline_mark_invisible", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 751a8acb89..d21a8752a8 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -895,6 +895,12 @@ pub(crate) struct CompactRequest {
     pub sub_compaction_max_job_size_mb: Option<u64>,
 }
 
+#[derive(Debug, Clone, serde::Deserialize)]
+pub(crate) struct MarkInvisibleRequest {
+    #[serde(default)]
+    pub is_visible: Option<bool>,
+}
+
 #[derive(Debug, Clone, Default)]
 pub(crate) struct CompactOptions {
     pub flags: EnumSet<CompactFlags>,
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 79332ea304..8ec8bccf2c 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -639,6 +639,15 @@ async fn handle_tenant_timeline_passthrough(
         return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
     };
 
+    let method = match *req.method() {
+        hyper::Method::GET => reqwest::Method::GET,
+        hyper::Method::POST => reqwest::Method::POST,
+        hyper::Method::PUT => reqwest::Method::PUT,
+        hyper::Method::DELETE => reqwest::Method::DELETE,
+        hyper::Method::PATCH => reqwest::Method::PATCH,
+        _ => return Err(ApiError::BadRequest(anyhow::anyhow!("Unsupported method"))),
+    };
+
     tracing::info!(
         "Proxying request for tenant {} ({})",
         tenant_or_shard_id.tenant_id,
@@ -686,7 +695,7 @@ async fn handle_tenant_timeline_passthrough(
         node.base_url(),
         service.get_config().pageserver_jwt_token.as_deref(),
     );
-    let resp = client.get_raw(path).await.map_err(|e|
+    let resp = client.op_raw(method, path).await.map_err(|e|
         // We return 503 here because if we can't successfully send a request to the pageserver,
         // either we aren't available or the pageserver is unavailable.
         ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?;
@@ -2247,6 +2256,17 @@ pub fn make_router(
                 RequestName("v1_tenant_passthrough"),
             )
         })
+        // Tenant timeline mark_invisible passthrough to shard zero
+        .put(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/mark_invisible",
+            |r| {
+                tenant_service_handler(
+                    r,
+                    handle_tenant_timeline_passthrough,
+                    RequestName("v1_tenant_timeline_mark_invisible_passthrough"),
+                )
+            },
+        )
 }
 
 #[cfg(test)]
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 347bceb785..8211da32fe 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -853,6 +853,25 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res_json = res.json()
         return res_json
 
+    def timeline_mark_invisible(
+        self,
+        tenant_id: TenantId | TenantShardId,
+        timeline_id: TimelineId,
+        is_visible: bool | None = None,
+    ):
+        data = {
+            "is_visible": is_visible,
+        }
+
+        log.info(
+            f"Requesting marking timeline invisible for {is_visible=}, {tenant_id=}, {timeline_id=}"
+        )
+        res = self.put(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/mark_invisible",
+            json=data,
+        )
+        self.verbose_error(res)
+
     def timeline_get_timestamp_of_lsn(
         self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
     ):
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index a9df5f2d49..0cb22905b0 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -782,6 +782,21 @@ def test_lsn_lease_storcon(neon_env_builder: NeonEnvBuilder):
     )
 
 
+def test_mark_invisible_storcon(neon_env_builder: NeonEnvBuilder):
+    conf = {
+        "pitr_interval": "0s",
+        "gc_period": "0s",
+        "compaction_period": "0s",
+    }
+    env = neon_env_builder.init_start(initial_tenant_conf=conf)
+    env.storage_controller.pageserver_api().timeline_mark_invisible(
+        env.initial_tenant, env.initial_timeline
+    )
+    env.storage_controller.pageserver_api().timeline_mark_invisible(
+        env.initial_tenant, env.initial_timeline, True
+    )
+
+
 def insert_with_action(
     env: NeonEnv,
     tenant: TenantId,

From e3d27b2f68585121ff5a79902cfe68af448f33c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 2 Apr 2025 20:36:50 +0200
Subject: [PATCH 022/140] Start safekeeper node IDs with 0 and forbid 0 from
 registering (#11419)

Right now we start safekeeper node ids at 0. However, other code treats
0 as invalid (see #11407). We decided on latter. Therefore, make the
register python tests register safekeepers starting at node id 1 instead
of 0, and forbid safekeepers with id 0 from registering.

Context:
https://github.com/neondatabase/neon/pull/11407#discussion_r2024852328
---
 storage_controller/src/http.rs        | 6 ++++++
 test_runner/fixtures/neon_fixtures.py | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 8ec8bccf2c..4035a15316 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1416,6 +1416,12 @@ async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Bod
         )));
     }
 
+    if id <= 0 {
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "id not allowed to be zero or negative: {id}"
+        )));
+    }
+
     let req = match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
             return res;
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 34a841f59f..7931a0a7d0 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1344,6 +1344,8 @@ class NeonEnv:
             and self.storage_controller_config.get("timelines_onto_safekeepers") is True
         ):
             for sk_id, sk in enumerate(self.safekeepers):
+                # 0 is an invalid safekeeper id
+                sk_id = sk_id + 1
                 body = {
                     "id": sk_id,
                     "created_at": "2023-10-25T09:11:25Z",

From 03ae57236f23d0bca801d0681cac308a4f0e6e98 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 2 Apr 2025 21:55:08 +0200
Subject: [PATCH 023/140] docs: add compaction notes (#11415)

Lifted from
https://www.notion.so/neondatabase/Rough-Notes-on-Compaction-1baf189e004780859e65ef63b85cfa81?pvs=4.
---
 docs/SUMMARY.md               |   1 +
 docs/pageserver-compaction.md | 110 ++++++++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+)
 create mode 100644 docs/pageserver-compaction.md

diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
index 5fd4080c28..a6e2ac0f34 100644
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -21,6 +21,7 @@ in this repository.
     - [WAL Redo](./pageserver-walredo.md)
     - [Page cache](./pageserver-pagecache.md)
     - [Storage](./pageserver-storage.md)
+    - [Compaction](./pageserver-compaction.md)
     - [Processing a GetPage request](./pageserver-processing-getpage.md)
     - [Processing WAL](./pageserver-processing-wal.md)
 
diff --git a/docs/pageserver-compaction.md b/docs/pageserver-compaction.md
new file mode 100644
index 0000000000..6cacb10c9c
--- /dev/null
+++ b/docs/pageserver-compaction.md
@@ -0,0 +1,110 @@
+# Pageserver Compaction
+
+Lifted from <https://www.notion.so/neondatabase/Rough-Notes-on-Compaction-1baf189e004780859e65ef63b85cfa81?pvs=4>.
+
+Updated 2025-03-26.
+
+## Pages and WAL
+
+Postgres stores data in 8 KB pages, identified by a page number.
+
+The WAL contains a sequence of page writes: either images (complete page contents) or deltas (patches applied to images). Each write is identified by its byte position in the WAL, aka LSN. 
+
+Each page version is thus identified by page@LSN. Postgres may read pages at past LSNs.
+
+Pageservers ingest WAL by writing WAL records into a key/value store keyed by page@LSN.
+
+Pageservers materialize pages for Postgres reads by finding the most recent page image and applying all subsequent page deltas, up to the read LSN.
+
+## Compaction: Why?
+
+Pageservers store page@LSN keys in a key/value store using a custom variant of an LSM tree. Each timeline on each tenant shard has its own LSM tree.
+
+When Pageservers write new page@LSN entries, they are appended unordered to an ephemeral layer file. When the ephemeral layer file exceeds `checkpoint_distance` (default 256 MB), the key/value pairs are sorted by key and written out to a layer file (for efficient lookups).
+
+As WAL writes continue, more layer files accumulate.
+
+Reads must search through the layer files to find the page’s image and deltas. The more layer files accumulate, the more la yer files reads must search through before they find a page image, aka read amplification.
+
+Compaction’s job is to:
+
+- Reduce read amplification by reorganizing and combining layer files.
+- Remove old garbage from layer files.
+
+As part of this, it may combine several page deltas into a single page image where possible.
+
+## Compaction: How?
+
+Neon uses a non-standard variant of an LSM tree made up of two levels of layer files: L0 and L1.
+
+Compaction runs in two phases: L0→L1 compaction, and L1 image compaction.
+
+L0 contains a stack of L0 layers at decreasing LSN ranges. These have been flushed sequentially from ephemeral layers. Each L0 layer covers the entire page space (page 0 to ~infinity) and the LSN range that was ingested into it. L0 layers are therefore particularly bad for read amp, since every read must search all L0 layers below the read LSN. For example:
+
+```
+| Page 0-99 @ LSN 0400-04ff |
+| Page 0-99 @ LSN 0300-03ff |
+| Page 0-99 @ LSN 0200-02ff |
+| Page 0-99 @ LSN 0100-01ff |
+| Page 0-99 @ LSN 0000-00ff |
+```
+
+L0→L1 compaction takes the bottom-most chunk of L0 layer files of between `compaction_threshold` (default 10) and `compaction_upper_limit` (default 20) layers. It uses merge-sort to write out sorted L1 delta layers of size `compaction_target_size` (default 128 MB).
+
+L1 typically consists of a “bed” of image layers with materialized page images at a specific LSN, and then delta layers of various page/LSN ranges above them with page deltas. For example:
+
+```
+Delta layers:               |     30-84@0310-04ff      |
+Delta layers:    | 10-42@0200-02ff |           | 65-92@0174-02aa |
+Image layers: |    0-39@0100    |    40-79@0100    |    80-99@0100    |
+```
+
+L1 image compaction scans across the L1 keyspace at some LSN, materializes page images by reading the image and delta layers below the LSN (via vectored reads), and writes out new sorted image layers of roughly size `compaction_target_size` (default 128 MB) at that LSN.
+
+Layer files below the new image files’ LSN can be garbage collected when they are no longer needed for PITR.
+
+Even though the old layer files are not immediately garbage collected, the new image layers help with read amp because reads can stop traversing the layer stack as soon as they encounter a page image.
+
+## Compaction: When?
+
+Pageservers run a `compaction_loop` background task for each tenant shard. Every `compaction_period` (default 20 seconds) it will wake up and check if any of the shard’s timelines need compaction. Additionally, L0 layer flushes will eagerly wake the compaction loop if the L0 count exceeds `compaction_threshold` (default 10).
+
+L0 compaction runs if the number of L0 layers exceeds `compaction_threshold` (default 10).
+
+L1 image compaction runs across sections of the L1 keyspace that have at least `image_creation_threshold` (default 3) delta layers overlapping image layers.
+
+At most `CONCURRENT_BACKGROUND_TASKS` (default 3 / 4 * CPUs = 6) background tasks can run concurrently on a Pageserver, including compaction. Further compaction tasks must wait.
+
+Because L0 layers cause the most read amp (they overlap the entire keyspace and only contain page deltas), they are aggressively compacted down:
+
+- L0 is compacted down across all tenant timelines before L1 compaction is attempted (`compaction_l0_first`).
+- L0 compaction uses a separate concurrency limit of `CONCURRENT_L0_COMPACTION_TASKS` (default 3 / 4 * CPUs = 6) to avoid waiting for other tasks (`compaction_l0_semaphore`).
+- If L0 compaction is needed on any tenant timeline, L1 image compaction will yield to start an immediate L0 compaction run (except for compaction run via admin APIs).
+
+## Backpressure
+
+With sustained heavy write loads, new L0 layers may be flushed faster than they can be compacted down. This can cause an unbounded buildup of read amplification and compaction debt, which can take hours to resolve even after the writes stop.
+
+To avoid this and allow compaction to keep up, layer flushes will slow writes down to apply backpressure on the workload:
+
+- At `l0_flush_delay_threshold` (default 30) L0 layers, layer flushes are delayed by the flush duration, such that they take 2x as long.
+- At `l0_flush_stall_threshold` (default disabled) L0 layers, layer flushes stall entirely until the L0 count falls back below the threshold. This is currently disabled because we don’t trust L0 compaction to be responsive enough.
+
+This backpressure is propagated to the compute by waiting for layer flushes when WAL ingestion rolls the ephemeral layer. The compute will significantly slow down WAL writes at:
+
+- `max_replication_write_lag` (default 500 MB), when Pageserver WAL ingestion lags
+- `max_replication_flush_lag` (default 10 GB), when Pageserver L0 flushes lag
+
+Combined, this means that the compute will backpressure when there are 30 L0 layers (30 * 256 MB = 7.7 GB) and the Pageserver WAL ingestion lags the compute by 500 MB, for a total of ~8 GB L0+ephemeral compaction debt on a single shard.
+
+Since we only delay L0 flushes by 2x when backpressuring, and haven’t enabled stalls, it is still possible for read amp to increase unbounded if compaction is too slow (although we haven’t seen this in practice). But this is considered better than stalling flushes and causing unavailability for as long as it takes L0 compaction to react, since we don’t trust it to be fast enough — at the expense of continually increasing read latency and CPU usage for this tenant. We should either enable stalls when we have enough confidence in L0 compaction, or scale the flush delay by the number of L0 layers to apply increasing backpressure.
+
+## Circuit Breaker
+
+Compaction can fail, often repeatedly. This can happen e.g. due to data corruption, faulty hardware, S3 outages, etc.
+
+If compaction fails, the compaction loop will naïvely try and fail again almost immediately. It may only fail after doing a significant amount of wasted work, while holding onto the background task semaphore.
+
+To avoid repeatedly doing wasted work and starving out other compaction jobs, each tenant has a compaction circuit breaker. After 5 repeated compaction failures, the circuit breaker trips and disables compaction for the next 24 hours, before resetting the breaker and trying again. This disables compaction across all tenant timelines (faulty or not).
+
+Disabling compaction for a long time is dangerous, since it can lead to unbounded read amp and compaction debt, and continuous workload backpressure. However, continually failing would not help either. Tripped circuit breakers trigger an alert and must be investigated promptly.
\ No newline at end of file

From 17193d6a33838eada634e0b45fc819d3254ead7a Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 2 Apr 2025 22:11:39 +0200
Subject: [PATCH 024/140] test_runner: fix pagebench tenant configs (#11420)

## Problem

Pagebench creates a bunch of tenants by first creating a template tenant
and copying its remote storage, then attaching the copies to the
Pageserver.

These tenants had custom configurations to disable GC and compaction.
However, these configs were only picked up by the Pageserver on attach,
and not registered with the storage controller. This caused the storage
controller to replace the tenant configs with the default tenant config,
re-enabling GC and compaction which interferes with benchmark
performance.

Resolves #11381.

## Summary of changes

Register the copied tenants with the storage controller, instead of
directly attaching them to the Pageserver.
---
 test_runner/fixtures/pageserver/many_tenants.py | 14 ++++++++------
 ...rver_max_throughput_getpage_at_latest_lsn.py | 17 +++--------------
 2 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index 4b066d6cf3..eedb693e3d 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -43,7 +43,7 @@ def single_timeline(
         f"template tenant is template_tenant={template_tenant} template_timeline={template_timeline}"
     )
 
-    log.info("detach template tenant form pageserver")
+    log.info("detach template tenant from pageserver")
     env.pageserver.tenant_detach(template_tenant)
 
     log.info(f"duplicating template tenant {ncopies} times in remote storage")
@@ -65,11 +65,13 @@ def single_timeline(
     assert ps_http.tenant_list() == []
 
     def attach(tenant):
-        env.pageserver.tenant_attach(
-            tenant,
-            config=template_config.copy(),
-            generation=100,
-            override_storage_controller_generation=True,
+        # NB: create the new tenant in the storage controller with the correct tenant config. This
+        # will pick up the existing tenant data from remote storage. If we just attach it to the
+        # Pageserver, the storage controller will reset the tenant config to the default.
+        env.create_tenant(
+            tenant_id=tenant,
+            timeline_id=template_timeline,
+            conf=template_config,
         )
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
index 3dbbb197f4..6cbbad4bd9 100644
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -15,9 +15,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.utils import get_scale_for_db, humantime_to_ms, skip_on_ci
 
-from performance.pageserver.util import (
-    setup_pageserver_with_tenants,
-)
+from performance.pageserver.util import setup_pageserver_with_tenants
 
 if TYPE_CHECKING:
     from typing import Any
@@ -126,14 +124,11 @@ def setup_and_run_pagebench_benchmark(
     for param, (value, kwargs) in params.items():
         record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
 
-    def setup_wrapper(env: NeonEnv):
-        return setup_tenant_template(env, pg_bin, pgbench_scale)
-
     env = setup_pageserver_with_tenants(
         neon_env_builder,
         f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}",
         n_tenants,
-        setup_wrapper,
+        lambda env: setup_tenant_template(env, pg_bin, pgbench_scale),
         # https://github.com/neondatabase/neon/issues/8070
         timeout_in_seconds=60,
     )
@@ -160,14 +155,8 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
         "gc_period": "0s",  # disable periodic gc
         "checkpoint_timeout": "10 years",
         "compaction_period": "0s",  # disable periodic compaction
-        "compaction_threshold": 10,
-        "compaction_target_size": 134217728,
-        "checkpoint_distance": 268435456,
-        "image_creation_threshold": 3,
     }
-    template_tenant, template_timeline = env.create_tenant(set_default=True)
-    env.pageserver.tenant_detach(template_tenant)
-    env.pageserver.tenant_attach(template_tenant, config)
+    template_tenant, template_timeline = env.create_tenant(set_default=True, conf=config)
     ps_http = env.pageserver.http_client()
     with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
         pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()])

From 7602e6ffc0fb39d7b29c0c196ecabbee7f9d757e Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 2 Apr 2025 19:00:28 -0500
Subject: [PATCH 025/140] Skip compute_ctl authorization checks in testing
 builds (#11186)

We will require authorization in production. We need to skip in testing
builds for now because regression tests would fail. See
https://github.com/neondatabase/neon/issues/11316 for more information.

Signed-off-by: Tristan Partin <tristan@neon.tech>

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute_tools/src/http/middleware/authorize.rs | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs
index 798dd1179b..89d55e1af3 100644
--- a/compute_tools/src/http/middleware/authorize.rs
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -59,9 +59,12 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
         Box::pin(async move {
             let request_id = request.extract_parts::<RequestId>().await.unwrap();
 
-            // TODO: Remove this check after a successful rollout
-            if jwks.keys.is_empty() {
-                warn!(%request_id, "Authorization has not been configured");
+            // TODO: Remove this stanza after teaching neon_local and the
+            // regression tests to use a JWT + JWKS.
+            //
+            // https://github.com/neondatabase/neon/issues/11316
+            if cfg!(feature = "testing") {
+                warn!(%request_id, "Skipping compute_ctl authorization check");
 
                 return Ok(request);
             }
@@ -110,8 +113,6 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
 impl Authorize {
     /// Verify the token using the JSON Web Key set and return the token data.
     fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
-        debug_assert!(!jwks.keys.is_empty());
-
         for jwk in jwks.keys.iter() {
             let decoding_key = match DecodingKey::from_jwk(jwk) {
                 Ok(key) => key,

From 64a8d0c2e6a6d401132021cdf2e1a1d58dd2f7a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= <jc@neon.tech>
Date: Thu, 3 Apr 2025 10:59:42 +0200
Subject: [PATCH 026/140] impr(ci): retry container image pushing and send
 slack messages for failures (#11416)

## Problem
We've seen quite a few CI failures related to pushes to docker hub
failing with weird error messages that indicate maybe docker hub is just
not reliable.

## Summary of changes
Retry container image pushing up to 10 times, and send a slack message
if we had to retry, regardless of the job succeeding or not.
---
 .github/scripts/push_with_image_map.py        | 31 ++++++++++++++-----
 .../workflows/_push-to-container-registry.yml | 12 +++++++
 2 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/.github/scripts/push_with_image_map.py b/.github/scripts/push_with_image_map.py
index c68f6ad407..53f83379ae 100644
--- a/.github/scripts/push_with_image_map.py
+++ b/.github/scripts/push_with_image_map.py
@@ -11,12 +11,27 @@ try:
 except json.JSONDecodeError as e:
     raise ValueError("Failed to parse IMAGE_MAP as JSON") from e
 
-for source, targets in parsed_image_map.items():
-    for target in targets:
-        cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source]
-        print(f"Running: {' '.join(cmd)}")
-        result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+failures = []
 
-        if result.returncode != 0:
-            print(f"Error: {result.stdout}")
-            raise RuntimeError(f"Command failed: {' '.join(cmd)}")
+pending = [(source, target) for source, targets in parsed_image_map.items() for target in targets]
+
+while len(pending) > 0:
+    if len(failures) > 10:
+        print("Error: more than 10 failures!")
+        for failure in failures:
+            print(f'"{failure[0]}" failed with the following output:')
+            print(failure[1])
+        raise RuntimeError("Retry limit reached.")
+
+    source, target = pending.pop(0)
+    cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source]
+    print(f"Running: {' '.join(cmd)}")
+    result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    if result.returncode != 0:
+        failures.append((" ".join(cmd), result.stdout))
+        pending.append((source, target))
+
+if len(failures) > 0 and (github_output := os.getenv("GITHUB_OUTPUT")):
+    with open(github_output, "a") as f:
+        f.write("slack_notify=true\n")
diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml
index 9a7da612d4..9b3ad0fdbb 100644
--- a/.github/workflows/_push-to-container-registry.yml
+++ b/.github/workflows/_push-to-container-registry.yml
@@ -104,6 +104,18 @@ jobs:
           password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
 
       - name: Copy docker images to target registries
+        id: push
         run: python3 .github/scripts/push_with_image_map.py
         env:
           IMAGE_MAP: ${{ inputs.image-map }}
+
+      - name: Notify Slack if container image pushing fails
+        if: steps.push.outputs.slack_notify == 'true' || failure()
+        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
+        with:
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          payload: |
+            channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }}
+            text: |
+              Pushing container images failed in <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>

From 4e8e0951be4b24a31bfa4a95fca11c8a6e4e7851 Mon Sep 17 00:00:00 2001
From: Alexander Lakhin <exclusion@gmail.com>
Date: Thu, 3 Apr 2025 14:23:30 +0300
Subject: [PATCH 027/140] Increase timeout for
 test_pageserver_gc_compaction_smoke (#11410)

## Problem
The test_pageserver_gc_compaction_smoke fails rather often due to a
timeout on slow machines.
See https://github.com/neondatabase/neon/issues/11355.

## Summary of changes
Increase the timeout for the test.
---
 test_runner/regress/test_compaction.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index e5f5b80d2d..6789939e0c 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -166,6 +166,7 @@ def test_pageserver_compaction_preempt(
 
 
 @skip_in_debug_build("only run with release build")
+@pytest.mark.timeout(900)  # This test is slow with sanitizers enabled, especially on ARM
 @pytest.mark.parametrize(
     "with_branches",
     ["with_branches", "no_branches"],

From 5e507776bcabed6fe6fb61ff74c1a8afb42dd021 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 3 Apr 2025 12:29:53 +0100
Subject: [PATCH 028/140] Compute: update plv8 patch (#11426)

## Problem

https://github.com/neondatabase/cloud/issues/26866

## Summary of changes
- Update plv8 patch

Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
---
 compute/compute-node.Dockerfile               |  4 +--
 .../{plv8-3.1.10.patch => plv8_v3.1.10.patch} | 25 ++++++++++++-------
 compute/patches/plv8_v3.2.3.patch             | 13 ++++++++++
 3 files changed, 31 insertions(+), 11 deletions(-)
 rename compute/patches/{plv8-3.1.10.patch => plv8_v3.1.10.patch} (80%)
 create mode 100644 compute/patches/plv8_v3.2.3.patch

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 1fe2570e87..a53b380b3f 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -369,7 +369,7 @@ FROM build-deps AS plv8-src
 ARG PG_VERSION
 WORKDIR /ext-src
 
-COPY compute/patches/plv8-3.1.10.patch .
+COPY compute/patches/plv8* .
 
 # plv8 3.2.3 supports v17
 # last release v3.2.3 - Sep 7, 2024
@@ -393,7 +393,7 @@ RUN case "${PG_VERSION:?}" in \
     git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \
     tar -czf plv8.tar.gz --exclude .git plv8-src && \
     cd plv8-src && \
-    if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi
+    if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8_v3.1.10.patch; else patch -p1 < /ext-src/plv8_v3.2.3.patch; fi
 
 # Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use
 # 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds.
diff --git a/compute/patches/plv8-3.1.10.patch b/compute/patches/plv8_v3.1.10.patch
similarity index 80%
rename from compute/patches/plv8-3.1.10.patch
rename to compute/patches/plv8_v3.1.10.patch
index 43cdb479f7..5cf96426d0 100644
--- a/compute/patches/plv8-3.1.10.patch
+++ b/compute/patches/plv8_v3.1.10.patch
@@ -1,12 +1,6 @@
-commit 46b38d3e46f9cd6c70d9b189dd6ff4abaa17cf5e
-Author: Alexander Bayandin <alexander@neon.tech>
-Date:   Sat Nov 30 18:29:32 2024 +0000
-
-    Fix v8 9.7.37 compilation on Debian 12
-
 diff --git a/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch
 new file mode 100644
-index 0000000..f0a5dc7
+index 0000000..fae1cb3
 --- /dev/null
 +++ b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch
 @@ -0,0 +1,30 @@
@@ -35,8 +29,21 @@ index 0000000..f0a5dc7
 +@@ -5,6 +5,7 @@
 + #ifndef V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_
 + #define V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_
-+ 
++
 ++#include <utility>
 + #include <vector>
-+ 
++
 + #include "include/cppgc/prefinalizer.h"
+diff --git a/plv8.cc b/plv8.cc
+index c1ce883..6e47e94 100644
+--- a/plv8.cc
++++ b/plv8.cc
+@@ -379,7 +379,7 @@ _PG_init(void)
+ 							   NULL,
+ 							   &plv8_v8_flags,
+ 							   NULL,
+-							   PGC_USERSET, 0,
++							   PGC_SUSET, 0,
+ #if PG_VERSION_NUM >= 90100
+ 							   NULL,
+ #endif
diff --git a/compute/patches/plv8_v3.2.3.patch b/compute/patches/plv8_v3.2.3.patch
new file mode 100644
index 0000000000..5cf4ae2fa2
--- /dev/null
+++ b/compute/patches/plv8_v3.2.3.patch
@@ -0,0 +1,13 @@
+diff --git a/plv8.cc b/plv8.cc
+index edfa2aa..623e7f2 100644
+--- a/plv8.cc
++++ b/plv8.cc
+@@ -385,7 +385,7 @@ _PG_init(void)
+                                    NULL,
+                                    &plv8_v8_flags,
+                                    NULL,
+-                                   PGC_USERSET, 0,
++                                   PGC_SUSET, 0,
+ #if PG_VERSION_NUM >= 90100
+                                    NULL,
+ #endif

From 374736a4deed9531a8a2a45fcf8fc451c4967a71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 3 Apr 2025 13:58:12 +0200
Subject: [PATCH 029/140] Print remote_addr span for Failed to serve HTTP
 connection error (#11423)

I've encountered this error in #11422. Ideally we'd have the URL as well
to associate it with a tenant, but at this level we only have the remote
addr I guess. Better than nothing.
---
 libs/http-utils/src/server.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libs/http-utils/src/server.rs b/libs/http-utils/src/server.rs
index 33e4915e99..07fd56ac01 100644
--- a/libs/http-utils/src/server.rs
+++ b/libs/http-utils/src/server.rs
@@ -91,14 +91,14 @@ impl Server {
                                         Ok(tls_stream) => tls_stream,
                                         Err(err) => {
                                             if !suppress_io_error(&err) {
-                                                info!("Failed to accept TLS connection: {err:#}");
+                                                info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
                                             }
                                             return;
                                         }
                                     };
                                     if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
                                         if !suppress_hyper_error(&err) {
-                                            info!("Failed to serve HTTPS connection: {err:#}");
+                                            info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
                                         }
                                     }
                                 }
@@ -106,7 +106,7 @@ impl Server {
                                     // Handle HTTP connection.
                                     if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
                                         if !suppress_hyper_error(&err) {
-                                            info!("Failed to serve HTTP connection: {err:#}");
+                                            info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
                                         }
                                     }
                                 }

From 43a7423f7212da3367fc3f7481cf366cb28ffc79 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lu=C3=ADs=20Tavares?=
 <34719043+luist18@users.noreply.github.com>
Date: Thu, 3 Apr 2025 14:01:18 +0100
Subject: [PATCH 030/140] feat: bump pg_session_jwt extension to 0.3.0 (#11399)

## Problem

Bumps https://github.com/neondatabase/pg_session_jwt to the latest
release
[v0.3.0](https://github.com/neondatabase/pg_session_jwt/releases/tag/v0.3.0)
that introduces PostgREST fallback mechanisms.

## Summary of changes

Updates the extension download tar and the extension version in the
proxy constant.

## Subscribers
@mrl5
---
 compute/compute-node.Dockerfile         | 4 ++--
 proxy/src/serverless/local_conn_pool.rs | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index a53b380b3f..417f5ce6da 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1366,8 +1366,8 @@ ARG PG_VERSION
 # Do not update without approve from proxy team
 # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
 WORKDIR /ext-src
-RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
-    echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \
+    echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \
     mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
     sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index c958d077fc..3282c0ebde 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -41,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::Metrics;
 
 pub(crate) const EXT_NAME: &str = "pg_session_jwt";
-pub(crate) const EXT_VERSION: &str = "0.2.0";
+pub(crate) const EXT_VERSION: &str = "0.3.0";
 pub(crate) const EXT_SCHEMA: &str = "auth";
 
 #[derive(Clone)]

From 2e11d129d01ba49315e63bac85493d6d0549ac67 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Thu, 3 Apr 2025 18:18:50 +0400
Subject: [PATCH 031/140] tests: suppress mgm api timeout error in sotrcon
 (#11428)

## Problem
Since
https://github.com/neondatabase/neon/commit/0f367cb6650b7ae088729e1703814628f9eccf5d
the timeout in `with_client_retries` is implemented via `tokio::timeout`
instead of `reqwest::ClientBuilder::timeout` (because we reuse the
client). It changed the error representation if the timeout is exceeded.
Such errors were suppressed in `allowed_errors.py`, but old regexps do
not match the new error.
Discussion:
https://neondb.slack.com/archives/C033RQ5SPDH/p1743533184736319

## Summary of changes
- Add new `Timeout` error to `allowed_errors.py`
---
 test_runner/fixtures/pageserver/allowed_errors.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index c1c5f470cc..27ae5507b1 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -118,6 +118,7 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
     # failing to connect to them.
     ".*Call to node.*management API.*failed.*receive body.*",
     ".*Call to node.*management API.*failed.*ReceiveBody.*",
+    ".*Call to node.*management API.*failed.*Timeout.*",
     ".*Failed to update node .+ after heartbeat round.*error sending request for url.*",
     ".*background_reconcile: failed to fetch top tenants:.*client error \\(Connect\\).*",
     # Many tests will start up with a node offline

From d8cee526377642ba156b4dd3ddcc5ac10530ae91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 3 Apr 2025 16:53:28 +0200
Subject: [PATCH 032/140] Update rust to 1.86.0 (#11431)

We keep the practice of keeping the compiler up to date, pointing to the
latest release. This is done by many other projects in the Rust
ecosystem as well.

[Announcement blog
post](https://blog.rust-lang.org/2025/04/03/Rust-1.86.0.html).

Prior update was in #10914.
---
 build-tools.Dockerfile                                      | 2 +-
 compute_tools/src/spec_apply.rs                             | 2 +-
 libs/remote_storage/tests/test_real_s3.rs                   | 2 +-
 pageserver/src/task_mgr.rs                                  | 3 +--
 pageserver/src/tenant.rs                                    | 2 +-
 pageserver/src/tenant/layer_map/layer_coverage.rs           | 2 +-
 pageserver/src/tenant/mgr.rs                                | 2 +-
 pageserver/src/tenant/remote_timeline_client/index.rs       | 2 +-
 pageserver/src/tenant/timeline.rs                           | 2 +-
 .../owned_buffers_io/aligned_buffer/buffer_mut.rs           | 4 ++--
 .../src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs | 4 ++--
 rust-toolchain.toml                                         | 2 +-
 safekeeper/src/receive_wal.rs                               | 6 +++---
 safekeeper/src/timeline.rs                                  | 2 +-
 safekeeper/tests/misc_test.rs                               | 2 +-
 storage_controller/src/heartbeater.rs                       | 4 ++--
 storage_controller/src/main.rs                              | 4 +---
 storage_controller/src/tenant_shard.rs                      | 2 +-
 storage_scrubber/src/find_large_objects.rs                  | 2 +-
 19 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile
index c103ceaea5..7766991a0a 100644
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -292,7 +292,7 @@ WORKDIR /home/nonroot
 
 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.85.0
+ENV RUSTC_VERSION=1.86.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs
index 2be6458fb4..e7d67f6ac5 100644
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -419,7 +419,7 @@ impl ComputeNode {
                 .iter()
                 .filter_map(|val| val.parse::<usize>().ok())
                 .map(|val| if val > 1 { val - 1 } else { 1 })
-                .last()
+                .next_back()
                 .unwrap_or(3)
         }
     }
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index 6996bb27ae..d38e13fd05 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -558,7 +558,7 @@ async fn upload_large_enough_file(
 ) -> usize {
     let header = bytes::Bytes::from_static("remote blob data content".as_bytes());
     let body = bytes::Bytes::from(vec![0u8; 1024]);
-    let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128));
+    let contents = std::iter::once(header).chain(std::iter::repeat_n(body, 128));
 
     let len = contents.clone().fold(0, |acc, next| acc + next.len());
 
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 9cc604f86d..d4873e60a1 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -219,8 +219,7 @@ pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker");
 pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker");
 pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker");
 // Bump this number when adding a new pageserver_runtime!
-// SAFETY: it's obviously correct
-const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) };
+const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = NonZeroUsize::new(4).unwrap();
 
 #[derive(Debug, Clone, Copy)]
 pub struct PageserverTaskId(u64);
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 15853133d6..0384fcc39f 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3689,7 +3689,7 @@ impl Tenant {
                         }
                     }
                 }
-                TenantState::Active { .. } => {
+                TenantState::Active => {
                     return Ok(());
                 }
                 TenantState::Broken { reason, .. } => {
diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs
index cf0085c071..a42ac92973 100644
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -53,7 +53,7 @@ impl<Value: Clone> LayerCoverage<Value> {
     ///
     /// Complexity: O(log N)
     fn add_node(&mut self, key: i128) {
-        let value = match self.nodes.range(..=key).last() {
+        let value = match self.nodes.range(..=key).next_back() {
             Some((_, Some(v))) => Some(v.clone()),
             Some((_, None)) => None,
             None => None,
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 61ad682a14..ac81b8e3d7 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -58,7 +58,7 @@ use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
 
 /// For a tenant that appears in TenantsMap, it may either be
 /// - `Attached`: has a full Tenant object, is elegible to service
-///    reads and ingest WAL.
+///   reads and ingest WAL.
 /// - `Secondary`: is only keeping a local cache warm.
 ///
 /// Secondary is a totally distinct state rather than being a mode of a `Tenant`, because
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 5635cf3268..a5cd8989aa 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -130,7 +130,7 @@ impl IndexPart {
     /// Version history
     /// - 2: added `deleted_at`
     /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
-    ///      is always generated from the keys of `layer_metadata`)
+    ///   is always generated from the keys of `layer_metadata`)
     /// - 4: timeline_layers is fully removed.
     /// - 5: lineage was added
     /// - 6: last_aux_file_policy is added.
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d21a8752a8..80a23bfa94 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2247,7 +2247,7 @@ impl Timeline {
                         .await
                         .expect("holding a reference to self");
                 }
-                TimelineState::Active { .. } => {
+                TimelineState::Active => {
                     return Ok(());
                 }
                 TimelineState::Broken { .. } | TimelineState::Stopping => {
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
index df5c911e50..3ee1a3c162 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
@@ -25,8 +25,8 @@ impl<const A: usize> AlignedBufferMut<ConstAlign<A>> {
     /// * `align` must be a power of two,
     ///
     /// * `capacity`, when rounded up to the nearest multiple of `align`,
-    ///    must not overflow isize (i.e., the rounded value must be
-    ///    less than or equal to `isize::MAX`).
+    ///   must not overflow isize (i.e., the rounded value must be
+    ///   less than or equal to `isize::MAX`).
     pub fn with_capacity(capacity: usize) -> Self {
         AlignedBufferMut {
             raw: RawAlignedBuffer::with_capacity(capacity),
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
index 97a6c4049a..d273772411 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
@@ -37,8 +37,8 @@ impl<const A: usize> RawAlignedBuffer<ConstAlign<A>> {
     /// * `align` must be a power of two,
     ///
     /// * `capacity`, when rounded up to the nearest multiple of `align`,
-    ///    must not overflow isize (i.e., the rounded value must be
-    ///    less than or equal to `isize::MAX`).
+    ///   must not overflow isize (i.e., the rounded value must be
+    ///   less than or equal to `isize::MAX`).
     pub fn with_capacity(capacity: usize) -> Self {
         let align = ConstAlign::<A>;
         let layout = Layout::from_size_align(capacity, align.align()).expect("Invalid layout");
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 591d60ea79..a0d5970bd5 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.85.0"
+channel = "1.86.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index 7967acde3f..9975153f6c 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -94,10 +94,10 @@ impl WalReceivers {
 
     /// Get reference to locked slot contents. Slot must exist (registered
     /// earlier).
-    fn get_slot<'a>(
-        self: &'a Arc<WalReceivers>,
+    fn get_slot(
+        self: &Arc<WalReceivers>,
         id: WalReceiverId,
-    ) -> MappedMutexGuard<'a, WalReceiverState> {
+    ) -> MappedMutexGuard<'_, WalReceiverState> {
         MutexGuard::map(self.mutex.lock(), |locked| {
             locked.slots[id]
                 .as_mut()
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index d3c841ec09..d9ca58104e 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -699,7 +699,7 @@ impl Timeline {
     }
 
     /// Take a writing mutual exclusive lock on timeline shared_state.
-    pub async fn write_shared_state<'a>(self: &'a Arc<Self>) -> WriteGuardSharedState<'a> {
+    pub async fn write_shared_state(self: &Arc<Self>) -> WriteGuardSharedState<'_> {
         WriteGuardSharedState::new(self.clone(), self.mutex.write().await)
     }
 
diff --git a/safekeeper/tests/misc_test.rs b/safekeeper/tests/misc_test.rs
index 8e54d2bb86..3acf9f72c4 100644
--- a/safekeeper/tests/misc_test.rs
+++ b/safekeeper/tests/misc_test.rs
@@ -116,7 +116,7 @@ fn test_many_tx() -> anyhow::Result<()> {
             }
             None
         })
-        .last()
+        .next_back()
         .unwrap();
 
     let initdb_lsn = 21623024;
diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs
index 524225c14a..732c4ea443 100644
--- a/storage_controller/src/heartbeater.rs
+++ b/storage_controller/src/heartbeater.rs
@@ -253,7 +253,7 @@ impl HeartBeat<Node, PageserverState> for HeartbeaterTask<Node, PageserverState>
                 PageserverState::WarmingUp { .. } => {
                     warming_up += 1;
                 }
-                PageserverState::Offline { .. } => offline += 1,
+                PageserverState::Offline => offline += 1,
                 PageserverState::Available { .. } => {}
             }
         }
@@ -391,7 +391,7 @@ impl HeartBeat<Safekeeper, SafekeeperState> for HeartbeaterTask<Safekeeper, Safe
         let mut offline = 0;
         for state in new_state.values() {
             match state {
-                SafekeeperState::Offline { .. } => offline += 1,
+                SafekeeperState::Offline => offline += 1,
                 SafekeeperState::Available { .. } => {}
             }
         }
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 8c834f9acb..1aa9ae10ae 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -283,10 +283,8 @@ impl Secrets {
     fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
         if let Some(v) = cli {
             Some(v.clone())
-        } else if let Ok(v) = std::env::var(env_name) {
-            Some(v)
         } else {
-            None
+            std::env::var(env_name).ok()
         }
     }
 }
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index f6b748844a..8424c65aba 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -622,7 +622,7 @@ impl TenantShard {
             .collect::<Vec<_>>();
 
         attached_locs.sort_by_key(|i| i.1);
-        if let Some((node_id, _gen)) = attached_locs.into_iter().last() {
+        if let Some((node_id, _gen)) = attached_locs.into_iter().next_back() {
             self.intent.set_attached(scheduler, Some(*node_id));
         }
 
diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs
index efb05fb55e..a4ca68d378 100644
--- a/storage_scrubber/src/find_large_objects.rs
+++ b/storage_scrubber/src/find_large_objects.rs
@@ -18,7 +18,7 @@ enum LargeObjectKind {
 
 impl LargeObjectKind {
     fn from_key(key: &str) -> Self {
-        let fname = key.split('/').last().unwrap();
+        let fname = key.split('/').next_back().unwrap();
 
         let Ok((layer_name, _generation)) = parse_layer_object_name(fname) else {
             return LargeObjectKind::Other;

From 46e046e779f4cb97938668fe62967b2281333cde Mon Sep 17 00:00:00 2001
From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com>
Date: Thu, 3 Apr 2025 10:54:45 -0400
Subject: [PATCH 033/140] Exporting `file_cache_used` to calculate LFC
 utilization (#11384)

## Problem

Exporting `file_cache_used` which specifies the number of used chunks in
the LFC. This helps calculate LFC utilization as: `file_cache_used_pages
/ (file_cache_used * file_cache_chunk_size_pages)`

## Summary of changes

Exporting `file_cache_used`.

Related Issue: https://github.com/neondatabase/cloud/issues/26688
---
 compute/etc/neon_collector.jsonnet                |  1 +
 compute/etc/sql_exporter/lfc_used_pages.libsonnet | 10 ++++++++++
 compute/etc/sql_exporter/lfc_used_pages.sql       |  1 +
 3 files changed, 12 insertions(+)
 create mode 100644 compute/etc/sql_exporter/lfc_used_pages.libsonnet
 create mode 100644 compute/etc/sql_exporter/lfc_used_pages.sql

diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet
index da2b86d542..449e1199d0 100644
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -33,6 +33,7 @@
     import 'sql_exporter/lfc_hits.libsonnet',
     import 'sql_exporter/lfc_misses.libsonnet',
     import 'sql_exporter/lfc_used.libsonnet',
+    import 'sql_exporter/lfc_used_pages.libsonnet',
     import 'sql_exporter/lfc_writes.libsonnet',
     import 'sql_exporter/logical_slot_restart_lsn.libsonnet',
     import 'sql_exporter/max_cluster_size.libsonnet',
diff --git a/compute/etc/sql_exporter/lfc_used_pages.libsonnet b/compute/etc/sql_exporter/lfc_used_pages.libsonnet
new file mode 100644
index 0000000000..1e39a93482
--- /dev/null
+++ b/compute/etc/sql_exporter/lfc_used_pages.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'lfc_used_pages',
+  type: 'gauge',
+  help: 'LFC pages used',
+  key_labels: null,
+  values: [
+    'lfc_used_pages',
+  ],
+  query: importstr 'sql_exporter/lfc_used_pages.sql',
+}
diff --git a/compute/etc/sql_exporter/lfc_used_pages.sql b/compute/etc/sql_exporter/lfc_used_pages.sql
new file mode 100644
index 0000000000..56d92f8514
--- /dev/null
+++ b/compute/etc/sql_exporter/lfc_used_pages.sql
@@ -0,0 +1 @@
+SELECT lfc_value AS lfc_used_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used_pages';

From 3c781334772578558e8a75eba2b03fd0d0da1a34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= <jc@neon.tech>
Date: Thu, 3 Apr 2025 16:57:44 +0200
Subject: [PATCH 034/140]     feat(ci): add 'released' tag to container images
 from release runs (#11425)

## Problem
We had a problem with https://github.com/neondatabase/neon/pull/11413
having e2e tests failing, because an e2e test
(https://github.com/neondatabase/cloud/commit/8d271bed47498c83b35ff9ace9f7938e6e0f19f3)
depended on an unreleased pageserver fix
(https://github.com/neondatabase/neon/commit/0ee5bfa2fc01372737fe0e108ae40b3e6f5801a7).
This came up because neon release CI runs against the most recent
releases of the other components, but cloud e2e tests run against
latest, which is tagged from main.

## Summary of changes
Add an additional `released` tag for released versions.

## Alternative to consider
We could (and maybe should) instead switch to `latest` being used for
released versions and `main` being used where we use `latest` right now.
That'd also mean we don't have to adjust the CI in the cloud repo.
---
 .github/scripts/generate_image_maps.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/.github/scripts/generate_image_maps.py b/.github/scripts/generate_image_maps.py
index d8f910271b..d3ec048409 100644
--- a/.github/scripts/generate_image_maps.py
+++ b/.github/scripts/generate_image_maps.py
@@ -39,12 +39,18 @@ registries = {
     ],
 }
 
+release_branches = ["release", "release-proxy", "release-compute"]
+
 outputs: dict[str, dict[str, list[str]]] = {}
 
-target_tags = [target_tag, "latest"] if branch == "main" else [target_tag]
-target_stages = (
-    ["dev", "prod"] if branch in ["release", "release-proxy", "release-compute"] else ["dev"]
+target_tags = (
+    [target_tag, "latest"]
+    if branch == "main"
+    else [target_tag, "released"]
+    if branch in release_branches
+    else [target_tag]
 )
+target_stages = ["dev", "prod"] if branch in release_branches else ["dev"]
 
 for component_name, component_images in components.items():
     for stage in target_stages:

From 581bb5d7d5bc8e8e668be0838b1b6035c6fc9bf0 Mon Sep 17 00:00:00 2001
From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com>
Date: Thu, 3 Apr 2025 11:26:35 -0400
Subject: [PATCH 035/140] removed pg_anon setup from compute dockerfile
 (#10960)

## Problem

Removing the `anon` v1 extension in postgres as described in
https://github.com/neondatabase/cloud/issues/22663. This extension is
not built for postgres v17 and is out of date when compared to the
upstream variant which is v2 (we have v1.4).

## Summary of changes

Removed the `anon` v1 extension from the compute docker image

Related to https://github.com/neondatabase/cloud/issues/22663
---
 compute/compute-node.Dockerfile | 29 -----------------------------
 1 file changed, 29 deletions(-)

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 417f5ce6da..e3732e1ed2 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1055,34 +1055,6 @@ RUN  if [ -d pg_embedding-src ]; then \
         make -j $(getconf _NPROCESSORS_ONLN) install; \
     fi
 
-#########################################################################################
-#
-# Layer "pg_anon-build"
-# compile anon extension
-#
-#########################################################################################
-FROM build-deps AS pg_anon-src
-ARG PG_VERSION
-
-# This is an experimental extension, never got to real production.
-# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
-WORKDIR /ext-src
-RUN case "${PG_VERSION:?}" in "v17") \
-    echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
-    esac && \
-    wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
-    echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
-    mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C .
-
-FROM pg-build AS pg_anon-build
-COPY --from=pg_anon-src /ext-src/ /ext-src/
-WORKDIR /ext-src
-RUN if [ -d pg_anon-src ]; then \
-        cd pg_anon-src && \
-        make -j $(getconf _NPROCESSORS_ONLN) install && \
-        echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; \
-    fi
-
 #########################################################################################
 #
 # Layer "pg build with nonroot user and cargo installed"
@@ -1677,7 +1649,6 @@ COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_embedding-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql
-COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/

From 131b32ef48ae4eb1a64b9a066fa0e29346c84f42 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 3 Apr 2025 11:55:22 -0400
Subject: [PATCH 036/140] fix(pageserver): clean up aux files before detaching
 (#11299)

## Problem

Related to https://github.com/neondatabase/cloud/issues/26091 and
https://github.com/neondatabase/cloud/issues/25840

Close https://github.com/neondatabase/neon/issues/11297

Discussion on Slack:
https://neondb.slack.com/archives/C033RQ5SPDH/p1742320666313969

## Summary of changes

* When detaching, scan all aux files within
`sparse_non_inherited_keyspace` in the ancestor timeline and create an
image layer exactly at the ancestor LSN. All scanned keys will map to an
empty value, which is a delete tombstone.
- Note that end_lsn for rewritten delta layers = ancestor_lsn + 1, so
the image layer will have image_end_lsn=end_lsn. With the current
`select_layer` logic, the read path will always first read the image
layer.
* Add a test case.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 pageserver/src/http/routes.rs                 |   3 +-
 .../src/tenant/timeline/detach_ancestor.rs    | 105 +++++++++++++++++-
 test_runner/fixtures/pageserver/http.py       |  25 +++++
 .../regress/test_timeline_detach_ancestor.py  |  81 ++++++++++++++
 4 files changed, 210 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index adc38e32e8..8dcb654e59 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3188,7 +3188,8 @@ async fn list_aux_files(
         timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
     );
 
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);
     let files = timeline
         .list_aux_files(body.lsn, &ctx, io_concurrency)
         .await?;
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index ca1d81c691..1b0d22dc82 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -2,10 +2,14 @@ use std::collections::HashSet;
 use std::sync::Arc;
 
 use anyhow::Context;
+use bytes::Bytes;
 use http_utils::error::ApiError;
+use pageserver_api::key::Key;
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::DetachBehavior;
 use pageserver_api::models::detach_ancestor::AncestorDetached;
 use pageserver_api::shard::ShardIdentity;
+use pageserver_compaction::helpers::overlaps_with;
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
@@ -22,7 +26,10 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::Tenant;
 use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor;
 use crate::tenant::storage_layer::layer::local_layer_path;
-use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer};
+use crate::tenant::storage_layer::{
+    AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer,
+    ValuesReconstructState,
+};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 
 #[derive(Debug, thiserror::Error)]
@@ -170,6 +177,92 @@ impl Attempt {
     }
 }
 
+async fn generate_tombstone_image_layer(
+    detached: &Arc<Timeline>,
+    ancestor: &Arc<Timeline>,
+    ancestor_lsn: Lsn,
+    ctx: &RequestContext,
+) -> Result<Option<ResidentLayer>, Error> {
+    tracing::info!(
+        "removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
+    );
+    let io_concurrency = IoConcurrency::spawn_from_conf(
+        detached.conf,
+        detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
+    );
+    let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
+    // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
+    // not contain too many keys, otherwise this takes a lot of memory. Currently we limit it to 10k keys in the compute.
+    let key_range = Key::sparse_non_inherited_keyspace();
+    // avoid generating a "future layer" which will then be removed
+    let image_lsn = ancestor_lsn;
+
+    {
+        let layers = detached.layers.read().await;
+        for layer in layers.all_persistent_layers() {
+            if !layer.is_delta
+                && layer.lsn_range.start == image_lsn
+                && overlaps_with(&key_range, &layer.key_range)
+            {
+                tracing::warn!(
+                    layer=%layer, "image layer at the detach LSN already exists, skipping removing aux files"
+                );
+                return Ok(None);
+            }
+        }
+    }
+
+    let data = ancestor
+        .get_vectored_impl(
+            KeySpace::single(key_range.clone()),
+            image_lsn,
+            &mut reconstruct_state,
+            ctx,
+        )
+        .await
+        .context("failed to retrieve aux keys")
+        .map_err(|e| Error::launder(e, Error::Prepare))?;
+    if !data.is_empty() {
+        // TODO: is it possible that we can have an image at `image_lsn`? Unlikely because image layers are only generated
+        // upon compaction but theoretically possible.
+        let mut image_layer_writer = ImageLayerWriter::new(
+            detached.conf,
+            detached.timeline_id,
+            detached.tenant_shard_id,
+            &key_range,
+            image_lsn,
+            ctx,
+        )
+        .await
+        .context("failed to create image layer writer")
+        .map_err(Error::Prepare)?;
+        for key in data.keys() {
+            image_layer_writer
+                .put_image(*key, Bytes::new(), ctx)
+                .await
+                .context("failed to write key")
+                .map_err(|e| Error::launder(e, Error::Prepare))?;
+        }
+        let (desc, path) = image_layer_writer
+            .finish(ctx)
+            .await
+            .context("failed to finish image layer writer for removing the metadata keys")
+            .map_err(|e| Error::launder(e, Error::Prepare))?;
+        let generated = Layer::finish_creating(detached.conf, detached, desc, &path)
+            .map_err(|e| Error::launder(e, Error::Prepare))?;
+        detached
+            .remote_client
+            .upload_layer_file(&generated, &detached.cancel)
+            .await
+            .map_err(|e| Error::launder(e, Error::Prepare))?;
+        tracing::info!(layer=%generated, "wrote image layer");
+        Ok(Some(generated))
+    } else {
+        tracing::info!("no aux keys found in ancestor");
+        Ok(None)
+    }
+}
+
 /// See [`Timeline::prepare_to_detach_from_ancestor`]
 pub(super) async fn prepare(
     detached: &Arc<Timeline>,
@@ -352,10 +445,16 @@ pub(super) async fn prepare(
 
     // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
     let mut new_layers: Vec<Layer> =
-        Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len());
+        Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1);
+
+    if let Some(tombstone_layer) =
+        generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, ctx).await?
+    {
+        new_layers.push(tombstone_layer.into());
+    }
 
     {
-        tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
+        tracing::info!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
 
         let mut tasks = tokio::task::JoinSet::new();
 
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 8211da32fe..c2d176bf5a 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -1192,3 +1192,28 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         log.info(f"Got perf info response code: {res.status_code}")
         self.verbose_error(res)
         return res.json()
+
+    def ingest_aux_files(
+        self,
+        tenant_id: TenantId | TenantShardId,
+        timeline_id: TimelineId,
+        aux_files: dict[str, bytes],
+    ):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/ingest_aux_files",
+            json={
+                "aux_files": aux_files,
+            },
+        )
+        self.verbose_error(res)
+        return res.json()
+
+    def list_aux_files(
+        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
+    ) -> Any:
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/list_aux_files",
+            json={"lsn": str(lsn)},
+        )
+        self.verbose_error(res)
+        return res.json()
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 34c251285f..a71652af8a 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -1768,6 +1768,87 @@ def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBu
     workload_child.validate(env.pageserver.id)
 
 
+def test_timeline_detach_with_aux_files_with_detach_v1(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Validate that "branches do not inherit their parent" is invariant over detach_ancestor.
+
+    Branches hide parent branch aux files etc by stopping lookup of non-inherited keyspace at the parent-child boundary.
+    We had a bug where detach_ancestor running on a child branch would copy aux files key range from child to parent,
+    thereby making parent aux files reappear.
+    """
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "gc_period": "1s",
+            "lsn_lease_length": "0s",
+        }
+    )
+
+    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    http = env.pageserver.http_client()
+
+    endpoint = env.endpoints.create_start("main", tenant_id=env.initial_tenant)
+    lsn0 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
+    endpoint.safe_psql(
+        "SELECT pg_create_logical_replication_slot('test_slot_parent_1', 'pgoutput')"
+    )
+    lsn1 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
+    endpoint.safe_psql(
+        "SELECT pg_create_logical_replication_slot('test_slot_parent_2', 'pgoutput')"
+    )
+    lsn2 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
+    assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn0).keys()) == set(
+        []
+    )
+    assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn1).keys()) == set(
+        ["pg_replslot/test_slot_parent_1/state"]
+    )
+    assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set(
+        ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"]
+    )
+
+    # Restore at LSN1
+    branch_timeline_id = env.create_branch("restore", env.initial_tenant, "main", lsn1)
+    endpoint2 = env.endpoints.create_start("restore", tenant_id=env.initial_tenant)
+    assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
+
+    # Add a new slot file to the restore branch (This won't happen in reality because cplane immediately detaches the branch on restore,
+    # but we want to ensure that aux files on the detached branch are NOT inherited during ancestor detach. We could change the behavior
+    # in the future.
+    # TL;DR we should NEVER automatically detach a branch as a background optimization for those tenants that already used the restore
+    # feature before branch detach was introduced because it will clean up the aux files and stop logical replication.
+    endpoint2.safe_psql(
+        "SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')"
+    )
+    lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id)
+    assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
+    assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set(
+        ["pg_replslot/test_slot_restore/state"]
+    )
+
+    print("lsn0=", lsn0)
+    print("lsn1=", lsn1)
+    print("lsn2=", lsn2)
+    print("lsn3=", lsn3)
+    # Detach the restore branch so that main doesn't have any child branches.
+    all_reparented = http.detach_ancestor(
+        env.initial_tenant, branch_timeline_id, detach_behavior="v1"
+    )
+    assert all_reparented == set([])
+
+    # We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN.
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id)
+    assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set(
+        ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"]
+    ), "main branch unaffected"
+    assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set(
+        ["pg_replslot/test_slot_restore/state"]
+    )
+    assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
+
+
 # TODO:
 # - branch near existing L1 boundary, image layers?
 # - investigate: why are layers started at uneven lsn? not just after branching, but in general.

From 74920d8cd89727042a751a38aba6a318c143365b Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 3 Apr 2025 17:35:55 +0100
Subject: [PATCH 037/140] storcon: notify compute if correct observed state was
 refreshed (#11342)

## Problem

Previously, if the observed state was refreshed and matching the intent,
we wouldn't send
a compute notification. This is unsafe. There's no guarantee that the
location landed on the
pageserver _and_ a compute notification for it was delivered.

See
https://github.com/neondatabase/neon/issues/11291#issuecomment-2743205411
for one such example.

## Summary of changes

Add a reproducer and notify the compute if the correct observed state
required a refresh.

Closes https://github.com/neondatabase/neon/issues/11291
---
 storage_controller/src/reconciler.rs          |  22 +++-
 test_runner/regress/test_compatibility.py     |   2 +
 .../regress/test_storage_controller.py        | 118 ++++++++++++++++++
 3 files changed, 136 insertions(+), 6 deletions(-)

diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index 9f6f385dc9..b03a6dae04 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -686,6 +686,8 @@ impl Reconciler {
                 .await?,
         );
 
+        pausable_failpoint!("reconciler-live-migrate-post-generation-inc");
+
         let dest_conf = build_location_config(
             &self.shard,
             &self.config,
@@ -760,7 +762,9 @@ impl Reconciler {
         Ok(())
     }
 
-    async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> {
+    /// Returns true if the observed state of the attached location was refreshed
+    /// and false otherwise.
+    async fn maybe_refresh_observed(&mut self) -> Result<bool, ReconcileError> {
         // If the attached node has uncertain state, read it from the pageserver before proceeding: this
         // is important to avoid spurious generation increments.
         //
@@ -770,7 +774,7 @@ impl Reconciler {
 
         let Some(attached_node) = self.intent.attached.as_ref() else {
             // Nothing to do
-            return Ok(());
+            return Ok(false);
         };
 
         if matches!(
@@ -815,7 +819,7 @@ impl Reconciler {
             }
         }
 
-        Ok(())
+        Ok(true)
     }
 
     /// Reconciling a tenant makes API calls to pageservers until the observed state
@@ -831,7 +835,7 @@ impl Reconciler {
     /// state where it still requires later reconciliation.
     pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
         // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it
-        self.maybe_refresh_observed().await?;
+        let refreshed = self.maybe_refresh_observed().await?;
 
         // Special case: live migration
         self.maybe_live_migrate().await?;
@@ -855,8 +859,14 @@ impl Reconciler {
             );
             match self.observed.locations.get(&node.get_id()) {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
-                    // Nothing to do
-                    tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
+                    if refreshed {
+                        tracing::info!(
+                            node_id=%node.get_id(), "Observed configuration correct after refresh. Notifying compute.");
+                        self.compute_notify().await?;
+                    } else {
+                        // Nothing to do
+                        tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.");
+                    }
                 }
                 observed => {
                     // In all cases other than a matching observed configuration, we will
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index f61778e4c5..fcc2e7006f 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -249,6 +249,7 @@ def test_forward_compatibility(
     top_output_dir: Path,
     pg_version: PgVersion,
     compatibility_snapshot_dir: Path,
+    compute_reconfigure_listener: ComputeReconfigure,
 ):
     """
     Test that the old binaries can read new data
@@ -257,6 +258,7 @@ def test_forward_compatibility(
         os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
     )
 
+    neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api
     neon_env_builder.test_may_use_compatibility_snapshot_binaries = True
 
     try:
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 35a75ca607..b9344f2fb4 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -4176,3 +4176,121 @@ def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder,
         )
     else:
         assert initial_ps.http_client().tenant_list_locations()["tenant_shards"] == []
+
+
+@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
+def test_storage_controller_migrate_with_pageserver_restart(
+    neon_env_builder: NeonEnvBuilder, make_httpserver
+):
+    """
+    Test that live migrations which fail right after incrementing the generation
+    due to the destination going offline eventually send a compute notification
+    after the destination re-attaches.
+    """
+    neon_env_builder.num_pageservers = 2
+
+    neon_env_builder.storage_controller_config = {
+        # Disable transitions to offline
+        "max_offline": "600s",
+        "use_local_compute_notifications": False,
+    }
+
+    neon_env_builder.control_plane_hooks_api = (
+        f"http://{make_httpserver.host}:{make_httpserver.port}/"
+    )
+
+    notifications = []
+
+    def notify(request: Request):
+        log.info(f"Received notify-attach: {request}")
+        notifications.append(request.json)
+
+    make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(notify)
+
+    env = neon_env_builder.init_start()
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            ".*Call to node.*management API failed.*",
+            ".*Call to node.*management API still failed.*",
+            ".*Reconcile error.*",
+            ".*request.*PUT.*migrate.*",
+        ]
+    )
+
+    env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}})
+    env.storage_controller.reconcile_until_idle()
+
+    initial_desc = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0]
+    log.info(f"{initial_desc=}")
+    primary = env.get_pageserver(initial_desc["node_attached"])
+    secondary = env.get_pageserver(initial_desc["node_secondary"][0])
+
+    # Pause the migration after incrementing the generation in the database
+    env.storage_controller.configure_failpoints(
+        ("reconciler-live-migrate-post-generation-inc", "pause")
+    )
+
+    tenant_shard_id = TenantShardId(env.initial_tenant, 0, 0)
+
+    try:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            migrate_fut = executor.submit(
+                env.storage_controller.tenant_shard_migrate,
+                tenant_shard_id,
+                secondary.id,
+                config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True),
+            )
+
+            def has_hit_migration_failpoint():
+                expr = "at failpoint reconciler-live-migrate-post-generation-inc"
+                log.info(expr)
+                assert env.storage_controller.log_contains(expr)
+
+            wait_until(has_hit_migration_failpoint)
+
+            secondary.stop()
+
+            # Eventually migration completes
+            env.storage_controller.configure_failpoints(
+                ("reconciler-live-migrate-post-generation-inc", "off")
+            )
+            try:
+                migrate_fut.result()
+            except StorageControllerApiException as err:
+                log.info(f"Migration failed: {err}")
+    except:
+        env.storage_controller.configure_failpoints(
+            ("reconciler-live-migrate-post-generation-inc", "off")
+        )
+        raise
+
+    def process_migration_result():
+        dump = env.storage_controller.tenant_shard_dump()
+        observed = dump[0]["observed"]["locations"]
+
+        log.info(f"{observed=} primary={primary.id} secondary={secondary.id}")
+
+        assert observed[str(primary.id)]["conf"]["mode"] == "AttachedStale"
+        assert observed[str(secondary.id)]["conf"] is None
+
+    wait_until(process_migration_result)
+
+    # Start and wait for re-attach to be processed
+    secondary.start()
+    env.storage_controller.poll_node_status(
+        secondary.id,
+        desired_availability=PageserverAvailability.ACTIVE,
+        desired_scheduling_policy=None,
+        max_attempts=10,
+        backoff=1,
+    )
+
+    env.storage_controller.reconcile_until_idle()
+
+    assert notifications[-1] == {
+        "tenant_id": str(env.initial_tenant),
+        "stripe_size": None,
+        "shards": [{"node_id": int(secondary.id), "shard_number": 0}],
+        "preferred_az": DEFAULT_AZ_ID,
+    }

From 109c54a300816901055136a92e490843602d17f2 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 3 Apr 2025 13:18:37 -0400
Subject: [PATCH 038/140] fix(pageserver): avoid gc-compaction triggering
 circuit breaker (#11403)

## Problem

There are some cases where traditional gc might collect some layer files
causing gc-compaction cannot read the full history of the key. This
needs to be resolved in the long-term by improving the compaction
process. For now, let's simply avoid such errors triggering the circuit
breaker.

## Summary of changes

* Move the place where we trigger the circuit breaker. We only trigger
it during compactions other than L0 compactions. We added the trigger a
year ago due to file cleanup concerns in image layer compaction.
* For gc-compaction, only return errors to the upper
compaction_iteration if it's a shutdown error. Otherwise, just log it
and skip the compaction for a key range.

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 pageserver/src/tenant/timeline/compaction.rs | 50 +++++++++++++++-----
 1 file changed, 39 insertions(+), 11 deletions(-)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 2276ed428b..9693d232ee 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -279,7 +279,7 @@ impl GcCompactionQueue {
             gc_compaction_ratio_percent: u64,
         ) -> bool {
             const AUTO_TRIGGER_LIMIT: u64 = 150 * 1024 * 1024 * 1024; // 150GB
-            if l1_size >= AUTO_TRIGGER_LIMIT || l2_size >= AUTO_TRIGGER_LIMIT {
+            if l1_size + l2_size >= AUTO_TRIGGER_LIMIT {
                 // Do not auto-trigger when physical size >= 150GB
                 return false;
             }
@@ -350,6 +350,11 @@ impl GcCompactionQueue {
         }
     }
 
+    fn clear_running_job(&self) {
+        let mut guard = self.inner.lock().unwrap();
+        guard.running = None;
+    }
+
     async fn handle_sub_compaction(
         &self,
         id: GcCompactionJobId,
@@ -361,12 +366,20 @@ impl GcCompactionQueue {
         info!(
             "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"
         );
-        let jobs = timeline
+        let res = timeline
             .gc_compaction_split_jobs(
                 GcCompactJob::from_compact_options(options.clone()),
                 options.sub_compaction_max_job_size_mb,
             )
-            .await?;
+            .await;
+        let jobs = match res {
+            Ok(jobs) => jobs,
+            Err(err) => {
+                warn!("cannot split gc-compaction jobs: {}, unblocked gc", err);
+                self.notify_and_unblock(id);
+                return Err(err);
+            }
+        };
         if jobs.is_empty() {
             info!("no jobs to run, skipping scheduled compaction task");
             self.notify_and_unblock(id);
@@ -446,7 +459,18 @@ impl GcCompactionQueue {
         if let Err(err) = &res {
             log_compaction_error(err, None, cancel.is_cancelled());
         }
-        res
+        match res {
+            Ok(res) => Ok(res),
+            Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown),
+            Err(_) => {
+                // There are some cases where traditional gc might collect some layer
+                // files causing gc-compaction cannot read the full history of the key.
+                // This needs to be resolved in the long-term by improving the compaction
+                // process. For now, let's simply avoid such errors triggering the
+                // circuit breaker.
+                Ok(CompactionOutcome::Skipped)
+            }
+        }
     }
 
     async fn iteration_inner(
@@ -512,9 +536,16 @@ impl GcCompactionQueue {
                         let mut guard = self.inner.lock().unwrap();
                         guard.guards.entry(id).or_default().gc_guard = Some(gc_guard);
                     }
-                    let compaction_result =
-                        timeline.compact_with_options(cancel, options, ctx).await?;
-                    self.notify_and_unblock(id);
+                    let res = timeline.compact_with_options(cancel, options, ctx).await;
+                    let compaction_result = match res {
+                        Ok(res) => res,
+                        Err(err) => {
+                            warn!(%err, "failed to run gc-compaction, gc unblocked");
+                            self.notify_and_unblock(id);
+                            self.clear_running_job();
+                            return Err(err);
+                        }
+                    };
                     if compaction_result == CompactionOutcome::YieldForL0 {
                         yield_for_l0 = true;
                     }
@@ -553,10 +584,7 @@ impl GcCompactionQueue {
                 }
             }
         }
-        {
-            let mut guard = self.inner.lock().unwrap();
-            guard.running = None;
-        }
+        self.clear_running_job();
         Ok(if yield_for_l0 {
             tracing::info!("give up gc-compaction: yield for L0 compaction");
             CompactionOutcome::YieldForL0

From bfc767d60d05dbe122118b4feffc6abd4f7a723d Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 3 Apr 2025 13:49:45 -0400
Subject: [PATCH 039/140] fix(test): wait for shard split complete for
 test_lsn_lease_storcon (#11436)

## Problem

close https://github.com/neondatabase/neon/issues/11397
ref https://github.com/neondatabase/cloud/issues/23667

## Summary of changes

We need to wait until the shard split is complete, otherwise it will
print warning like waiting for shard split exclusive lock for 30s.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/regress/test_tenant_size.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index 0cb22905b0..190dd914ee 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -776,6 +776,7 @@ def test_lsn_lease_storcon(neon_env_builder: NeonEnvBuilder):
         env.initial_tenant, env.initial_timeline, last_flush_lsn
     )
     env.storage_controller.tenant_shard_split(env.initial_tenant, 8)
+    env.storage_controller.reconcile_until_idle(timeout_secs=120)
     # TODO: do we preserve LSN leases across shard splits?
     env.storage_controller.pageserver_api().timeline_lsn_lease(
         env.initial_tenant, env.initial_timeline, last_flush_lsn

From 9db63fea7a9c67233255cfcf687633b59b857d8a Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 3 Apr 2025 18:56:51 +0100
Subject: [PATCH 040/140] pageserver: optionally export perf traces in OTEL
 format (#11140)

Based on https://github.com/neondatabase/neon/pull/11139

## Problem

We want to export performance traces from the pageserver in OTEL format.
End goal is to see them in Grafana.

## Summary of changes

https://github.com/neondatabase/neon/pull/11139 introduces the
infrastructure required to run the otel collector alongside the
pageserver.

### Design

Requirements:
1. We'd like to avoid implementing our own performance tracing stack if
possible and use the `tracing` crate if possible.
2. Ideally, we'd like zero overhead of a sampling rate of zero and be a
be able to change the tracing config for a tenant on the fly.
3. We should leave the current span hierarchy intact. This includes
adding perf traces without modifying existing tracing.

To satisfy (3) (and (2) in part) a separate span hierarchy is used.
`RequestContext` gains an optional `perf_span` member
that's only set when the request was chosen by sampling. All perf span
related methods added to `RequestContext` are no-ops for requests that
are not sampled.

This on its own is not enough for (3), so performance spans use a
separate tracing subscriber. The `tracing` crate doesn't have great
support for this, so there's a fair amount of boilerplate to override
the subscriber at all points of the perf span lifecycle.

### Perf Impact

[Periodic
pagebench](https://neonprod.grafana.net/d/ddqtbfykfqfi8d/e904990?orgId=1&from=2025-02-08T14:15:59.362Z&to=2025-03-10T14:15:59.362Z&timezone=utc)
shows no statistically significant regression with a sample ratio of 0.
There's an annotation on the dashboard on 2025-03-06.

### Overview of changes:
1. Clean up the `RequestContext` API a bit. Namely, get rid of the
`RequestContext::extend` API and use the builder instead.
2. Add pageserver level configs for tracing: sampling ratio, otel
endpoint, etc.
3. Introduce some perf span tracking utilities and expose them via
`RequestContext`. We add a `tracing::Span` wrapper to be used for perf
spans and a `tracing::Instrumented` equivalent for it. See doc comments
for reason.
4. Set up OTEL tracing infra according to configuration. A separate
runtime is used for the collector.
5. Add perf traces to the read path.

## Refs

- epic https://github.com/neondatabase/neon/issues/9873

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 Cargo.lock                                    |   2 +
 libs/pageserver_api/Cargo.toml                |   1 +
 libs/pageserver_api/src/config.rs             |  50 +++++
 libs/tracing-utils/Cargo.toml                 |   1 +
 libs/tracing-utils/src/lib.rs                 |   2 +-
 libs/tracing-utils/src/perf_span.rs           | 153 ++++++++++++++
 pageserver/src/bin/pageserver.rs              |  41 +++-
 pageserver/src/config.rs                      |  15 ++
 pageserver/src/context.rs                     | 186 ++++++++++++++----
 pageserver/src/http/routes.rs                 |  16 +-
 pageserver/src/lib.rs                         |   3 +
 pageserver/src/page_service.rs                | 124 ++++++++++--
 pageserver/src/pgdatadir_mapping.rs           |  73 +++++--
 pageserver/src/tenant.rs                      |   4 +-
 pageserver/src/tenant/storage_layer.rs        |  36 +++-
 .../src/tenant/storage_layer/delta_layer.rs   |   8 +-
 .../src/tenant/storage_layer/image_layer.rs   |   4 +-
 .../tenant/storage_layer/inmemory_layer.rs    |   4 +-
 pageserver/src/tenant/storage_layer/layer.rs  |  68 +++++--
 .../src/tenant/storage_layer/layer/tests.rs   |  17 +-
 pageserver/src/tenant/timeline.rs             | 132 +++++++++++--
 pageserver/src/tenant/timeline/compaction.rs  |   4 +-
 test_runner/fixtures/neon_fixtures.py         |  34 ++++
 .../fixtures/pageserver/allowed_errors.py     |   1 +
 ...er_max_throughput_getpage_at_latest_lsn.py |  11 ++
 25 files changed, 855 insertions(+), 135 deletions(-)
 create mode 100644 libs/tracing-utils/src/perf_span.rs

diff --git a/Cargo.lock b/Cargo.lock
index 194ad90d52..03a376cdae 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4329,6 +4329,7 @@ dependencies = [
  "strum",
  "strum_macros",
  "thiserror 1.0.69",
+ "tracing-utils",
  "utils",
 ]
 
@@ -7603,6 +7604,7 @@ dependencies = [
  "opentelemetry-otlp",
  "opentelemetry-semantic-conventions",
  "opentelemetry_sdk",
+ "pin-project-lite",
  "tokio",
  "tracing",
  "tracing-opentelemetry",
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index 87dfdfb5ec..688e9de6e7 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -34,6 +34,7 @@ postgres_backend.workspace = true
 nix = {workspace = true, optional = true}
 reqwest.workspace = true
 rand.workspace = true
+tracing-utils.workspace = true
 
 [dev-dependencies]
 bincode.workspace = true
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 47c3136113..66a02b87b0 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -134,6 +134,7 @@ pub struct ConfigToml {
     pub load_previous_heatmap: Option<bool>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub generate_unarchival_heatmap: Option<bool>,
+    pub tracing: Option<Tracing>,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -191,6 +192,54 @@ pub enum GetVectoredConcurrentIo {
     SidecarTask,
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub struct Ratio {
+    pub numerator: usize,
+    pub denominator: usize,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub struct OtelExporterConfig {
+    pub endpoint: String,
+    pub protocol: OtelExporterProtocol,
+    #[serde(with = "humantime_serde")]
+    pub timeout: Duration,
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "kebab-case")]
+pub enum OtelExporterProtocol {
+    Grpc,
+    HttpBinary,
+    HttpJson,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub struct Tracing {
+    pub sampling_ratio: Ratio,
+    pub export_config: OtelExporterConfig,
+}
+
+impl From<&OtelExporterConfig> for tracing_utils::ExportConfig {
+    fn from(val: &OtelExporterConfig) -> Self {
+        tracing_utils::ExportConfig {
+            endpoint: Some(val.endpoint.clone()),
+            protocol: val.protocol.into(),
+            timeout: val.timeout,
+        }
+    }
+}
+
+impl From<OtelExporterProtocol> for tracing_utils::Protocol {
+    fn from(val: OtelExporterProtocol) -> Self {
+        match val {
+            OtelExporterProtocol::Grpc => tracing_utils::Protocol::Grpc,
+            OtelExporterProtocol::HttpJson => tracing_utils::Protocol::HttpJson,
+            OtelExporterProtocol::HttpBinary => tracing_utils::Protocol::HttpBinary,
+        }
+    }
+}
+
 pub mod statvfs {
     pub mod mock {
         #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -537,6 +586,7 @@ impl Default for ConfigToml {
             validate_wal_contiguity: None,
             load_previous_heatmap: None,
             generate_unarchival_heatmap: None,
+            tracing: None,
         }
     }
 }
diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml
index 60637d5b24..49a6055b1e 100644
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -14,6 +14,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
+pin-project-lite.workspace = true
 
 [dev-dependencies]
 tracing-subscriber.workspace = true    # For examples in docs
diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs
index 74992a7d03..0893aa173b 100644
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -31,10 +31,10 @@
 //!         .init();
 //! }
 //! ```
-#![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]
 
 pub mod http;
+pub mod perf_span;
 
 use opentelemetry::KeyValue;
 use opentelemetry::trace::TracerProvider;
diff --git a/libs/tracing-utils/src/perf_span.rs b/libs/tracing-utils/src/perf_span.rs
new file mode 100644
index 0000000000..f2ca76a816
--- /dev/null
+++ b/libs/tracing-utils/src/perf_span.rs
@@ -0,0 +1,153 @@
+//! Crutch module to work around tracing infrastructure deficiencies
+//!
+//! We wish to collect granular request spans without impacting performance
+//! by much. Ideally, we should have zero overhead for a sampling rate of 0.
+//!
+//! The approach taken by the pageserver crate is to use a completely different
+//! span hierarchy for the performance spans. Spans are explicitly stored in
+//! the request context and use a different [`tracing::Subscriber`] in order
+//! to avoid expensive filtering.
+//!
+//! [`tracing::Span`] instances record their [`tracing::Dispatch`] and, implcitly,
+//! their [`tracing::Subscriber`] at creation time. However, upon exiting the span,
+//! the global default [`tracing::Dispatch`] is used. This is problematic if one
+//! wishes to juggle different subscribers.
+//!
+//! In order to work around this, this module provides a [`PerfSpan`] type which
+//! wraps a [`Span`] and sets the default subscriber when exiting the span. This
+//! achieves the correct routing.
+//!
+//! There's also a modified version of [`tracing::Instrument`] which works with
+//! [`PerfSpan`].
+
+use core::{
+    future::Future,
+    marker::Sized,
+    mem::ManuallyDrop,
+    pin::Pin,
+    task::{Context, Poll},
+};
+use pin_project_lite::pin_project;
+use tracing::{Dispatch, field, span::Span};
+
+#[derive(Debug, Clone)]
+pub struct PerfSpan {
+    inner: ManuallyDrop<Span>,
+    dispatch: Dispatch,
+}
+
+#[must_use = "once a span has been entered, it should be exited"]
+pub struct PerfSpanEntered<'a> {
+    span: &'a PerfSpan,
+}
+
+impl PerfSpan {
+    pub fn new(span: Span, dispatch: Dispatch) -> Self {
+        Self {
+            inner: ManuallyDrop::new(span),
+            dispatch,
+        }
+    }
+
+    pub fn record<Q: field::AsField + ?Sized, V: field::Value>(
+        &self,
+        field: &Q,
+        value: V,
+    ) -> &Self {
+        self.inner.record(field, value);
+        self
+    }
+
+    pub fn enter(&self) -> PerfSpanEntered {
+        if let Some(ref id) = self.inner.id() {
+            self.dispatch.enter(id);
+        }
+
+        PerfSpanEntered { span: self }
+    }
+
+    pub fn inner(&self) -> &Span {
+        &self.inner
+    }
+}
+
+impl Drop for PerfSpan {
+    fn drop(&mut self) {
+        // Bring the desired dispatch into scope before explicitly calling
+        // the span destructor. This routes the span exit to the correct
+        // [`tracing::Subscriber`].
+        let _dispatch_guard = tracing::dispatcher::set_default(&self.dispatch);
+        // SAFETY: ManuallyDrop in Drop implementation
+        unsafe { ManuallyDrop::drop(&mut self.inner) }
+    }
+}
+
+impl Drop for PerfSpanEntered<'_> {
+    fn drop(&mut self) {
+        assert!(self.span.inner.id().is_some());
+
+        let _dispatch_guard = tracing::dispatcher::set_default(&self.span.dispatch);
+        self.span.dispatch.exit(&self.span.inner.id().unwrap());
+    }
+}
+
+pub trait PerfInstrument: Sized {
+    fn instrument(self, span: PerfSpan) -> PerfInstrumented<Self> {
+        PerfInstrumented {
+            inner: ManuallyDrop::new(self),
+            span,
+        }
+    }
+}
+
+pin_project! {
+    #[project = PerfInstrumentedProj]
+    #[derive(Debug, Clone)]
+    #[must_use = "futures do nothing unless you `.await` or poll them"]
+    pub struct PerfInstrumented<T> {
+        // `ManuallyDrop` is used here to to enter instrument `Drop` by entering
+        // `Span` and executing `ManuallyDrop::drop`.
+        #[pin]
+        inner: ManuallyDrop<T>,
+        span: PerfSpan,
+    }
+
+    impl<T> PinnedDrop for PerfInstrumented<T> {
+        fn drop(this: Pin<&mut Self>) {
+            let this = this.project();
+            let _enter = this.span.enter();
+            // SAFETY: 1. `Pin::get_unchecked_mut()` is safe, because this isn't
+            //             different from wrapping `T` in `Option` and calling
+            //             `Pin::set(&mut this.inner, None)`, except avoiding
+            //             additional memory overhead.
+            //         2. `ManuallyDrop::drop()` is safe, because
+            //            `PinnedDrop::drop()` is guaranteed to be called only
+            //            once.
+            unsafe { ManuallyDrop::drop(this.inner.get_unchecked_mut()) }
+        }
+    }
+}
+
+impl<'a, T> PerfInstrumentedProj<'a, T> {
+    /// Get a mutable reference to the [`Span`] a pinned mutable reference to
+    /// the wrapped type.
+    fn span_and_inner_pin_mut(self) -> (&'a mut PerfSpan, Pin<&'a mut T>) {
+        // SAFETY: As long as `ManuallyDrop<T>` does not move, `T` won't move
+        //         and `inner` is valid, because `ManuallyDrop::drop` is called
+        //         only inside `Drop` of the `Instrumented`.
+        let inner = unsafe { self.inner.map_unchecked_mut(|v| &mut **v) };
+        (self.span, inner)
+    }
+}
+
+impl<T: Future> Future for PerfInstrumented<T> {
+    type Output = T::Output;
+
+    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        let (span, inner) = self.project().span_and_inner_pin_mut();
+        let _enter = span.enter();
+        inner.poll(cx)
+    }
+}
+
+impl<T: Sized> PerfInstrument for T {}
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 4cfc0c24f8..a575904efa 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -35,6 +35,7 @@ use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use tracing_utils::OtelGuard;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::crashsafe::syncfs;
 use utils::logging::TracingErrorLayerEnablement;
@@ -118,6 +119,21 @@ fn main() -> anyhow::Result<()> {
         logging::Output::Stdout,
     )?;
 
+    let otel_enablement = match &conf.tracing {
+        Some(cfg) => tracing_utils::OtelEnablement::Enabled {
+            service_name: "pageserver".to_string(),
+            export_config: (&cfg.export_config).into(),
+            runtime: *COMPUTE_REQUEST_RUNTIME,
+        },
+        None => tracing_utils::OtelEnablement::Disabled,
+    };
+
+    let otel_guard = tracing_utils::init_performance_tracing(otel_enablement);
+
+    if otel_guard.is_some() {
+        info!(?conf.tracing, "starting with OTEL tracing enabled");
+    }
+
     // mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
     // disarming this hook on pageserver, because we never tear down tracing.
     logging::replace_panic_hook_with_tracing_panic_hook().forget();
@@ -191,7 +207,7 @@ fn main() -> anyhow::Result<()> {
     tracing::info!("Initializing page_cache...");
     page_cache::init(conf.page_cache_size);
 
-    start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
+    start_pageserver(launch_ts, conf, otel_guard).context("Failed to start pageserver")?;
 
     scenario.teardown();
     Ok(())
@@ -290,6 +306,7 @@ fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
 fn start_pageserver(
     launch_ts: &'static LaunchTimestamp,
     conf: &'static PageServerConf,
+    otel_guard: Option<OtelGuard>,
 ) -> anyhow::Result<()> {
     // Monotonic time for later calculating startup duration
     let started_startup_at = Instant::now();
@@ -675,13 +692,21 @@ fn start_pageserver(
 
     // Spawn a task to listen for libpq connections. It will spawn further tasks
     // for each connection. We created the listener earlier already.
-    let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
-        let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
-        pageserver_listener
-            .set_nonblocking(true)
-            .context("set listener to nonblocking")?;
-        tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
-    });
+    let perf_trace_dispatch = otel_guard.as_ref().map(|g| g.dispatch.clone());
+    let page_service = page_service::spawn(
+        conf,
+        tenant_manager.clone(),
+        pg_auth,
+        perf_trace_dispatch,
+        {
+            let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
+            pageserver_listener
+                .set_nonblocking(true)
+                .context("set listener to nonblocking")?;
+            tokio::net::TcpListener::from_std(pageserver_listener)
+                .context("create tokio listener")?
+        },
+    );
 
     // All started up! Now just sit and wait for shutdown signal.
     BACKGROUND_RUNTIME.block_on(async move {
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index c336f22f8e..d9a5f8c381 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -215,6 +215,8 @@ pub struct PageServerConf {
 
     /// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline.
     pub generate_unarchival_heatmap: bool,
+
+    pub tracing: Option<pageserver_api::config::Tracing>,
 }
 
 /// Token for authentication to safekeepers
@@ -386,6 +388,7 @@ impl PageServerConf {
             validate_wal_contiguity,
             load_previous_heatmap,
             generate_unarchival_heatmap,
+            tracing,
         } = config_toml;
 
         let mut conf = PageServerConf {
@@ -435,6 +438,7 @@ impl PageServerConf {
             wal_receiver_protocol,
             page_service_pipelining,
             get_vectored_concurrent_io,
+            tracing,
 
             // ------------------------------------------------------------
             // fields that require additional validation or custom handling
@@ -506,6 +510,17 @@ impl PageServerConf {
             );
         }
 
+        if let Some(tracing_config) = conf.tracing.as_ref() {
+            let ratio = &tracing_config.sampling_ratio;
+            ensure!(
+                ratio.denominator != 0 && ratio.denominator >= ratio.numerator,
+                format!(
+                    "Invalid sampling ratio: {}/{}",
+                    ratio.numerator, ratio.denominator
+                )
+            );
+        }
+
         IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
             .map_err(anyhow::Error::msg)
             .with_context(|| {
diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs
index d2caf030df..279d2daf75 100644
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -100,6 +100,12 @@ use crate::{
     task_mgr::TaskKind,
     tenant::Timeline,
 };
+use futures::FutureExt;
+use futures::future::BoxFuture;
+use std::future::Future;
+use tracing_utils::perf_span::{PerfInstrument, PerfSpan};
+
+use tracing::{Dispatch, Span};
 
 // The main structure of this module, see module-level comment.
 pub struct RequestContext {
@@ -109,6 +115,8 @@ pub struct RequestContext {
     page_content_kind: PageContentKind,
     read_path_debug: bool,
     scope: Scope,
+    perf_span: Option<PerfSpan>,
+    perf_span_dispatch: Option<Dispatch>,
 }
 
 #[derive(Clone)]
@@ -263,22 +271,15 @@ impl RequestContextBuilder {
                 page_content_kind: PageContentKind::Unknown,
                 read_path_debug: false,
                 scope: Scope::new_global(),
+                perf_span: None,
+                perf_span_dispatch: None,
             },
         }
     }
 
-    pub fn extend(original: &RequestContext) -> Self {
+    pub fn from(original: &RequestContext) -> Self {
         Self {
-            // This is like a Copy, but avoid implementing Copy because ordinary users of
-            // RequestContext should always move or ref it.
-            inner: RequestContext {
-                task_kind: original.task_kind,
-                download_behavior: original.download_behavior,
-                access_stats_behavior: original.access_stats_behavior,
-                page_content_kind: original.page_content_kind,
-                read_path_debug: original.read_path_debug,
-                scope: original.scope.clone(),
-            },
+            inner: original.clone(),
         }
     }
 
@@ -316,12 +317,74 @@ impl RequestContextBuilder {
         self
     }
 
-    pub fn build(self) -> RequestContext {
+    pub(crate) fn perf_span_dispatch(mut self, dispatch: Option<Dispatch>) -> Self {
+        self.inner.perf_span_dispatch = dispatch;
+        self
+    }
+
+    pub fn root_perf_span<Fn>(mut self, make_span: Fn) -> Self
+    where
+        Fn: FnOnce() -> Span,
+    {
+        assert!(self.inner.perf_span.is_none());
+        assert!(self.inner.perf_span_dispatch.is_some());
+
+        let dispatcher = self.inner.perf_span_dispatch.as_ref().unwrap();
+        let new_span = tracing::dispatcher::with_default(dispatcher, make_span);
+
+        self.inner.perf_span = Some(PerfSpan::new(new_span, dispatcher.clone()));
+
+        self
+    }
+
+    pub fn perf_span<Fn>(mut self, make_span: Fn) -> Self
+    where
+        Fn: FnOnce(&Span) -> Span,
+    {
+        if let Some(ref perf_span) = self.inner.perf_span {
+            assert!(self.inner.perf_span_dispatch.is_some());
+            let dispatcher = self.inner.perf_span_dispatch.as_ref().unwrap();
+
+            let new_span =
+                tracing::dispatcher::with_default(dispatcher, || make_span(perf_span.inner()));
+
+            self.inner.perf_span = Some(PerfSpan::new(new_span, dispatcher.clone()));
+        }
+
+        self
+    }
+
+    pub fn root(self) -> RequestContext {
+        self.inner
+    }
+
+    pub fn attached_child(self) -> RequestContext {
+        self.inner
+    }
+
+    pub fn detached_child(self) -> RequestContext {
         self.inner
     }
 }
 
 impl RequestContext {
+    /// Private clone implementation
+    ///
+    /// Callers should use the [`RequestContextBuilder`] or child spaning APIs of
+    /// [`RequestContext`].
+    fn clone(&self) -> Self {
+        Self {
+            task_kind: self.task_kind,
+            download_behavior: self.download_behavior,
+            access_stats_behavior: self.access_stats_behavior,
+            page_content_kind: self.page_content_kind,
+            read_path_debug: self.read_path_debug,
+            scope: self.scope.clone(),
+            perf_span: self.perf_span.clone(),
+            perf_span_dispatch: self.perf_span_dispatch.clone(),
+        }
+    }
+
     /// Create a new RequestContext that has no parent.
     ///
     /// The function is called `new` because, once we add children
@@ -337,7 +400,7 @@ impl RequestContext {
     pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
         RequestContextBuilder::new(task_kind)
             .download_behavior(download_behavior)
-            .build()
+            .root()
     }
 
     /// Create a detached child context for a task that may outlive `self`.
@@ -358,7 +421,10 @@ impl RequestContext {
     ///
     /// We could make new calls to this function fail if `self` is already canceled.
     pub fn detached_child(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        self.child_impl(task_kind, download_behavior)
+        RequestContextBuilder::from(self)
+            .task_kind(task_kind)
+            .download_behavior(download_behavior)
+            .detached_child()
     }
 
     /// Create a child of context `self` for a task that shall not outlive `self`.
@@ -382,7 +448,7 @@ impl RequestContext {
     /// The method to wait for child tasks would return an error, indicating
     /// that the child task was not started because the context was canceled.
     pub fn attached_child(&self) -> Self {
-        self.child_impl(self.task_kind(), self.download_behavior())
+        RequestContextBuilder::from(self).attached_child()
     }
 
     /// Use this function when you should be creating a child context using
@@ -397,17 +463,10 @@ impl RequestContext {
         Self::new(task_kind, download_behavior)
     }
 
-    fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        RequestContextBuilder::extend(self)
-            .task_kind(task_kind)
-            .download_behavior(download_behavior)
-            .build()
-    }
-
     pub fn with_scope_timeline(&self, timeline: &Arc<Timeline>) -> Self {
-        RequestContextBuilder::extend(self)
+        RequestContextBuilder::from(self)
             .scope(Scope::new_timeline(timeline))
-            .build()
+            .attached_child()
     }
 
     pub(crate) fn with_scope_page_service_pagestream(
@@ -416,9 +475,9 @@ impl RequestContext {
             crate::page_service::TenantManagerTypes,
         >,
     ) -> Self {
-        RequestContextBuilder::extend(self)
+        RequestContextBuilder::from(self)
             .scope(Scope::new_page_service_pagestream(timeline_handle))
-            .build()
+            .attached_child()
     }
 
     pub fn with_scope_secondary_timeline(
@@ -426,28 +485,30 @@ impl RequestContext {
         tenant_shard_id: &TenantShardId,
         timeline_id: &TimelineId,
     ) -> Self {
-        RequestContextBuilder::extend(self)
+        RequestContextBuilder::from(self)
             .scope(Scope::new_secondary_timeline(tenant_shard_id, timeline_id))
-            .build()
+            .attached_child()
     }
 
     pub fn with_scope_secondary_tenant(&self, tenant_shard_id: &TenantShardId) -> Self {
-        RequestContextBuilder::extend(self)
+        RequestContextBuilder::from(self)
             .scope(Scope::new_secondary_tenant(tenant_shard_id))
-            .build()
+            .attached_child()
     }
 
     #[cfg(test)]
     pub fn with_scope_unit_test(&self) -> Self {
-        RequestContextBuilder::new(TaskKind::UnitTest)
+        RequestContextBuilder::from(self)
+            .task_kind(TaskKind::UnitTest)
             .scope(Scope::new_unit_test())
-            .build()
+            .attached_child()
     }
 
     pub fn with_scope_debug_tools(&self) -> Self {
-        RequestContextBuilder::new(TaskKind::DebugTool)
+        RequestContextBuilder::from(self)
+            .task_kind(TaskKind::DebugTool)
             .scope(Scope::new_debug_tools())
-            .build()
+            .attached_child()
     }
 
     pub fn task_kind(&self) -> TaskKind {
@@ -504,4 +565,61 @@ impl RequestContext {
             Scope::DebugTools { io_size_metrics } => io_size_metrics,
         }
     }
+
+    pub(crate) fn perf_follows_from(&self, from: &RequestContext) {
+        if let (Some(span), Some(from_span)) = (&self.perf_span, &from.perf_span) {
+            span.inner().follows_from(from_span.inner());
+        }
+    }
+
+    pub(crate) fn perf_span_record<
+        Q: tracing::field::AsField + ?Sized,
+        V: tracing::field::Value,
+    >(
+        &self,
+        field: &Q,
+        value: V,
+    ) {
+        if let Some(span) = &self.perf_span {
+            span.record(field, value);
+        }
+    }
+
+    pub(crate) fn has_perf_span(&self) -> bool {
+        self.perf_span.is_some()
+    }
 }
+
+/// [`Future`] extension trait that allow for creating performance
+/// spans on sampled requests
+pub(crate) trait PerfInstrumentFutureExt<'a>: Future + Send {
+    /// Instrument this future with a new performance span when the
+    /// provided request context indicates the originator request
+    /// was sampled. Otherwise, just box the future and return it as is.
+    fn maybe_perf_instrument<Fn>(
+        self,
+        ctx: &RequestContext,
+        make_span: Fn,
+    ) -> BoxFuture<'a, Self::Output>
+    where
+        Self: Sized + 'a,
+        Fn: FnOnce(&Span) -> Span,
+    {
+        match &ctx.perf_span {
+            Some(perf_span) => {
+                assert!(ctx.perf_span_dispatch.is_some());
+                let dispatcher = ctx.perf_span_dispatch.as_ref().unwrap();
+
+                let new_span =
+                    tracing::dispatcher::with_default(dispatcher, || make_span(perf_span.inner()));
+
+                let new_perf_span = PerfSpan::new(new_span, dispatcher.clone());
+                self.instrument(new_perf_span).boxed()
+            }
+            None => self.boxed(),
+        }
+    }
+}
+
+// Implement the trait for all types that satisfy the trait bounds
+impl<'a, T: Future + Send + 'a> PerfInstrumentFutureExt<'a> for T {}
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 8dcb654e59..cf67dc596a 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2697,11 +2697,12 @@ async fn getpage_at_lsn_handler_inner(
     let lsn: Option<Lsn> = parse_query_param(&request, "lsn")?;
 
     async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        // Enable read path debugging
         let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
-        let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true)
-        .scope(context::Scope::new_timeline(&timeline)).build();
+        let ctx = RequestContextBuilder::new(TaskKind::MgmtRequest)
+            .download_behavior(DownloadBehavior::Download)
+            .scope(context::Scope::new_timeline(&timeline))
+            .read_path_debug(true)
+            .root();
 
         // Use last_record_lsn if no lsn is provided
         let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
@@ -3433,14 +3434,15 @@ async fn put_tenant_timeline_import_wal(
 
     check_permission(&request, Some(tenant_id))?;
 
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
     let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
     async move {
         let state = get_state(&request);
 
         let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
-        let ctx = RequestContextBuilder::extend(&ctx).scope(context::Scope::new_timeline(&timeline)).build();
+        let ctx = RequestContextBuilder::new(TaskKind::MgmtRequest)
+            .download_behavior(DownloadBehavior::Warn)
+            .scope(context::Scope::new_timeline(&timeline))
+            .root();
 
         let mut body = StreamReader::new(request.into_body().map(|res| {
             res.map_err(|error| {
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 8373d0bd87..bda218444d 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -55,6 +55,9 @@ pub const DEFAULT_PG_VERSION: u32 = 16;
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
 pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
 
+// Target used for performance traces.
+pub const PERF_TRACE_TARGET: &str = "P";
+
 static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
 
 pub use crate::metrics::preinitialize_metrics;
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 2ed3e0ecb0..3ebd6d8506 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -9,6 +9,7 @@ use std::sync::Arc;
 use std::time::{Duration, Instant, SystemTime};
 use std::{io, str};
 
+use crate::PERF_TRACE_TARGET;
 use anyhow::{Context, bail};
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
@@ -17,7 +18,7 @@ use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
     PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
-    PageServiceProtocolPipelinedExecutionStrategy,
+    PageServiceProtocolPipelinedExecutionStrategy, Tracing,
 };
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::models::{
@@ -36,6 +37,7 @@ use postgres_ffi::BLCKSZ;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use pq_proto::framed::ConnectionError;
 use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor};
+use rand::Rng;
 use strum_macros::IntoStaticStr;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter};
 use tokio::task::JoinHandle;
@@ -53,7 +55,9 @@ use utils::sync::spsc_fold;
 use crate::auth::check_permission;
 use crate::basebackup::BasebackupError;
 use crate::config::PageServerConf;
-use crate::context::{DownloadBehavior, RequestContext};
+use crate::context::{
+    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
+};
 use crate::metrics::{
     self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
     TimelineMetrics,
@@ -100,6 +104,7 @@ pub fn spawn(
     conf: &'static PageServerConf,
     tenant_manager: Arc<TenantManager>,
     pg_auth: Option<Arc<SwappableJwtAuth>>,
+    perf_trace_dispatch: Option<Dispatch>,
     tcp_listener: tokio::net::TcpListener,
 ) -> Listener {
     let cancel = CancellationToken::new();
@@ -117,6 +122,7 @@ pub fn spawn(
             conf,
             tenant_manager,
             pg_auth,
+            perf_trace_dispatch,
             tcp_listener,
             conf.pg_auth_type,
             conf.page_service_pipelining.clone(),
@@ -173,6 +179,7 @@ pub async fn libpq_listener_main(
     conf: &'static PageServerConf,
     tenant_manager: Arc<TenantManager>,
     auth: Option<Arc<SwappableJwtAuth>>,
+    perf_trace_dispatch: Option<Dispatch>,
     listener: tokio::net::TcpListener,
     auth_type: AuthType,
     pipelining_config: PageServicePipeliningConfig,
@@ -205,8 +212,12 @@ pub async fn libpq_listener_main(
                 // Connection established. Spawn a new task to handle it.
                 debug!("accepted connection from {}", peer_addr);
                 let local_auth = auth.clone();
-                let connection_ctx = listener_ctx
-                    .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download);
+                let connection_ctx = RequestContextBuilder::from(&listener_ctx)
+                    .task_kind(TaskKind::PageRequestHandler)
+                    .download_behavior(DownloadBehavior::Download)
+                    .perf_span_dispatch(perf_trace_dispatch.clone())
+                    .detached_child();
+
                 connection_handler_tasks.spawn(page_service_conn_main(
                     conf,
                     tenant_manager.clone(),
@@ -607,6 +618,7 @@ impl std::fmt::Display for BatchedPageStreamError {
 struct BatchedGetPageRequest {
     req: PagestreamGetPageRequest,
     timer: SmgrOpTimer,
+    ctx: RequestContext,
 }
 
 #[cfg(feature = "testing")]
@@ -743,6 +755,7 @@ impl PageServerHandler {
         tenant_id: TenantId,
         timeline_id: TimelineId,
         timeline_handles: &mut TimelineHandles,
+        tracing_config: Option<&Tracing>,
         cancel: &CancellationToken,
         ctx: &RequestContext,
         protocol_version: PagestreamProtocolVersion,
@@ -902,10 +915,51 @@ impl PageServerHandler {
                 }
 
                 let key = rel_block_to_key(req.rel, req.blkno);
-                let shard = match timeline_handles
+
+                let sampled = match tracing_config {
+                    Some(conf) => {
+                        let ratio = &conf.sampling_ratio;
+
+                        if ratio.numerator == 0 {
+                            false
+                        } else {
+                            rand::thread_rng().gen_range(0..ratio.denominator) < ratio.numerator
+                        }
+                    }
+                    None => false,
+                };
+
+                let ctx = if sampled {
+                    RequestContextBuilder::from(ctx)
+                        .root_perf_span(|| {
+                            info_span!(
+                            target: PERF_TRACE_TARGET,
+                            "GET_PAGE",
+                            tenant_id = %tenant_id,
+                            shard_id = field::Empty,
+                            timeline_id = %timeline_id,
+                            lsn = %req.hdr.request_lsn,
+                            request_id = %req.hdr.reqid,
+                            key = %key,
+                            )
+                        })
+                        .attached_child()
+                } else {
+                    ctx.attached_child()
+                };
+
+                let res = timeline_handles
                     .get(tenant_id, timeline_id, ShardSelector::Page(key))
-                    .await
-                {
+                    .maybe_perf_instrument(&ctx, |current_perf_span| {
+                        info_span!(
+                            target: PERF_TRACE_TARGET,
+                            parent: current_perf_span,
+                            "SHARD_SELECTION",
+                        )
+                    })
+                    .await;
+
+                let shard = match res {
                     Ok(tl) => tl,
                     Err(e) => {
                         let span = mkspan!(before shard routing);
@@ -932,26 +986,60 @@ impl PageServerHandler {
                         }
                     }
                 };
+
+                // This ctx travels as part of the BatchedFeMessage through
+                // batching into the request handler.
+                // The request handler needs to do some per-request work
+                // (relsize check) before dispatching the batch as a single
+                // get_vectored call to the Timeline.
+                // This ctx will be used for the reslize check, whereas the
+                // get_vectored call will be a different ctx with separate
+                // perf span.
+                let ctx = ctx.with_scope_page_service_pagestream(&shard);
+
+                // Similar game for this `span`: we funnel it through so that
+                // request handler log messages contain the request-specific fields.
                 let span = mkspan!(shard.tenant_shard_id.shard_slug());
 
+                // Enrich the perf span with shard_id now that shard routing is done.
+                ctx.perf_span_record(
+                    "shard_id",
+                    tracing::field::display(shard.get_shard_identity().shard_slug()),
+                );
+
                 let timer = record_op_start_and_throttle(
                     &shard,
                     metrics::SmgrQueryType::GetPageAtLsn,
                     received_at,
                 )
+                .maybe_perf_instrument(&ctx, |current_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: current_perf_span,
+                        "THROTTLE",
+                    )
+                })
                 .await?;
 
                 // We're holding the Handle
-                let effective_request_lsn = match Self::wait_or_get_last_lsn(
+                // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
+                let res = Self::wait_or_get_last_lsn(
                     &shard,
                     req.hdr.request_lsn,
                     req.hdr.not_modified_since,
                     &shard.get_applied_gc_cutoff_lsn(),
-                    ctx,
+                    &ctx,
                 )
-                // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
-                .await
-                {
+                .maybe_perf_instrument(&ctx, |current_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: current_perf_span,
+                        "WAIT_LSN",
+                    )
+                })
+                .await;
+
+                let effective_request_lsn = match res {
                     Ok(lsn) => lsn,
                     Err(e) => {
                         return respond_error!(span, e);
@@ -961,7 +1049,7 @@ impl PageServerHandler {
                     span,
                     shard: shard.downgrade(),
                     effective_request_lsn,
-                    pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
+                    pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, ctx }],
                 }
             }
             #[cfg(feature = "testing")]
@@ -1514,12 +1602,15 @@ impl PageServerHandler {
         IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
     {
         let cancel = self.cancel.clone();
+        let tracing_config = self.conf.tracing.clone();
+
         let err = loop {
             let msg = Self::pagestream_read_message(
                 &mut pgb_reader,
                 tenant_id,
                 timeline_id,
                 &mut timeline_handles,
+                tracing_config.as_ref(),
                 &cancel,
                 ctx,
                 protocol_version,
@@ -1653,6 +1744,8 @@ impl PageServerHandler {
         // Batcher
         //
 
+        let tracing_config = self.conf.tracing.clone();
+
         let cancel_batcher = self.cancel.child_token();
         let (mut batch_tx, mut batch_rx) = spsc_fold::channel();
         let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| {
@@ -1666,6 +1759,7 @@ impl PageServerHandler {
                         tenant_id,
                         timeline_id,
                         &mut timeline_handles,
+                        tracing_config.as_ref(),
                         &cancel_batcher,
                         &ctx,
                         protocol_version,
@@ -2004,7 +2098,9 @@ impl PageServerHandler {
 
         let results = timeline
             .get_rel_page_at_lsn_batched(
-                requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
+                requests
+                    .iter()
+                    .map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())),
                 effective_lsn,
                 io_concurrency,
                 ctx,
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 4685f9383b..e3e06ab91a 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,6 +9,7 @@
 use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
 use std::ops::{ControlFlow, Range};
 
+use crate::PERF_TRACE_TARGET;
 use anyhow::{Context, ensure};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
@@ -31,7 +32,7 @@ use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, info, trace, warn};
+use tracing::{debug, info, info_span, trace, warn};
 use utils::bin_ser::{BeSer, DeserializeError};
 use utils::lsn::Lsn;
 use utils::pausable_failpoint;
@@ -39,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
 
 use super::tenant::{PageReconstructError, Timeline};
 use crate::aux_file;
-use crate::context::RequestContext;
+use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::metrics::{
     RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
@@ -209,7 +210,9 @@ impl Timeline {
                 let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
                 let res = self
                     .get_rel_page_at_lsn_batched(
-                        pages.iter().map(|(tag, blknum)| (tag, blknum)),
+                        pages
+                            .iter()
+                            .map(|(tag, blknum)| (tag, blknum, ctx.attached_child())),
                         effective_lsn,
                         io_concurrency.clone(),
                         ctx,
@@ -248,7 +251,7 @@ impl Timeline {
     /// The ordering of the returned vec corresponds to the ordering of `pages`.
     pub(crate) async fn get_rel_page_at_lsn_batched(
         &self,
-        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber)>,
+        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, RequestContext)>,
         effective_lsn: Lsn,
         io_concurrency: IoConcurrency,
         ctx: &RequestContext,
@@ -262,8 +265,11 @@ impl Timeline {
         let mut result = Vec::with_capacity(pages.len());
         let result_slots = result.spare_capacity_mut();
 
-        let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[usize; 1]>> = BTreeMap::default();
-        for (response_slot_idx, (tag, blknum)) in pages.enumerate() {
+        let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
+            BTreeMap::default();
+
+        let mut perf_instrument = false;
+        for (response_slot_idx, (tag, blknum, ctx)) in pages.enumerate() {
             if tag.relnode == 0 {
                 result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
                     RelationError::InvalidRelnode.into(),
@@ -274,7 +280,16 @@ impl Timeline {
             }
 
             let nblocks = match self
-                .get_rel_size(*tag, Version::Lsn(effective_lsn), ctx)
+                .get_rel_size(*tag, Version::Lsn(effective_lsn), &ctx)
+                .maybe_perf_instrument(&ctx, |crnt_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: crnt_perf_span,
+                        "GET_REL_SIZE",
+                        reltag=%tag,
+                        lsn=%effective_lsn,
+                    )
+                })
                 .await
             {
                 Ok(nblocks) => nblocks,
@@ -297,8 +312,12 @@ impl Timeline {
 
             let key = rel_block_to_key(*tag, *blknum);
 
+            if ctx.has_perf_span() {
+                perf_instrument = true;
+            }
+
             let key_slots = keys_slots.entry(key).or_default();
-            key_slots.push(response_slot_idx);
+            key_slots.push((response_slot_idx, ctx));
         }
 
         let keyspace = {
@@ -314,16 +333,34 @@ impl Timeline {
             acc.to_keyspace()
         };
 
-        match self
-            .get_vectored(keyspace, effective_lsn, io_concurrency, ctx)
-            .await
-        {
+        let ctx = match perf_instrument {
+            true => RequestContextBuilder::from(ctx)
+                .root_perf_span(|| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        "GET_VECTORED",
+                        tenant_id = %self.tenant_shard_id.tenant_id,
+                        timeline_id = %self.timeline_id,
+                        lsn = %effective_lsn,
+                        shard = %self.tenant_shard_id.shard_slug(),
+                    )
+                })
+                .attached_child(),
+            false => ctx.attached_child(),
+        };
+
+        let res = self
+            .get_vectored(keyspace, effective_lsn, io_concurrency, &ctx)
+            .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone())
+            .await;
+
+        match res {
             Ok(results) => {
                 for (key, res) in results {
                     let mut key_slots = keys_slots.remove(&key).unwrap().into_iter();
-                    let first_slot = key_slots.next().unwrap();
+                    let (first_slot, first_req_ctx) = key_slots.next().unwrap();
 
-                    for slot in key_slots {
+                    for (slot, req_ctx) in key_slots {
                         let clone = match &res {
                             Ok(buf) => Ok(buf.clone()),
                             Err(err) => Err(match err {
@@ -341,17 +378,22 @@ impl Timeline {
                         };
 
                         result_slots[slot].write(clone);
+                        // There is no standardized way to express that the batched span followed from N request spans.
+                        // So, abuse the system and mark the request contexts as follows_from the batch span, so we get
+                        // some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for.
+                        req_ctx.perf_follows_from(&ctx);
                         slots_filled += 1;
                     }
 
                     result_slots[first_slot].write(res);
+                    first_req_ctx.perf_follows_from(&ctx);
                     slots_filled += 1;
                 }
             }
             Err(err) => {
                 // this cannot really happen because get_vectored only errors globally on invalid LSN or too large batch size
                 // (We enforce the max batch size outside of this function, in the code that constructs the batch request.)
-                for slot in keys_slots.values().flatten() {
+                for (slot, req_ctx) in keys_slots.values().flatten() {
                     // this whole `match` is a lot like `From<GetVectoredError> for PageReconstructError`
                     // but without taking ownership of the GetVectoredError
                     let err = match &err {
@@ -383,6 +425,7 @@ impl Timeline {
                         }
                     };
 
+                    req_ctx.perf_follows_from(&ctx);
                     result_slots[*slot].write(err);
                 }
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 0384fcc39f..441597d77f 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4205,9 +4205,9 @@ impl Tenant {
             self.cancel.child_token(),
         );
 
-        let timeline_ctx = RequestContextBuilder::extend(ctx)
+        let timeline_ctx = RequestContextBuilder::from(ctx)
             .scope(context::Scope::new_timeline(&timeline))
-            .build();
+            .detached_child();
 
         Ok((timeline, timeline_ctx))
     }
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index ece163b24a..2ea0c1b979 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -13,13 +13,13 @@ pub mod merge_iterator;
 use std::cmp::Ordering;
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
-use std::future::Future;
 use std::ops::Range;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::sync::atomic::AtomicUsize;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
+use crate::PERF_TRACE_TARGET;
 pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter};
 use bytes::Bytes;
 pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
@@ -34,7 +34,7 @@ use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::value::Value;
-use tracing::{Instrument, trace};
+use tracing::{Instrument, info_span, trace};
 use utils::lsn::Lsn;
 use utils::sync::gate::GateGuard;
 
@@ -43,7 +43,9 @@ use super::PageReconstructError;
 use super::layer_map::InMemoryLayerDesc;
 use super::timeline::{GetVectoredError, ReadPath};
 use crate::config::PageServerConf;
-use crate::context::{AccessStatsBehavior, RequestContext};
+use crate::context::{
+    AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
+};
 
 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
@@ -874,13 +876,37 @@ impl ReadableLayer {
     ) -> Result<(), GetVectoredError> {
         match self {
             ReadableLayer::PersistentLayer(layer) => {
+                let ctx = RequestContextBuilder::from(ctx)
+                    .perf_span(|crnt_perf_span| {
+                        info_span!(
+                            target: PERF_TRACE_TARGET,
+                            parent: crnt_perf_span,
+                            "PLAN_LAYER",
+                            layer = %layer
+                        )
+                    })
+                    .attached_child();
+
                 layer
-                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, &ctx)
+                    .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                     .await
             }
             ReadableLayer::InMemoryLayer(layer) => {
+                let ctx = RequestContextBuilder::from(ctx)
+                    .perf_span(|crnt_perf_span| {
+                        info_span!(
+                            target: PERF_TRACE_TARGET,
+                            parent: crnt_perf_span,
+                            "PLAN_LAYER",
+                            layer = %layer
+                        )
+                    })
+                    .attached_child();
+
                 layer
-                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, &ctx)
+                    .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                     .await
             }
         }
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 62adae1680..05b0bc1a5c 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -896,9 +896,9 @@ impl DeltaLayerInner {
     where
         Reader: BlockReader + Clone,
     {
-        let ctx = RequestContextBuilder::extend(ctx)
+        let ctx = RequestContextBuilder::from(ctx)
             .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
-            .build();
+            .attached_child();
 
         for range in keyspace.ranges.iter() {
             let mut range_end_handled = false;
@@ -1105,9 +1105,9 @@ impl DeltaLayerInner {
                     all_keys.push(entry);
                     true
                 },
-                &RequestContextBuilder::extend(ctx)
+                &RequestContextBuilder::from(ctx)
                     .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
-                    .build(),
+                    .attached_child(),
             )
             .await?;
         if let Some(last) = all_keys.last_mut() {
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index b211eb5416..3243b73942 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -481,9 +481,9 @@ impl ImageLayerInner {
         let tree_reader =
             DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
 
-        let ctx = RequestContextBuilder::extend(ctx)
+        let ctx = RequestContextBuilder::from(ctx)
             .page_content_kind(PageContentKind::ImageLayerBtreeNode)
-            .build();
+            .attached_child();
 
         for range in keyspace.ranges.iter() {
             let mut range_end_handled = false;
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index bb4ae38ad1..388ed3201c 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -421,9 +421,9 @@ impl InMemoryLayer {
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
-        let ctx = RequestContextBuilder::extend(ctx)
+        let ctx = RequestContextBuilder::from(ctx)
             .page_content_kind(PageContentKind::InMemoryLayer)
-            .build();
+            .attached_child();
 
         let inner = self.inner.read().await;
 
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 247092bf45..39665d2cc2 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -3,12 +3,13 @@ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, Weak};
 use std::time::{Duration, SystemTime};
 
+use crate::PERF_TRACE_TARGET;
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::HistoricLayerInfo;
 use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
-use tracing::Instrument;
+use tracing::{Instrument, info_span};
 use utils::generation::Generation;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -18,7 +19,7 @@ use super::delta_layer::{self};
 use super::image_layer::{self};
 use super::{
     AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
-    LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
+    LayerVisibilityHint, PerfInstrumentFutureExt, PersistentLayerDesc, ValuesReconstructState,
 };
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
@@ -324,16 +325,29 @@ impl Layer {
         reconstruct_data: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
-        let downloaded =
+        let downloaded = {
+            let ctx = RequestContextBuilder::from(ctx)
+                .perf_span(|crnt_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: crnt_perf_span,
+                        "GET_LAYER",
+                    )
+                })
+                .attached_child();
+
             self.0
-                .get_or_maybe_download(true, ctx)
+                .get_or_maybe_download(true, &ctx)
+                .maybe_perf_instrument(&ctx, |crnt_perf_context| crnt_perf_context.clone())
                 .await
                 .map_err(|err| match err {
                     DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
                         GetVectoredError::Cancelled
                     }
                     other => GetVectoredError::Other(anyhow::anyhow!(other)),
-                })?;
+                })?
+        };
+
         let this = ResidentLayer {
             downloaded: downloaded.clone(),
             owner: self.clone(),
@@ -341,9 +355,20 @@ impl Layer {
 
         self.record_access(ctx);
 
+        let ctx = RequestContextBuilder::from(ctx)
+            .perf_span(|crnt_perf_span| {
+                info_span!(
+                    target: PERF_TRACE_TARGET,
+                    parent: crnt_perf_span,
+                    "VISIT_LAYER",
+                )
+            })
+            .attached_child();
+
         downloaded
-            .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx)
+            .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, &ctx)
             .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
+            .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
             .await
             .map_err(|err| match err {
                 GetVectoredError::Other(err) => GetVectoredError::Other(
@@ -1045,15 +1070,34 @@ impl LayerInner {
             return Err(DownloadError::DownloadRequired);
         }
 
-        let download_ctx = ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download);
+        let ctx = if ctx.has_perf_span() {
+            let dl_ctx = RequestContextBuilder::from(ctx)
+                .task_kind(TaskKind::LayerDownload)
+                .download_behavior(DownloadBehavior::Download)
+                .root_perf_span(|| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        "DOWNLOAD_LAYER",
+                        layer = %self,
+                        reason = %reason
+                    )
+                })
+                .detached_child();
+            ctx.perf_follows_from(&dl_ctx);
+            dl_ctx
+        } else {
+            ctx.attached_child()
+        };
 
         async move {
             tracing::info!(%reason, "downloading on-demand");
 
             let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
             let res = self
-                .download_init_and_wait(timeline, permit, download_ctx)
+                .download_init_and_wait(timeline, permit, ctx.attached_child())
+                .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                 .await?;
+
             scopeguard::ScopeGuard::into_inner(init_cancelled);
             Ok(res)
         }
@@ -1720,9 +1764,9 @@ impl DownloadedLayer {
             );
 
             let res = if owner.desc.is_delta {
-                let ctx = RequestContextBuilder::extend(ctx)
+                let ctx = RequestContextBuilder::from(ctx)
                     .page_content_kind(crate::context::PageContentKind::DeltaLayerSummary)
-                    .build();
+                    .attached_child();
                 let summary = Some(delta_layer::Summary::expected(
                     owner.desc.tenant_shard_id.tenant_id,
                     owner.desc.timeline_id,
@@ -1738,9 +1782,9 @@ impl DownloadedLayer {
                 .await
                 .map(LayerKind::Delta)
             } else {
-                let ctx = RequestContextBuilder::extend(ctx)
+                let ctx = RequestContextBuilder::from(ctx)
                     .page_content_kind(crate::context::PageContentKind::ImageLayerSummary)
-                    .build();
+                    .attached_child();
                 let lsn = owner.desc.image_layer_lsn();
                 let summary = Some(image_layer::Summary::expected(
                     owner.desc.tenant_shard_id.tenant_id,
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index 7086429bfe..b6fd4678d6 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -119,6 +119,10 @@ async fn smoke_test() {
     let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
     assert!(matches!(e, EvictionError::NotFound));
 
+    let dl_ctx = RequestContextBuilder::from(ctx)
+        .download_behavior(DownloadBehavior::Download)
+        .attached_child();
+
     // on accesses when the layer is evicted, it will automatically be downloaded.
     let img_after = {
         let mut data = ValuesReconstructState::new(io_concurrency.clone());
@@ -127,7 +131,7 @@ async fn smoke_test() {
                 controlfile_keyspace.clone(),
                 Lsn(0x10)..Lsn(0x11),
                 &mut data,
-                ctx,
+                &dl_ctx,
             )
             .instrument(download_span.clone())
             .await
@@ -177,7 +181,7 @@ async fn smoke_test() {
 
     // plain downloading is rarely needed
     layer
-        .download_and_keep_resident(ctx)
+        .download_and_keep_resident(&dl_ctx)
         .instrument(download_span)
         .await
         .unwrap();
@@ -645,9 +649,10 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
     let ctx = ctx.with_scope_timeline(&timeline);
 
     // This test does downloads
-    let ctx = RequestContextBuilder::extend(&ctx)
+    let ctx = RequestContextBuilder::from(&ctx)
         .download_behavior(DownloadBehavior::Download)
-        .build();
+        .attached_child();
+
     let layer = {
         let mut layers = {
             let layers = timeline.layers.read().await;
@@ -730,9 +735,9 @@ async fn evict_and_wait_does_not_wait_for_download() {
     let ctx = ctx.with_scope_timeline(&timeline);
 
     // This test does downloads
-    let ctx = RequestContextBuilder::extend(&ctx)
+    let ctx = RequestContextBuilder::from(&ctx)
         .download_behavior(DownloadBehavior::Download)
-        .build();
+        .attached_child();
 
     let layer = {
         let mut layers = {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 80a23bfa94..74e97653d2 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -23,6 +23,7 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering};
 use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 
+use crate::PERF_TRACE_TARGET;
 use anyhow::{Context, Result, anyhow, bail, ensure};
 use arc_swap::{ArcSwap, ArcSwapOption};
 use bytes::Bytes;
@@ -96,7 +97,9 @@ use super::{
 };
 use crate::aux_file::AuxFileSizeEstimator;
 use crate::config::PageServerConf;
-use crate::context::{DownloadBehavior, RequestContext};
+use crate::context::{
+    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
+};
 use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32};
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::l0_flush::{self, L0FlushGlobalState};
@@ -1289,9 +1292,22 @@ impl Timeline {
         };
         reconstruct_state.read_path = read_path;
 
-        let traversal_res: Result<(), _> = self
-            .get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
-            .await;
+        let traversal_res: Result<(), _> = {
+            let ctx = RequestContextBuilder::from(ctx)
+                .perf_span(|crnt_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: crnt_perf_span,
+                        "PLAN_IO",
+                    )
+                })
+                .attached_child();
+
+            self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, &ctx)
+                .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
+                .await
+        };
+
         if let Err(err) = traversal_res {
             // Wait for all the spawned IOs to complete.
             // See comments on `spawn_io` inside `storage_layer` for more details.
@@ -1305,14 +1321,46 @@ impl Timeline {
 
         let layers_visited = reconstruct_state.get_layers_visited();
 
+        let ctx = RequestContextBuilder::from(ctx)
+            .perf_span(|crnt_perf_span| {
+                info_span!(
+                    target: PERF_TRACE_TARGET,
+                    parent: crnt_perf_span,
+                    "RECONSTRUCT",
+                )
+            })
+            .attached_child();
+
         let futs = FuturesUnordered::new();
         for (key, state) in std::mem::take(&mut reconstruct_state.keys) {
             futs.push({
                 let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
+                let ctx = RequestContextBuilder::from(&ctx)
+                    .perf_span(|crnt_perf_span| {
+                        info_span!(
+                            target: PERF_TRACE_TARGET,
+                            parent: crnt_perf_span,
+                            "RECONSTRUCT_KEY",
+                            key = %key,
+                        )
+                    })
+                    .attached_child();
+
                 async move {
                     assert_eq!(state.situation, ValueReconstructSituation::Complete);
 
-                    let converted = match state.collect_pending_ios().await {
+                    let res = state
+                        .collect_pending_ios()
+                        .maybe_perf_instrument(&ctx, |crnt_perf_span| {
+                            info_span!(
+                                target: PERF_TRACE_TARGET,
+                                parent: crnt_perf_span,
+                                "WAIT_FOR_IO_COMPLETIONS",
+                            )
+                        })
+                        .await;
+
+                    let converted = match res {
                         Ok(ok) => ok,
                         Err(err) => {
                             return (key, Err(err));
@@ -1329,16 +1377,27 @@ impl Timeline {
                         "{converted:?}"
                     );
 
-                    (
-                        key,
-                        walredo_self.reconstruct_value(key, lsn, converted).await,
-                    )
+                    let walredo_deltas = converted.num_deltas();
+                    let walredo_res = walredo_self
+                        .reconstruct_value(key, lsn, converted)
+                        .maybe_perf_instrument(&ctx, |crnt_perf_span| {
+                            info_span!(
+                                target: PERF_TRACE_TARGET,
+                                parent: crnt_perf_span,
+                                "WALREDO",
+                                deltas = %walredo_deltas,
+                            )
+                        })
+                        .await;
+
+                    (key, walredo_res)
                 }
             });
         }
 
         let results = futs
             .collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
+            .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
             .await;
 
         // For aux file keys (v1 or v2) the vectored read path does not return an error
@@ -3875,15 +3934,30 @@ impl Timeline {
             let TimelineVisitOutcome {
                 completed_keyspace: completed,
                 image_covered_keyspace,
-            } = Self::get_vectored_reconstruct_data_timeline(
-                timeline,
-                keyspace.clone(),
-                cont_lsn,
-                reconstruct_state,
-                &self.cancel,
-                ctx,
-            )
-            .await?;
+            } = {
+                let ctx = RequestContextBuilder::from(ctx)
+                    .perf_span(|crnt_perf_span| {
+                        info_span!(
+                            target: PERF_TRACE_TARGET,
+                            parent: crnt_perf_span,
+                            "PLAN_IO_TIMELINE",
+                            timeline = %timeline.timeline_id,
+                            lsn = %cont_lsn,
+                        )
+                    })
+                    .attached_child();
+
+                Self::get_vectored_reconstruct_data_timeline(
+                    timeline,
+                    keyspace.clone(),
+                    cont_lsn,
+                    reconstruct_state,
+                    &self.cancel,
+                    &ctx,
+                )
+                .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
+                .await?
+            };
 
             keyspace.remove_overlapping_with(&completed);
 
@@ -3927,8 +4001,24 @@ impl Timeline {
 
             // Take the min to avoid reconstructing a page with data newer than request Lsn.
             cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
+
+            let ctx = RequestContextBuilder::from(ctx)
+                .perf_span(|crnt_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: crnt_perf_span,
+                        "GET_ANCESTOR",
+                        timeline = %timeline.timeline_id,
+                        lsn = %cont_lsn,
+                        ancestor = %ancestor_timeline.timeline_id,
+                        ancestor_lsn = %timeline.ancestor_lsn
+                    )
+                })
+                .attached_child();
+
             timeline_owned = timeline
-                .get_ready_ancestor_timeline(ancestor_timeline, ctx)
+                .get_ready_ancestor_timeline(ancestor_timeline, &ctx)
+                .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                 .await?;
             timeline = &*timeline_owned;
         };
@@ -7259,9 +7349,9 @@ mod tests {
 
             eprintln!("Downloading {layer} and re-generating heatmap");
 
-            let ctx = &RequestContextBuilder::extend(ctx)
+            let ctx = &RequestContextBuilder::from(ctx)
                 .download_behavior(crate::context::DownloadBehavior::Download)
-                .build();
+                .attached_child();
 
             let _resident = layer
                 .download_and_keep_resident(ctx)
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 9693d232ee..2ebb1d50cd 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1029,9 +1029,9 @@ impl Timeline {
         {
             Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
                 // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
-                let image_ctx = RequestContextBuilder::extend(ctx)
+                let image_ctx = RequestContextBuilder::from(ctx)
                     .access_stats_behavior(AccessStatsBehavior::Skip)
-                    .build();
+                    .attached_child();
 
                 let mut partitioning = dense_partitioning;
                 partitioning
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 7931a0a7d0..11ff2921b9 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -376,6 +376,28 @@ class PageserverWalReceiverProtocol(StrEnum):
             raise ValueError(f"Unknown protocol type: {proto}")
 
 
+@dataclass
+class PageserverTracingConfig:
+    sampling_ratio: tuple[int, int]
+    endpoint: str
+    protocol: str
+    timeout: str
+
+    def to_config_key_value(self) -> tuple[str, dict[str, Any]]:
+        value = {
+            "sampling_ratio": {
+                "numerator": self.sampling_ratio[0],
+                "denominator": self.sampling_ratio[1],
+            },
+            "export_config": {
+                "endpoint": self.endpoint,
+                "protocol": self.protocol,
+                "timeout": self.timeout,
+            },
+        }
+        return ("tracing", value)
+
+
 class NeonEnvBuilder:
     """
     Builder object to create a Neon runtime environment
@@ -425,6 +447,7 @@ class NeonEnvBuilder:
         pageserver_virtual_file_io_mode: str | None = None,
         pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None,
         pageserver_get_vectored_concurrent_io: str | None = None,
+        pageserver_tracing_config: PageserverTracingConfig | None = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -478,6 +501,8 @@ class NeonEnvBuilder:
             pageserver_get_vectored_concurrent_io
         )
 
+        self.pageserver_tracing_config = pageserver_tracing_config
+
         self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = (
             pageserver_default_tenant_config_compaction_algorithm
         )
@@ -1138,6 +1163,7 @@ class NeonEnv:
         self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode
         self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol
         self.pageserver_get_vectored_concurrent_io = config.pageserver_get_vectored_concurrent_io
+        self.pageserver_tracing_config = config.pageserver_tracing_config
 
         # Create the neon_local's `NeonLocalInitConf`
         cfg: dict[str, Any] = {
@@ -1262,6 +1288,14 @@ class NeonEnv:
                 if key not in ps_cfg:
                     ps_cfg[key] = value
 
+            if self.pageserver_tracing_config is not None:
+                key, value = self.pageserver_tracing_config.to_config_key_value()
+
+                if key not in ps_cfg:
+                    ps_cfg[key] = value
+
+                ps_cfg[key] = value
+
             # Create a corresponding NeonPageserver object
             self.pageservers.append(
                 NeonPageserver(self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"])
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 27ae5507b1..24c856e279 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -110,6 +110,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     ".*delaying layer flush by \\S+ for compaction backpressure.*",
     ".*stalling layer flushes for compaction backpressure.*",
     ".*layer roll waiting for flush due to compaction backpressure.*",
+    ".*BatchSpanProcessor.*",
 )
 
 
diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
index 6cbbad4bd9..8874fe663b 100644
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -10,6 +10,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
+    PageserverTracingConfig,
     PgBin,
     wait_for_last_flush_lsn,
 )
@@ -111,6 +112,15 @@ def setup_and_run_pagebench_benchmark(
     neon_env_builder.pageserver_config_override = (
         f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}"
     )
+
+    tracing_config = PageserverTracingConfig(
+        sampling_ratio=(0, 1000),
+        endpoint="http://localhost:4318/v1/traces",
+        protocol="http-binary",
+        timeout="10s",
+    )
+    neon_env_builder.pageserver_tracing_config = tracing_config
+    ratio = tracing_config.sampling_ratio[0] / tracing_config.sampling_ratio[1]
     params.update(
         {
             "pageserver_config_override.page_cache_size": (
@@ -118,6 +128,7 @@ def setup_and_run_pagebench_benchmark(
                 {"unit": "byte"},
             ),
             "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
+            "pageserver_config_override.sampling_ratio": (ratio, {"unit": ""}),
         }
     )
 

From 375df517a01a3903aa8a8a4f67afd94fef4b7275 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 3 Apr 2025 20:43:16 +0200
Subject: [PATCH 041/140] storcon: return 503 instead of 500 if there is no new
 leader yet (#11417)

The leadership transfer protocol between storage controller instances is
as follows, listing the steps for the new pod:

The new pod does these things:

1. new pod comes online. looks in database if there is a leader. if
there is, it asks that leader to step down.
2. the new pod does some operations to come online. they should be
fairly short timed, but it's not zero.
3. the new pod updates the leader entry in the database.

The old pod, once it gets the step down request, changes its internal
state to stepped down. It treats all incoming requests specially now:
instead of processing, it wants to forward them to the new pod. The
forwarding however only works if the new pod is online already, so
before forwarding it reads from the db for a leader (also to get the
address to forward to in the first place).

If the new pod is not online yet, i.e. during step 2 above, the old pod
might legitimately land in the branch which this patch is editing: the
leader in the database is a stepped down instance.

Before, we've returned a `ApiError::InternalServerError`, but that would
print the full backtrace plus an error log. With this patch, we cut down
on the noise, as it's an expected situation to have a short storcon
downtime while we are cutting over to the new instance. A
`ResourceUnavailable` error is not just more fitting, it also doesn't
print a backtrace once encountered, and only prints on the INFO log
level (see `api_error_handler` function).

Fixes #11320
cc #8954
---
 storage_controller/src/http.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 4035a15316..f06e83d720 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1733,9 +1733,9 @@ async fn maybe_forward(req: Request<Body>) -> ForwardOutcome {
         };
 
         if *self_addr == leader_addr {
-            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
-                "Leader is stepped down instance"
-            ))));
+            return ForwardOutcome::Forwarded(Err(ApiError::ResourceUnavailable(
+                "Leader is stepped down instance".into(),
+            )));
         }
     }
 

From 381f42519ed59a43b4fed3ea74eeb72e8eeaf39f Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 3 Apr 2025 15:40:44 -0400
Subject: [PATCH 042/140] fix(pageserver): skip gc-compaction over sparse
 keyspaces (#11404)

## Problem

Part of https://github.com/neondatabase/neon/issues/11318

It's not 100% safe for now to run gc-compaction over the sparse
keyspace. It might cause deleted file to re-appear if a specific
sequence of operations are done as in the issue, which in reality
doesn't happen due to how we split delta/image layers based on the key
range.

A long-term fix would be either having a separate gc-compaction code
path for metadata keys (as how we have a different code path for
metadata image layer generation), or let the compaction process aware of
the information of "there's an image layer that doesn't contain a key"
so that we can skip the keys.

## Summary of changes

* gc-compaction auto trigger only triggers compaction over the normal
data range.
* do not hold gc_block_guard across the full compaction job, only hold
it during each subcompaction.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/timeline/compaction.rs | 57 +++++++++++---------
 1 file changed, 32 insertions(+), 25 deletions(-)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 2ebb1d50cd..5aaef8db0c 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -26,7 +26,7 @@ use once_cell::sync::Lazy;
 use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE;
 use pageserver_api::key::{KEY_SIZE, Key};
 use pageserver_api::keyspace::{KeySpace, ShardedRange};
-use pageserver_api::models::CompactInfoResponse;
+use pageserver_api::models::{CompactInfoResponse, CompactKeyRange};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
 use pageserver_api::value::Value;
@@ -61,7 +61,7 @@ use crate::tenant::timeline::{
     DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer,
     ResidentLayer, drop_rlock,
 };
-use crate::tenant::{DeltaLayer, MaybeOffloaded, gc_block};
+use crate::tenant::{DeltaLayer, MaybeOffloaded};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 
 /// Maximum number of deltas before generating an image layer in bottom-most compaction.
@@ -123,7 +123,6 @@ impl GcCompactionQueueItem {
 #[derive(Default)]
 struct GcCompactionGuardItems {
     notify: Option<tokio::sync::oneshot::Sender<()>>,
-    gc_guard: Option<gc_block::Guard>,
     permit: Option<OwnedSemaphorePermit>,
 }
 
@@ -319,7 +318,12 @@ impl GcCompactionQueue {
                         flags
                     },
                     sub_compaction: true,
-                    compact_key_range: None,
+                    // Only auto-trigger gc-compaction over the data keyspace due to concerns in
+                    // https://github.com/neondatabase/neon/issues/11318.
+                    compact_key_range: Some(CompactKeyRange {
+                        start: Key::MIN,
+                        end: Key::metadata_key_range().start,
+                    }),
                     compact_lsn_range: None,
                     sub_compaction_max_job_size_mb: None,
                 },
@@ -343,7 +347,6 @@ impl GcCompactionQueue {
         info!("compaction job id={} finished", id);
         let mut guard = self.inner.lock().unwrap();
         if let Some(items) = guard.guards.remove(&id) {
-            drop(items.gc_guard);
             if let Some(tx) = items.notify {
                 let _ = tx.send(());
             }
@@ -360,7 +363,6 @@ impl GcCompactionQueue {
         id: GcCompactionJobId,
         options: CompactOptions,
         timeline: &Arc<Timeline>,
-        gc_block: &GcBlock,
         auto: bool,
     ) -> Result<(), CompactionError> {
         info!(
@@ -384,16 +386,6 @@ impl GcCompactionQueue {
             info!("no jobs to run, skipping scheduled compaction task");
             self.notify_and_unblock(id);
         } else {
-            let gc_guard = match gc_block.start().await {
-                Ok(guard) => guard,
-                Err(e) => {
-                    return Err(CompactionError::Other(anyhow!(
-                        "cannot run gc-compaction because gc is blocked: {}",
-                        e
-                    )));
-                }
-            };
-
             let jobs_len = jobs.len();
             let mut pending_tasks = Vec::new();
             // gc-compaction might pick more layers or fewer layers to compact. The L2 LSN does not need to be accurate.
@@ -428,7 +420,6 @@ impl GcCompactionQueue {
 
             {
                 let mut guard = self.inner.lock().unwrap();
-                guard.guards.entry(id).or_default().gc_guard = Some(gc_guard);
                 let mut tasks = Vec::new();
                 for task in pending_tasks {
                     let id = guard.next_id();
@@ -518,29 +509,27 @@ impl GcCompactionQueue {
                     info!(
                         "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"
                     );
-                    self.handle_sub_compaction(id, options, timeline, gc_block, auto)
+                    self.handle_sub_compaction(id, options, timeline, auto)
                         .await?;
                 } else {
                     // Auto compaction always enables sub-compaction so we don't need to handle update_l2_lsn
                     // in this branch.
-                    let gc_guard = match gc_block.start().await {
+                    let _gc_guard = match gc_block.start().await {
                         Ok(guard) => guard,
                         Err(e) => {
+                            self.notify_and_unblock(id);
+                            self.clear_running_job();
                             return Err(CompactionError::Other(anyhow!(
                                 "cannot run gc-compaction because gc is blocked: {}",
                                 e
                             )));
                         }
                     };
-                    {
-                        let mut guard = self.inner.lock().unwrap();
-                        guard.guards.entry(id).or_default().gc_guard = Some(gc_guard);
-                    }
                     let res = timeline.compact_with_options(cancel, options, ctx).await;
                     let compaction_result = match res {
                         Ok(res) => res,
                         Err(err) => {
-                            warn!(%err, "failed to run gc-compaction, gc unblocked");
+                            warn!(%err, "failed to run gc-compaction");
                             self.notify_and_unblock(id);
                             self.clear_running_job();
                             return Err(err);
@@ -553,7 +542,25 @@ impl GcCompactionQueue {
             }
             GcCompactionQueueItem::SubCompactionJob(options) => {
                 // TODO: error handling, clear the queue if any task fails?
-                let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?;
+                let _gc_guard = match gc_block.start().await {
+                    Ok(guard) => guard,
+                    Err(e) => {
+                        self.clear_running_job();
+                        return Err(CompactionError::Other(anyhow!(
+                            "cannot run gc-compaction because gc is blocked: {}",
+                            e
+                        )));
+                    }
+                };
+                let res = timeline.compact_with_options(cancel, options, ctx).await;
+                let compaction_result = match res {
+                    Ok(res) => res,
+                    Err(err) => {
+                        warn!(%err, "failed to run gc-compaction subcompaction job");
+                        self.clear_running_job();
+                        return Err(err);
+                    }
+                };
                 if compaction_result == CompactionOutcome::YieldForL0 {
                     // We will permenantly give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running
                     // again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because

From 8ed79ed773d7b525d5a4eff62c794835eb74993a Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 3 Apr 2025 22:42:34 +0100
Subject: [PATCH 043/140] build(deps): bump h2 to 4.2.0 (#11437)

## Problem

We switched `h2` from 4.1.0 to a git commit to fix stubgen (in
https://github.com/neondatabase/neon/pull/10491). `h2` 4.2.0 was
released soon after that, so we can switch back to a pinned version.

Expected no changes, as 4.2.0 is the right next commit after the commit
we currently use:
https://github.com/python-hyper/h2/commit/dacd614fed32f6e69ba1718c5bb2b2cca866af1f%5E

## Summary of changes
- Bump `h2` to 4.2.0
---
 poetry.lock    | 18 +++++++-----------
 pyproject.toml |  2 +-
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 96c65fdf05..08732fd641 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
 
 [[package]]
 name = "aiohappyeyeballs"
@@ -1286,24 +1286,20 @@ files = [
 
 [[package]]
 name = "h2"
-version = "4.1.0"
+version = "4.2.0"
 description = "Pure-Python HTTP/2 protocol implementation"
 optional = false
 python-versions = ">=3.9"
 groups = ["main"]
-files = []
-develop = false
+files = [
+    {file = "h2-4.2.0-py3-none-any.whl", hash = "sha256:479a53ad425bb29af087f3458a61d30780bc818e4ebcf01f0b536ba916462ed0"},
+    {file = "h2-4.2.0.tar.gz", hash = "sha256:c8a52129695e88b1a0578d8d2cc6842bbd79128ac685463b887ee278126ad01f"},
+]
 
 [package.dependencies]
 hpack = ">=4.1,<5"
 hyperframe = ">=6.1,<7"
 
-[package.source]
-type = "git"
-url = "https://github.com/python-hyper/h2"
-reference = "HEAD"
-resolved_reference = "0b98b244b5fd1fe96100ac14905417a3b70a4286"
-
 [[package]]
 name = "hpack"
 version = "4.1.0"
@@ -3844,4 +3840,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.11"
-content-hash = "fb50cb6b291169dce3188560cdb31a14af95647318f8f0f0d718131dbaf1817a"
+content-hash = "7ab1e7b975af34b3271b7c6018fa22a261d3f73c7c0a0403b6b2bb86b5fbd36e"
diff --git a/pyproject.toml b/pyproject.toml
index c5129fac35..c6dfdc223c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,7 +43,7 @@ websockets = "^12.0"
 clickhouse-connect = "^0.7.16"
 kafka-python = "^2.0.2"
 jwcrypto = "^1.5.6"
-h2 = {git = "https://github.com/python-hyper/h2"}
+h2 = "^4.2.0"
 types-jwcrypto = "^1.5.0.20240925"
 pyyaml = "^6.0.2"
 types-pyyaml = "^6.0.12.20240917"

From e581b670f4812e39574f00382afa87dd465929b5 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Thu, 3 Apr 2025 18:00:58 -0500
Subject: [PATCH 044/140] Improve nightly physical replication benchmark
 (#11389)

Log the created project and endpoint IDs and improve typing in the
source code to improve readability.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 .../performance/test_physical_replication.py  | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py
index 6351f03e08..df5419f292 100644
--- a/test_runner/performance/test_physical_replication.py
+++ b/test_runner/performance/test_physical_replication.py
@@ -7,7 +7,6 @@ import traceback
 from typing import TYPE_CHECKING
 
 import psycopg2
-import psycopg2.extras
 import pytest
 from fixtures.benchmark_fixture import MetricReport
 from fixtures.common_types import Lsn
@@ -26,7 +25,11 @@ if TYPE_CHECKING:
 
 
 # Granularity of ~0.5 sec
-def measure_replication_lag(master, replica, timeout_sec=600):
+def measure_replication_lag(
+    master: psycopg2.extensions.cursor,
+    replica: psycopg2.extensions.cursor,
+    timeout_sec: int = 600,
+):
     start = time.time()
     master.execute("SELECT pg_current_wal_flush_lsn()")
     master_lsn = Lsn(master.fetchall()[0][0])
@@ -40,7 +43,7 @@ def measure_replication_lag(master, replica, timeout_sec=600):
     raise TimeoutError(f"Replication sync took more than {timeout_sec} sec")
 
 
-def check_pgbench_still_running(pgbench):
+def check_pgbench_still_running(pgbench: subprocess.Popen[str]):
     rc = pgbench.poll()
     if rc is not None:
         raise RuntimeError(f"Pgbench terminated early with return code {rc}")
@@ -61,6 +64,8 @@ def test_ro_replica_lag(
 
     project = neon_api.create_project(pg_version)
     project_id = project["project"]["id"]
+    log.info("Project ID: {}", project_id)
+    log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"])
     neon_api.wait_for_operation_to_finish(project_id)
     error_occurred = False
     try:
@@ -76,6 +81,7 @@ def test_ro_replica_lag(
             endpoint_type="read_only",
             settings={"pg_settings": {"hot_standby_feedback": "on"}},
         )
+        log.info("Replica endpoint ID: {}", replica["endpoint"]["id"])
         replica_env = master_env.copy()
         replica_env["PGHOST"] = replica["endpoint"]["host"]
         neon_api.wait_for_operation_to_finish(project_id)
@@ -191,6 +197,8 @@ def test_replication_start_stop(
 
     project = neon_api.create_project(pg_version)
     project_id = project["project"]["id"]
+    log.info("Project ID: {}", project_id)
+    log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"])
     neon_api.wait_for_operation_to_finish(project_id)
     try:
         branch_id = project["branch"]["id"]
@@ -200,15 +208,15 @@ def test_replication_start_stop(
         )
 
         replicas = []
-        for _ in range(num_replicas):
-            replicas.append(
-                neon_api.create_endpoint(
-                    project_id,
-                    branch_id,
-                    endpoint_type="read_only",
-                    settings={"pg_settings": {"hot_standby_feedback": "on"}},
-                )
+        for i in range(num_replicas):
+            replica = neon_api.create_endpoint(
+                project_id,
+                branch_id,
+                endpoint_type="read_only",
+                settings={"pg_settings": {"hot_standby_feedback": "on"}},
             )
+            log.info("Replica {} endpoint ID: {}", i + 1, replica["endpoint"]["id"])
+            replicas.append(replica)
             neon_api.wait_for_operation_to_finish(project_id)
 
         replica_connstr = [

From a917952b308a06cbc66b92a8c62c55975fea7c0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 4 Apr 2025 02:17:40 +0200
Subject: [PATCH 045/140] Add test_storcon_create_delete_sk_down and make it
 work (#11400)

Adds a test `test_storcon_create_delete_sk_down` which tests the
reconciler and pending op persistence if faced with a temporary
safekeeper downtime during timeline creation or deletion. This is in
contrast to `test_explicit_timeline_creation_storcon`, which tests the
happy path.

We also do some fixes:

* timeline and tenant deletion http requests didn't expect a body, but
`()` sent one.
* we got the tenant deletion http request's return type wrong: it's
supposed to be a hash map
* we add some logging to improve observability
* We fix `list_pending_ops` which had broken code meant to make it
possible to restrict oneself to a single pageserver. But diesel doesn't
support that sadly, or at least I couldn't figure out a way to make it
work. We don't need that functionality, so remove it.
* We add an info span to the heartbeater futures with the node id, so
that there is no context-free msgs like "Backoff: waiting 1.1 seconds
before processing with the task" in the storcon logs. we could also add
the full base url of the node but don't do it as most other log lines
contain that information already, and if we do duplication it should at
least not be verbose. One can always find out the base url from the node
id.

Successor of #11261
Part of #9011
---
 libs/safekeeper_api/src/models.rs             |  2 +
 safekeeper/client/src/mgmt_api.rs             | 27 +++++-
 safekeeper/src/http/routes.rs                 | 18 ++--
 storage_controller/src/heartbeater.rs         |  3 +
 storage_controller/src/persistence.rs         | 13 +--
 storage_controller/src/safekeeper_client.rs   |  2 +-
 .../src/service/safekeeper_reconciler.rs      | 10 +-
 .../regress/test_storage_controller.py        | 95 +++++++++++++++++++
 8 files changed, 140 insertions(+), 30 deletions(-)

diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index 33ff636a79..20f11edae7 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -227,6 +227,8 @@ pub struct TimelineDeleteResult {
     pub dir_existed: bool,
 }
 
+pub type TenantDeleteResult = std::collections::HashMap<String, TimelineDeleteResult>;
+
 fn lsn_invalid() -> Lsn {
     Lsn::INVALID
 }
diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs
index afef5e792e..5849df0343 100644
--- a/safekeeper/client/src/mgmt_api.rs
+++ b/safekeeper/client/src/mgmt_api.rs
@@ -115,13 +115,17 @@ impl Client {
             "{}/v1/tenant/{}/timeline/{}",
             self.mgmt_api_endpoint, tenant_id, timeline_id
         );
-        let resp = self.request(Method::DELETE, &uri, ()).await?;
+        let resp = self
+            .request_maybe_body(Method::DELETE, &uri, None::<()>)
+            .await?;
         resp.json().await.map_err(Error::ReceiveBody)
     }
 
-    pub async fn delete_tenant(&self, tenant_id: TenantId) -> Result<models::TimelineDeleteResult> {
+    pub async fn delete_tenant(&self, tenant_id: TenantId) -> Result<models::TenantDeleteResult> {
         let uri = format!("{}/v1/tenant/{}", self.mgmt_api_endpoint, tenant_id);
-        let resp = self.request(Method::DELETE, &uri, ()).await?;
+        let resp = self
+            .request_maybe_body(Method::DELETE, &uri, None::<()>)
+            .await?;
         resp.json().await.map_err(Error::ReceiveBody)
     }
 
@@ -197,6 +201,16 @@ impl Client {
         method: Method,
         uri: U,
         body: B,
+    ) -> Result<reqwest::Response> {
+        self.request_maybe_body(method, uri, Some(body)).await
+    }
+
+    /// Send the request and check that the status code is good, with an optional body.
+    async fn request_maybe_body<B: serde::Serialize, U: reqwest::IntoUrl>(
+        &self,
+        method: Method,
+        uri: U,
+        body: Option<B>,
     ) -> Result<reqwest::Response> {
         let res = self.request_noerror(method, uri, body).await?;
         let response = res.error_from_body().await?;
@@ -208,12 +222,15 @@ impl Client {
         &self,
         method: Method,
         uri: U,
-        body: B,
+        body: Option<B>,
     ) -> Result<reqwest::Response> {
         let mut req = self.client.request(method, uri);
         if let Some(value) = &self.authorization_header {
             req = req.header(reqwest::header::AUTHORIZATION, value.get_contents())
         }
-        req.json(&body).send().await.map_err(Error::ReceiveBody)
+        if let Some(body) = body {
+            req = req.json(&body);
+        }
+        req.send().await.map_err(Error::ReceiveBody)
     }
 }
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index b264fe8a1c..8395c88171 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -16,9 +16,9 @@ use http_utils::{RequestExt, RouterBuilder};
 use hyper::{Body, Request, Response, StatusCode};
 use postgres_ffi::WAL_SEGMENT_SIZE;
 use safekeeper_api::models::{
-    AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TermSwitchApiEntry,
-    TimelineCopyRequest, TimelineCreateRequest, TimelineDeleteResult, TimelineStatus,
-    TimelineTermBumpRequest,
+    AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TenantDeleteResult,
+    TermSwitchApiEntry, TimelineCopyRequest, TimelineCreateRequest, TimelineDeleteResult,
+    TimelineStatus, TimelineTermBumpRequest,
 };
 use safekeeper_api::{ServerInfo, membership, models};
 use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId};
@@ -83,13 +83,11 @@ async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Bo
         .delete_all_for_tenant(&tenant_id, action)
         .await
         .map_err(ApiError::InternalServerError)?;
-    json_response(
-        StatusCode::OK,
-        delete_info
-            .iter()
-            .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp))
-            .collect::<HashMap<String, TimelineDeleteResult>>(),
-    )
+    let response_body: TenantDeleteResult = delete_info
+        .iter()
+        .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp))
+        .collect::<HashMap<String, TimelineDeleteResult>>();
+    json_response(StatusCode::OK, response_body)
 }
 
 async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs
index 732c4ea443..fe916aa36a 100644
--- a/storage_controller/src/heartbeater.rs
+++ b/storage_controller/src/heartbeater.rs
@@ -12,6 +12,7 @@ use safekeeper_api::models::SafekeeperUtilization;
 use safekeeper_client::mgmt_api;
 use thiserror::Error;
 use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
 use utils::id::NodeId;
 use utils::logging::SecretString;
 
@@ -227,6 +228,7 @@ impl HeartBeat<Node, PageserverState> for HeartbeaterTask<Node, PageserverState>
 
                     Some((*node_id, status))
                 }
+                .instrument(tracing::info_span!("heartbeat_ps", %node_id))
             });
         }
 
@@ -369,6 +371,7 @@ impl HeartBeat<Safekeeper, SafekeeperState> for HeartbeaterTask<Safekeeper, Safe
 
                     Some((*node_id, status))
                 }
+                .instrument(tracing::info_span!("heartbeat_sk", %node_id))
             });
         }
 
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 99b1a1e887..d25448718f 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -1524,25 +1524,14 @@ impl Persistence {
     /// Load pending operations from db.
     pub(crate) async fn list_pending_ops(
         &self,
-        filter_for_sk: Option<NodeId>,
     ) -> DatabaseResult<Vec<TimelinePendingOpPersistence>> {
         use crate::schema::safekeeper_timeline_pending_ops::dsl;
 
-        const FILTER_VAL_1: i64 = 1;
-        const FILTER_VAL_2: i64 = 2;
-        let filter_opt = filter_for_sk.map(|id| id.0 as i64);
         let timeline_from_db = self
             .with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| {
                 Box::pin(async move {
                     let from_db: Vec<TimelinePendingOpPersistence> =
-                        dsl::safekeeper_timeline_pending_ops
-                            .filter(
-                                dsl::sk_id
-                                    .eq(filter_opt.unwrap_or(FILTER_VAL_1))
-                                    .and(dsl::sk_id.eq(filter_opt.unwrap_or(FILTER_VAL_2))),
-                            )
-                            .load(conn)
-                            .await?;
+                        dsl::safekeeper_timeline_pending_ops.load(conn).await?;
                     Ok(from_db)
                 })
             })
diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs
index 98e3f74071..988159af4a 100644
--- a/storage_controller/src/safekeeper_client.rs
+++ b/storage_controller/src/safekeeper_client.rs
@@ -101,7 +101,7 @@ impl SafekeeperClient {
     pub(crate) async fn delete_tenant(
         &self,
         tenant_id: TenantId,
-    ) -> Result<models::TimelineDeleteResult> {
+    ) -> Result<models::TenantDeleteResult> {
         measured_request!(
             "delete_tenant",
             crate::metrics::Method::Delete,
diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs
index 8e752a8ff1..76e3162617 100644
--- a/storage_controller/src/service/safekeeper_reconciler.rs
+++ b/storage_controller/src/service/safekeeper_reconciler.rs
@@ -35,6 +35,10 @@ impl SafekeeperReconcilers {
         service: &Arc<Service>,
         reqs: Vec<ScheduleRequest>,
     ) {
+        tracing::info!(
+            "Scheduling {} pending safekeeper ops loaded from db",
+            reqs.len()
+        );
         for req in reqs {
             self.schedule_request(service, req);
         }
@@ -74,7 +78,7 @@ pub(crate) async fn load_schedule_requests(
     service: &Arc<Service>,
     safekeepers: &HashMap<NodeId, Safekeeper>,
 ) -> anyhow::Result<Vec<ScheduleRequest>> {
-    let pending_ops = service.persistence.list_pending_ops(None).await?;
+    let pending_ops = service.persistence.list_pending_ops().await?;
     let mut res = Vec::with_capacity(pending_ops.len());
     for op_persist in pending_ops {
         let node_id = NodeId(op_persist.sk_id as u64);
@@ -232,12 +236,14 @@ impl SafekeeperReconciler {
             let kind = req.kind;
             let tenant_id = req.tenant_id;
             let timeline_id = req.timeline_id;
+            let node_id = req.safekeeper.skp.id;
             self.reconcile_one(req, req_cancel)
                 .instrument(tracing::info_span!(
                     "reconcile_one",
                     ?kind,
                     %tenant_id,
-                    ?timeline_id
+                    ?timeline_id,
+                    %node_id,
                 ))
                 .await;
         }
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index b9344f2fb4..097c187699 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -4073,6 +4073,101 @@ def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvB
     assert reconciles_after_restart == 0
 
 
+@run_only_on_default_postgres("PG version is not interesting here")
+@pytest.mark.parametrize("restart_storcon", [True, False])
+def test_storcon_create_delete_sk_down(neon_env_builder: NeonEnvBuilder, restart_storcon: bool):
+    """
+    Test that the storcon can create and delete tenants and timelines with a safekeeper being down.
+      - restart_storcon: tests whether the pending ops are persisted.
+        if we don't restart, we test that we don't require it to come from the db.
+    """
+
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.storage_controller_config = {
+        "timelines_onto_safekeepers": True,
+    }
+    env = neon_env_builder.init_start()
+
+    env.safekeepers[0].stop()
+
+    # Wait for heartbeater to pick up that the safekeeper is gone
+    # This isn't really neccessary
+    def logged_offline():
+        env.storage_controller.assert_log_contains(
+            "Heartbeat round complete for 3 safekeepers, 1 offline"
+        )
+
+    wait_until(logged_offline)
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.create_tenant(tenant_id, timeline_id)
+
+    env.safekeepers[1].assert_log_contains(f"creating new timeline {tenant_id}/{timeline_id}")
+    env.safekeepers[2].assert_log_contains(f"creating new timeline {tenant_id}/{timeline_id}")
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            ".*Call to safekeeper.* management API still failed after.*",
+            ".*reconcile_one.*tenant_id={tenant_id}.*Call to safekeeper.* management API still failed after.*",
+        ]
+    )
+
+    if restart_storcon:
+        # Restart the storcon to check that we persist operations
+        env.storage_controller.stop()
+        env.storage_controller.start()
+
+    config_lines = [
+        "neon.safekeeper_proto_version = 3",
+    ]
+    with env.endpoints.create("main", tenant_id=tenant_id, config_lines=config_lines) as ep:
+        # endpoint should start.
+        ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
+        ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
+
+    env.storage_controller.assert_log_contains("writing pending op for sk id 1")
+    env.safekeepers[0].start()
+
+    # ensure that we applied the operation also for the safekeeper we just brought down
+    def logged_contains_on_sk():
+        env.safekeepers[0].assert_log_contains(
+            f"pulling timeline {tenant_id}/{timeline_id} from safekeeper"
+        )
+
+    wait_until(logged_contains_on_sk)
+
+    env.safekeepers[1].stop()
+
+    env.storage_controller.pageserver_api().tenant_delete(tenant_id)
+
+    # ensure the safekeeper deleted the timeline
+    def timeline_deleted_on_active_sks():
+        env.safekeepers[0].assert_log_contains(
+            f"deleting timeline {tenant_id}/{timeline_id} from disk"
+        )
+        env.safekeepers[2].assert_log_contains(
+            f"deleting timeline {tenant_id}/{timeline_id} from disk"
+        )
+
+    wait_until(timeline_deleted_on_active_sks)
+
+    if restart_storcon:
+        # Restart the storcon to check that we persist operations
+        env.storage_controller.stop()
+        env.storage_controller.start()
+
+    env.safekeepers[1].start()
+
+    # ensure that there is log msgs for the third safekeeper too
+    def timeline_deleted_on_sk():
+        env.safekeepers[1].assert_log_contains(
+            f"deleting timeline {tenant_id}/{timeline_id} from disk"
+        )
+
+    wait_until(timeline_deleted_on_sk)
+
+
 @pytest.mark.parametrize("wrong_az", [True, False])
 def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder, wrong_az: bool):
     """

From 497116b76d2716192b6cb2074ea6a4bde7c01412 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Thu, 3 Apr 2025 20:06:22 -0500
Subject: [PATCH 046/140] Download extension if it does not exist on the
 filesystem (#11315)

Previously we attempted to download all extensions in CREATE EXTENSION
statements. Extensions like pg_stat_statements and neon are not remote
extensions, but still we were requesting them when
skip_pg_catalog_updates was set to false.

Fixes: https://github.com/neondatabase/neon/issues/11127

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute/patches/pg_hint_plan_v16.patch | 29 ----------------------
 compute/patches/pg_hint_plan_v17.patch | 33 --------------------------
 vendor/postgres-v14                    |  2 +-
 vendor/postgres-v15                    |  2 +-
 vendor/postgres-v16                    |  2 +-
 vendor/postgres-v17                    |  2 +-
 vendor/revisions.json                  |  8 +++----
 7 files changed, 8 insertions(+), 70 deletions(-)

diff --git a/compute/patches/pg_hint_plan_v16.patch b/compute/patches/pg_hint_plan_v16.patch
index 1fc3ffa609..e9df2a3446 100644
--- a/compute/patches/pg_hint_plan_v16.patch
+++ b/compute/patches/pg_hint_plan_v16.patch
@@ -2,23 +2,6 @@ diff --git a/expected/ut-A.out b/expected/ut-A.out
 index da723b8..5328114 100644
 --- a/expected/ut-A.out
 +++ b/expected/ut-A.out
-@@ -9,13 +9,16 @@ SET search_path TO public;
- ----
- -- No.A-1-1-3
- CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
- -- No.A-1-2-3
- DROP EXTENSION pg_hint_plan;
- -- No.A-1-1-4
- CREATE SCHEMA other_schema;
- CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
- ERROR:  extension "pg_hint_plan" must be installed in schema "hint_plan"
- CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
- DROP SCHEMA other_schema;
- ----
- ---- No. A-5-1 comment pattern
 @@ -3175,6 +3178,7 @@ SELECT s.query, s.calls
    FROM public.pg_stat_statements s
    JOIN pg_catalog.pg_database d
@@ -27,18 +10,6 @@ index da723b8..5328114 100644
   ORDER BY 1;
                  query                 | calls 
  --------------------------------------+-------
-diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out
-index d372459..6282afe 100644
---- a/expected/ut-fdw.out
-+++ b/expected/ut-fdw.out
-@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
- SET client_min_messages TO LOG;
- SET pg_hint_plan.enable_hint TO on;
- CREATE EXTENSION file_fdw;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw
- CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
- CREATE USER MAPPING FOR PUBLIC SERVER file_server;
- CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
 diff --git a/sql/ut-A.sql b/sql/ut-A.sql
 index 7c7d58a..4fd1a07 100644
 --- a/sql/ut-A.sql
diff --git a/compute/patches/pg_hint_plan_v17.patch b/compute/patches/pg_hint_plan_v17.patch
index 3442a094eb..a244452cfe 100644
--- a/compute/patches/pg_hint_plan_v17.patch
+++ b/compute/patches/pg_hint_plan_v17.patch
@@ -1,24 +1,3 @@
-diff --git a/expected/ut-A.out b/expected/ut-A.out
-index e7d68a1..65a056c 100644
---- a/expected/ut-A.out
-+++ b/expected/ut-A.out
-@@ -9,13 +9,16 @@ SET search_path TO public;
- ----
- -- No.A-1-1-3
- CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
- -- No.A-1-2-3
- DROP EXTENSION pg_hint_plan;
- -- No.A-1-1-4
- CREATE SCHEMA other_schema;
- CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
- ERROR:  extension "pg_hint_plan" must be installed in schema "hint_plan"
- CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
- DROP SCHEMA other_schema;
- ----
- ---- No. A-5-1 comment pattern
 diff --git a/expected/ut-J.out b/expected/ut-J.out
 index 2fa3c70..314e929 100644
 --- a/expected/ut-J.out
@@ -160,15 +139,3 @@ index a09bd34..0ad227c 100644
  error hint:
  
                      explain_filter                    
-diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out
-index 017fa4b..98d989b 100644
---- a/expected/ut-fdw.out
-+++ b/expected/ut-fdw.out
-@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
- SET client_min_messages TO LOG;
- SET pg_hint_plan.enable_hint TO on;
- CREATE EXTENSION file_fdw;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw
- CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
- CREATE USER MAPPING FOR PUBLIC SERVER file_server;
- CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 35bc1b0cba..bce3e48d8a 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 35bc1b0cba55680e3b37abce4e67a46bb15f3315
+Subproject commit bce3e48d8a72e70e72dfee1b7421fecd0f1b00ac
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 6cea02e23c..4ac24a747c 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 6cea02e23caa950d5f06932491a91b6af8f54360
+Subproject commit 4ac24a747cd897119ce9b20547b3b04eba2cacbd
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 473f68210d..26c7d3f6de 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 473f68210d52ff8508f71c15b0c77c01296f4ace
+Subproject commit 26c7d3f6de6f361c8923bb80d7563853b4a04958
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 22533c63fc..7ec41bf6cd 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 22533c63fc42cdc1dbe138650ba1eca10a70c5d7
+Subproject commit 7ec41bf6cd92a4af751272145fdd590270c491da
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 7b2d5fda8e..0f581dc79e 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.4",
-    "22533c63fc42cdc1dbe138650ba1eca10a70c5d7"
+    "7ec41bf6cd92a4af751272145fdd590270c491da"
   ],
   "v16": [
     "16.8",
-    "473f68210d52ff8508f71c15b0c77c01296f4ace"
+    "26c7d3f6de6f361c8923bb80d7563853b4a04958"
   ],
   "v15": [
     "15.12",
-    "6cea02e23caa950d5f06932491a91b6af8f54360"
+    "4ac24a747cd897119ce9b20547b3b04eba2cacbd"
   ],
   "v14": [
     "14.17",
-    "35bc1b0cba55680e3b37abce4e67a46bb15f3315"
+    "bce3e48d8a72e70e72dfee1b7421fecd0f1b00ac"
   ]
 }

From 181af302b5d904fb9a552795af2b3fd8be6324a3 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Fri, 4 Apr 2025 10:30:48 +0400
Subject: [PATCH 047/140] storcon + safekeeper + scrubber: propagate root CA
 certs everywhere (#11418)

## Problem
There are some places in the code where we create `reqwest::Client`
without providing SSL CA certs from `ssl_ca_file`. These will break
after we enable TLS everywhere.
- Part of https://github.com/neondatabase/cloud/issues/22686

## Summary of changes
- Support `ssl_ca_file` in storage scrubber.
- Add `use_https_safekeeper_api` option to safekeeper to use https for
peer requests.
- Propagate SSL CA certs to storage_controller/client, storcon's
ComputeHook, PeerClient and maybe_forward.
---
 control_plane/storcon_cli/src/main.rs         |  8 ++---
 libs/safekeeper_api/src/models.rs             |  3 ++
 .../timeline/import_pgdata/upcall_api.rs      |  8 ++++-
 safekeeper/src/bin/safekeeper.rs              |  6 +++-
 safekeeper/src/http/routes.rs                 |  1 +
 safekeeper/src/lib.rs                         |  2 ++
 safekeeper/src/recovery.rs                    | 30 +++++++++++++++++--
 safekeeper/src/timeline.rs                    |  2 ++
 .../tests/walproposer_sim/safekeeper.rs       |  1 +
 storage_broker/benches/rps.rs                 |  1 +
 storage_broker/proto/broker.proto             |  4 ++-
 storage_broker/src/bin/storage_broker.rs      |  1 +
 storage_controller/client/src/control_api.rs  |  6 ++--
 storage_controller/src/compute_hook.rs        | 16 ++++++----
 storage_controller/src/http.rs                | 24 +++++++--------
 storage_controller/src/leadership.rs          | 13 ++++++++
 storage_controller/src/peer_client.rs         |  4 +--
 storage_controller/src/service.rs             |  2 +-
 storage_scrubber/src/lib.rs                   |  4 +--
 storage_scrubber/src/main.rs                  | 23 ++++++++++++--
 test_runner/fixtures/neon_fixtures.py         |  1 +
 21 files changed, 121 insertions(+), 39 deletions(-)

diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index c503697acc..b7e479d90c 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -385,8 +385,6 @@ where
 async fn main() -> anyhow::Result<()> {
     let cli = Cli::parse();
 
-    let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
-
     let ssl_ca_certs = match &cli.ssl_ca_file {
         Some(ssl_ca_file) => {
             let buf = tokio::fs::read(ssl_ca_file).await?;
@@ -401,9 +399,11 @@ async fn main() -> anyhow::Result<()> {
     }
     let http_client = http_client.build()?;
 
+    let storcon_client = Client::new(http_client.clone(), cli.api.clone(), cli.jwt.clone());
+
     let mut trimmed = cli.api.to_string();
     trimmed.pop();
-    let vps_client = mgmt_api::Client::new(http_client, trimmed, cli.jwt.as_deref());
+    let vps_client = mgmt_api::Client::new(http_client.clone(), trimmed, cli.jwt.as_deref());
 
     match cli.command {
         Command::NodeRegister {
@@ -1056,7 +1056,7 @@ async fn main() -> anyhow::Result<()> {
             const DEFAULT_MIGRATE_CONCURRENCY: usize = 8;
             let mut stream = futures::stream::iter(moves)
                 .map(|mv| {
-                    let client = Client::new(cli.api.clone(), cli.jwt.clone());
+                    let client = Client::new(http_client.clone(), cli.api.clone(), cli.jwt.clone());
                     async move {
                         client
                             .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index 20f11edae7..51f88625da 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -71,6 +71,7 @@ pub struct PeerInfo {
     pub ts: Instant,
     pub pg_connstr: String,
     pub http_connstr: String,
+    pub https_connstr: Option<String>,
 }
 
 pub type FullTransactionId = u64;
@@ -261,6 +262,8 @@ pub struct SkTimelineInfo {
     pub safekeeper_connstr: Option<String>,
     #[serde(default)]
     pub http_connstr: Option<String>,
+    #[serde(default)]
+    pub https_connstr: Option<String>,
     // Minimum of all active RO replicas flush LSN
     #[serde(default = "lsn_invalid")]
     pub standby_horizon: Lsn,
diff --git a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
index 7c7a4de2fc..352bbbc4d4 100644
--- a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
@@ -32,9 +32,15 @@ impl Client {
         let Some(ref base_url) = conf.import_pgdata_upcall_api else {
             anyhow::bail!("import_pgdata_upcall_api is not configured")
         };
+        let mut http_client = reqwest::Client::builder();
+        for cert in &conf.ssl_ca_certs {
+            http_client = http_client.add_root_certificate(cert.clone());
+        }
+        let http_client = http_client.build()?;
+
         Ok(Self {
             base_url: base_url.to_string(),
-            client: reqwest::Client::new(),
+            client: http_client,
             cancel,
             authorization_header: conf
                 .import_pgdata_upcall_api_token
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 6ce43815a6..18aa710916 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -219,7 +219,10 @@ struct Args {
     pub ssl_cert_reload_period: Duration,
     /// Trusted root CA certificates to use in https APIs.
     #[arg(long)]
-    ssl_ca_file: Option<Utf8PathBuf>,
+    pub ssl_ca_file: Option<Utf8PathBuf>,
+    /// Flag to use https for requests to peer's safekeeper API.
+    #[arg(long)]
+    pub use_https_safekeeper_api: bool,
 }
 
 // Like PathBufValueParser, but allows empty string.
@@ -399,6 +402,7 @@ async fn main() -> anyhow::Result<()> {
         ssl_cert_file: args.ssl_cert_file,
         ssl_cert_reload_period: args.ssl_cert_reload_period,
         ssl_ca_certs,
+        use_https_safekeeper_api: args.use_https_safekeeper_api,
     });
 
     // initialize sentry if SENTRY_DSN is provided
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 8395c88171..312456e5b2 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -536,6 +536,7 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
         peer_horizon_lsn: sk_info.peer_horizon_lsn.0,
         safekeeper_connstr: sk_info.safekeeper_connstr.unwrap_or_else(|| "".to_owned()),
         http_connstr: sk_info.http_connstr.unwrap_or_else(|| "".to_owned()),
+        https_connstr: sk_info.https_connstr,
         backup_lsn: sk_info.backup_lsn.0,
         local_start_lsn: sk_info.local_start_lsn.0,
         availability_zone: None,
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 54f36939f4..3ca51ba40a 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -121,6 +121,7 @@ pub struct SafeKeeperConf {
     pub ssl_cert_file: Utf8PathBuf,
     pub ssl_cert_reload_period: Duration,
     pub ssl_ca_certs: Vec<Certificate>,
+    pub use_https_safekeeper_api: bool,
 }
 
 impl SafeKeeperConf {
@@ -170,6 +171,7 @@ impl SafeKeeperConf {
             ssl_cert_file: Utf8PathBuf::from(defaults::DEFAULT_SSL_CERT_FILE),
             ssl_cert_reload_period: Duration::from_secs(60),
             ssl_ca_certs: Vec::new(),
+            use_https_safekeeper_api: false,
         }
     }
 }
diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs
index c2760792b8..25b40f5d2e 100644
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -176,6 +176,7 @@ pub struct Donor {
     pub flush_lsn: Lsn,
     pub pg_connstr: String,
     pub http_connstr: String,
+    pub https_connstr: Option<String>,
 }
 
 impl From<&PeerInfo> for Donor {
@@ -186,6 +187,7 @@ impl From<&PeerInfo> for Donor {
             flush_lsn: p.flush_lsn,
             pg_connstr: p.pg_connstr.clone(),
             http_connstr: p.http_connstr.clone(),
+            https_connstr: p.https_connstr.clone(),
         }
     }
 }
@@ -236,11 +238,33 @@ async fn recover(
     conf: &SafeKeeperConf,
 ) -> anyhow::Result<String> {
     // Learn donor term switch history to figure out starting point.
-    let client = reqwest::Client::new();
+
+    let mut client = reqwest::Client::builder();
+    for cert in &conf.ssl_ca_certs {
+        client = client.add_root_certificate(cert.clone());
+    }
+    let client = client
+        .build()
+        .context("Failed to build http client for recover")?;
+
+    let url = if conf.use_https_safekeeper_api {
+        if let Some(https_connstr) = donor.https_connstr.as_ref() {
+            format!("https://{https_connstr}")
+        } else {
+            anyhow::bail!(
+                "cannot recover from donor {}: \
+                https is enabled, but https_connstr is not specified",
+                donor.sk_id
+            );
+        }
+    } else {
+        format!("http://{}", donor.http_connstr)
+    };
+
     let timeline_info: TimelineStatus = client
         .get(format!(
-            "http://{}/v1/tenant/{}/timeline/{}",
-            donor.http_connstr, tli.ttid.tenant_id, tli.ttid.timeline_id
+            "{}/v1/tenant/{}/timeline/{}",
+            url, tli.ttid.tenant_id, tli.ttid.timeline_id
         ))
         .send()
         .await?
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index d9ca58104e..e6a7ade9f2 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -50,6 +50,7 @@ fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> Peer
         local_start_lsn: Lsn(sk_info.local_start_lsn),
         pg_connstr: sk_info.safekeeper_connstr.clone(),
         http_connstr: sk_info.http_connstr.clone(),
+        https_connstr: sk_info.https_connstr.clone(),
         ts,
     }
 }
@@ -363,6 +364,7 @@ impl SharedState {
                 .to_owned()
                 .unwrap_or(conf.listen_pg_addr.clone()),
             http_connstr: conf.listen_http_addr.to_owned(),
+            https_connstr: conf.listen_https_addr.to_owned(),
             backup_lsn: self.sk.state().inmem.backup_lsn.0,
             local_start_lsn: self.sk.state().local_start_lsn.0,
             availability_zone: conf.availability_zone.clone(),
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs
index 58913537aa..b3f088d31c 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -184,6 +184,7 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
         ssl_cert_file: Utf8PathBuf::from(""),
         ssl_cert_reload_period: Duration::ZERO,
         ssl_ca_certs: Vec::new(),
+        use_https_safekeeper_api: false,
     };
 
     let mut global = GlobalMap::new(disk, conf.clone())?;
diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs
index 86f2dd9a6c..0fef6a58e0 100644
--- a/storage_broker/benches/rps.rs
+++ b/storage_broker/benches/rps.rs
@@ -141,6 +141,7 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
                 peer_horizon_lsn: 5,
                 safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(),
                 http_connstr: "zenith-1-sk-1.local:7677".to_owned(),
+                https_connstr: Some("zenith-1-sk-1.local:7678".to_owned()),
                 local_start_lsn: 0,
                 availability_zone: None,
                 standby_horizon: 0,
diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto
index a420fd9c66..3891685589 100644
--- a/storage_broker/proto/broker.proto
+++ b/storage_broker/proto/broker.proto
@@ -45,8 +45,10 @@ message SafekeeperTimelineInfo {
     uint64 standby_horizon = 14;
     // A connection string to use for WAL receiving.
     string safekeeper_connstr = 10;
-    // HTTP endpoint connection string
+    // HTTP endpoint connection string.
     string http_connstr = 13;
+    // HTTPS endpoint connection string.
+    optional string https_connstr = 15;
     // Availability zone of a safekeeper.
     optional string availability_zone = 11;
 }
diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs
index cc33ec20ff..f1bd7ba708 100644
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -764,6 +764,7 @@ mod tests {
             peer_horizon_lsn: 5,
             safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(),
             http_connstr: "neon-1-sk-1.local:7677".to_owned(),
+            https_connstr: Some("neon-1-sk-1.local:7678".to_owned()),
             local_start_lsn: 0,
             availability_zone: None,
             standby_horizon: 0,
diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs
index 7888b18aa7..7afc835675 100644
--- a/storage_controller/client/src/control_api.rs
+++ b/storage_controller/client/src/control_api.rs
@@ -10,13 +10,11 @@ pub struct Client {
 }
 
 impl Client {
-    pub fn new(base_url: Url, jwt_token: Option<String>) -> Self {
+    pub fn new(http_client: reqwest::Client, base_url: Url, jwt_token: Option<String>) -> Self {
         Self {
             base_url,
             jwt_token,
-            client: reqwest::ClientBuilder::new()
-                .build()
-                .expect("Failed to construct http client"),
+            client: http_client,
         }
     }
 
diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs
index 0da35d6545..31ab443ccd 100644
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -4,6 +4,7 @@ use std::error::Error as _;
 use std::sync::Arc;
 use std::time::Duration;
 
+use anyhow::Context;
 use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
 use control_plane::local_env::LocalEnv;
 use futures::StreamExt;
@@ -364,25 +365,28 @@ pub(crate) struct ShardUpdate<'a> {
 }
 
 impl ComputeHook {
-    pub(super) fn new(config: Config) -> Self {
+    pub(super) fn new(config: Config) -> anyhow::Result<Self> {
         let authorization_header = config
             .control_plane_jwt_token
             .clone()
             .map(|jwt| format!("Bearer {}", jwt));
 
-        let client = reqwest::ClientBuilder::new()
-            .timeout(NOTIFY_REQUEST_TIMEOUT)
+        let mut client = reqwest::ClientBuilder::new().timeout(NOTIFY_REQUEST_TIMEOUT);
+        for cert in &config.ssl_ca_certs {
+            client = client.add_root_certificate(cert.clone());
+        }
+        let client = client
             .build()
-            .expect("Failed to construct HTTP client");
+            .context("Failed to build http client for compute hook")?;
 
-        Self {
+        Ok(Self {
             state: Default::default(),
             config,
             authorization_header,
             neon_local_lock: Default::default(),
             api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
             client,
-        }
+        })
     }
 
     /// For test environments: use neon_local's LocalEnv to update compute
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index f06e83d720..0caf6e3766 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1744,19 +1744,17 @@ async fn maybe_forward(req: Request<Body>) -> ForwardOutcome {
     // Use [`RECONCILE_TIMEOUT`] as the max amount of time a request should block for and
     // include some leeway to get the timeout for proxied requests.
     const PROXIED_REQUEST_TIMEOUT: Duration = Duration::from_secs(RECONCILE_TIMEOUT.as_secs() + 10);
-    let client = reqwest::ClientBuilder::new()
-        .timeout(PROXIED_REQUEST_TIMEOUT)
-        .build();
-    let client = match client {
-        Ok(client) => client,
-        Err(err) => {
-            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
-                "Failed to build leader client for forwarding while in stepped down state: {err}"
-            ))));
-        }
-    };
 
-    let request: reqwest::Request = match convert_request(req, &client, leader.address).await {
+    let client = state.service.get_http_client().clone();
+
+    let request: reqwest::Request = match convert_request(
+        req,
+        &client,
+        leader.address,
+        PROXIED_REQUEST_TIMEOUT,
+    )
+    .await
+    {
         Ok(r) => r,
         Err(err) => {
             return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
@@ -1814,6 +1812,7 @@ async fn convert_request(
     req: hyper::Request<Body>,
     client: &reqwest::Client,
     to_address: String,
+    timeout: Duration,
 ) -> Result<reqwest::Request, ApiError> {
     use std::str::FromStr;
 
@@ -1868,6 +1867,7 @@ async fn convert_request(
         .request(method, uri)
         .headers(headers)
         .body(body)
+        .timeout(timeout)
         .build()
         .map_err(|err| {
             ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs
index 5e1d6f3ec9..39c28d60a9 100644
--- a/storage_controller/src/leadership.rs
+++ b/storage_controller/src/leadership.rs
@@ -110,7 +110,20 @@ impl Leadership {
     ) -> Option<GlobalObservedState> {
         tracing::info!("Sending step down request to {leader:?}");
 
+        let mut http_client = reqwest::Client::builder();
+        for cert in &self.config.ssl_ca_certs {
+            http_client = http_client.add_root_certificate(cert.clone());
+        }
+        let http_client = match http_client.build() {
+            Ok(http_client) => http_client,
+            Err(err) => {
+                tracing::error!("Failed to build client for leader step-down request: {err}");
+                return None;
+            }
+        };
+
         let client = PeerClient::new(
+            http_client,
             Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
             self.config.peer_jwt_token.clone(),
         );
diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs
index f3f275dee0..604d1024ba 100644
--- a/storage_controller/src/peer_client.rs
+++ b/storage_controller/src/peer_client.rs
@@ -59,11 +59,11 @@ impl ResponseErrorMessageExt for reqwest::Response {
 pub(crate) struct GlobalObservedState(pub(crate) HashMap<TenantShardId, ObservedState>);
 
 impl PeerClient {
-    pub(crate) fn new(uri: Uri, jwt: Option<String>) -> Self {
+    pub(crate) fn new(http_client: reqwest::Client, uri: Uri, jwt: Option<String>) -> Self {
         Self {
             uri,
             jwt,
-            client: reqwest::Client::new(),
+            client: http_client,
         }
     }
 
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 9f308d9a0b..50f642deaf 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1711,7 +1711,7 @@ impl Service {
             ))),
             config: config.clone(),
             persistence,
-            compute_hook: Arc::new(ComputeHook::new(config.clone())),
+            compute_hook: Arc::new(ComputeHook::new(config.clone())?),
             result_tx,
             heartbeater_ps,
             heartbeater_sk,
diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs
index 34e43fcc0b..071f0b9756 100644
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -295,8 +295,8 @@ pub struct ControllerClientConfig {
 }
 
 impl ControllerClientConfig {
-    pub fn build_client(self) -> control_api::Client {
-        control_api::Client::new(self.controller_api, Some(self.controller_jwt))
+    pub fn build_client(self, http_client: reqwest::Client) -> control_api::Client {
+        control_api::Client::new(http_client, self.controller_api, Some(self.controller_jwt))
     }
 }
 
diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index fb2ab02565..4823c43e10 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -3,7 +3,7 @@ use camino::Utf8PathBuf;
 use clap::{Parser, Subcommand};
 use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
 use pageserver_api::shard::TenantShardId;
-use reqwest::{Method, Url};
+use reqwest::{Certificate, Method, Url};
 use storage_controller_client::control_api;
 use storage_scrubber::garbage::{PurgeMode, find_garbage, purge_garbage};
 use storage_scrubber::pageserver_physical_gc::{GcMode, pageserver_physical_gc};
@@ -41,6 +41,10 @@ struct Cli {
     /// If set to true, the scrubber will exit with error code on fatal error.
     #[arg(long, default_value_t = false)]
     exit_code: bool,
+
+    /// Trusted root CA certificates to use in https APIs.
+    #[arg(long)]
+    ssl_ca_file: Option<Utf8PathBuf>,
 }
 
 #[derive(Subcommand, Debug)]
@@ -146,13 +150,28 @@ async fn main() -> anyhow::Result<()> {
 
     tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG);
 
+    let ssl_ca_certs = match cli.ssl_ca_file.as_ref() {
+        Some(ssl_ca_file) => {
+            tracing::info!("Using ssl root CA file: {ssl_ca_file:?}");
+            let buf = tokio::fs::read(ssl_ca_file).await?;
+            Certificate::from_pem_bundle(&buf)?
+        }
+        None => Vec::new(),
+    };
+
+    let mut http_client = reqwest::Client::builder();
+    for cert in ssl_ca_certs {
+        http_client = http_client.add_root_certificate(cert);
+    }
+    let http_client = http_client.build()?;
+
     let controller_client = cli.controller_api.map(|controller_api| {
         ControllerClientConfig {
             controller_api,
             // Default to no key: this is a convenience when working in a development environment
             controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
         }
-        .build_client()
+        .build_client(http_client)
     });
 
     match cli.command {
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 11ff2921b9..86b6043552 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1318,6 +1318,7 @@ class NeonEnv:
                 "http_port": port.http,
                 "https_port": port.https,
                 "sync": config.safekeepers_enable_fsync,
+                "use_https_safekeeper_api": config.use_https_safekeeper_api,
             }
             if config.auth_enabled:
                 sk_cfg["auth_enabled"] = True

From edc874e1b303e25aa87bfc44e4b982499152afb4 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Fri, 4 Apr 2025 12:13:00 +0200
Subject: [PATCH 048/140] Use the same test image version as the computer one
 (#11448)

## Problem
Changes in compute can cause errors in tests if another version of
`neon-test-extensions` image is used.
## Summary of changes
Use the same version of `neon-test-extensions` image as `compute` one
for docker-compose based extension tests.
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 0ef9baf2f7..46c8cd6fc9 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -980,7 +980,7 @@ jobs:
           TEST_EXTENSIONS_TAG: >-
             ${{
               contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
-              && 'latest'
+              && needs.meta.outputs.previous-compute-release
               || needs.meta.outputs.build-tag
             }}
           TEST_VERSION_ONLY: ${{ matrix.pg_version }}

From 65e2aae6e469d4c77e72fdf1a8b97135ac6e45f1 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 4 Apr 2025 11:52:59 +0100
Subject: [PATCH 049/140] pageserver/secondary: deregister IO metrics (#11283)

## Problem

IO metrics for secondary locations do not get deregistered when the
timeline is removed.

## Summary of changes

Stash the request context to be used for downloads in
`SecondaryTimelineDetail`. These objects match the lifetime of the
secondary timeline location pretty well.

When the timeline is removed, deregister the metrics too.

Closes https://github.com/neondatabase/neon/issues/11156
---
 pageserver/src/metrics.rs                     |   6 +-
 pageserver/src/tenant/secondary.rs            |   7 ++
 pageserver/src/tenant/secondary/downloader.rs | 116 +++++++++++++++---
 .../regress/test_pageserver_secondary.py      |  67 ++++++++++
 4 files changed, 177 insertions(+), 19 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 9820d50e7b..d8497288ca 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1248,13 +1248,13 @@ pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(Storag
 
 #[derive(Clone, Copy)]
 #[repr(usize)]
-enum StorageIoSizeOperation {
+pub(crate) enum StorageIoSizeOperation {
     Read,
     Write,
 }
 
 impl StorageIoSizeOperation {
-    const VARIANTS: &'static [&'static str] = &["read", "write"];
+    pub(crate) const VARIANTS: &'static [&'static str] = &["read", "write"];
 
     fn as_str(&self) -> &'static str {
         Self::VARIANTS[*self as usize]
@@ -1262,7 +1262,7 @@ impl StorageIoSizeOperation {
 }
 
 // Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
-static STORAGE_IO_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+pub(crate) static STORAGE_IO_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_io_operations_bytes_total",
         "Total amount of bytes read/written in IO operations",
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index a378961620..2fa0ed9be9 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -167,10 +167,17 @@ impl SecondaryTenant {
 
         self.validate_metrics();
 
+        // Metrics are subtracted from and/or removed eagerly.
+        // Deletions are done in the background via [`BackgroundPurges::spawn`].
         let tenant_id = self.tenant_shard_id.tenant_id.to_string();
         let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
         let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
         let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+
+        self.detail
+            .lock()
+            .unwrap()
+            .drain_timelines(&self.tenant_shard_id, &self.resident_size_metric);
     }
 
     pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 1cf0241631..60cf7ac79e 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -4,6 +4,7 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::{Duration, Instant, SystemTime};
 
+use crate::metrics::{STORAGE_IO_SIZE, StorageIoSizeOperation};
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
 use futures::Future;
@@ -124,15 +125,53 @@ impl OnDiskState {
     }
 }
 
-#[derive(Debug, Clone, Default)]
 pub(super) struct SecondaryDetailTimeline {
     on_disk_layers: HashMap<LayerName, OnDiskState>,
 
     /// We remember when layers were evicted, to prevent re-downloading them.
     pub(super) evicted_at: HashMap<LayerName, SystemTime>,
+
+    ctx: RequestContext,
+}
+
+impl Clone for SecondaryDetailTimeline {
+    fn clone(&self) -> Self {
+        Self {
+            on_disk_layers: self.on_disk_layers.clone(),
+            evicted_at: self.evicted_at.clone(),
+            // This is a bit awkward. The downloader code operates on a snapshot
+            // of the secondary list to avoid locking it for extended periods of time.
+            // No particularly strong reason to chose [`RequestContext::detached_child`],
+            // but makes more sense than [`RequestContext::attached_child`].
+            ctx: self
+                .ctx
+                .detached_child(self.ctx.task_kind(), self.ctx.download_behavior()),
+        }
+    }
+}
+
+impl std::fmt::Debug for SecondaryDetailTimeline {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("SecondaryDetailTimeline")
+            .field("on_disk_layers", &self.on_disk_layers)
+            .field("evicted_at", &self.evicted_at)
+            .finish()
+    }
 }
 
 impl SecondaryDetailTimeline {
+    pub(super) fn empty(ctx: RequestContext) -> Self {
+        SecondaryDetailTimeline {
+            on_disk_layers: Default::default(),
+            evicted_at: Default::default(),
+            ctx,
+        }
+    }
+
+    pub(super) fn context(&self) -> &RequestContext {
+        &self.ctx
+    }
+
     pub(super) fn remove_layer(
         &mut self,
         name: &LayerName,
@@ -258,18 +297,50 @@ impl SecondaryDetail {
 
     pub(super) fn remove_timeline(
         &mut self,
+        tenant_shard_id: &TenantShardId,
         timeline_id: &TimelineId,
         resident_metric: &UIntGauge,
     ) {
         let removed = self.timelines.remove(timeline_id);
         if let Some(removed) = removed {
-            resident_metric.sub(
-                removed
-                    .on_disk_layers
-                    .values()
-                    .map(|l| l.metadata.file_size)
-                    .sum(),
-            );
+            Self::clear_timeline_metrics(tenant_shard_id, timeline_id, removed, resident_metric);
+        }
+    }
+
+    pub(super) fn drain_timelines(
+        &mut self,
+        tenant_shard_id: &TenantShardId,
+        resident_metric: &UIntGauge,
+    ) {
+        for (timeline_id, removed) in self.timelines.drain() {
+            Self::clear_timeline_metrics(tenant_shard_id, &timeline_id, removed, resident_metric);
+        }
+    }
+
+    fn clear_timeline_metrics(
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+        detail: SecondaryDetailTimeline,
+        resident_metric: &UIntGauge,
+    ) {
+        resident_metric.sub(
+            detail
+                .on_disk_layers
+                .values()
+                .map(|l| l.metadata.file_size)
+                .sum(),
+        );
+
+        let shard_id = format!("{}", tenant_shard_id.shard_slug());
+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let timeline_id = timeline_id.to_string();
+        for op in StorageIoSizeOperation::VARIANTS {
+            let _ = STORAGE_IO_SIZE.remove_label_values(&[
+                op,
+                tenant_id.as_str(),
+                shard_id.as_str(),
+                timeline_id.as_str(),
+            ]);
         }
     }
 
@@ -727,6 +798,7 @@ impl<'a> TenantDownloader<'a> {
                         last_heatmap,
                         timeline,
                         &self.secondary_state.resident_size_metric,
+                        ctx,
                     )
                     .await;
 
@@ -774,7 +846,6 @@ impl<'a> TenantDownloader<'a> {
 
         // Download the layers in the heatmap
         for timeline in heatmap.timelines {
-            let ctx = &ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline.timeline_id);
             let timeline_state = timeline_states
                 .remove(&timeline.timeline_id)
                 .expect("Just populated above");
@@ -917,7 +988,11 @@ impl<'a> TenantDownloader<'a> {
             for delete_timeline in &delete_timelines {
                 // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
                 // from disk fails that will be a fatal error.
-                detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric);
+                detail.remove_timeline(
+                    self.secondary_state.get_tenant_shard_id(),
+                    delete_timeline,
+                    &self.secondary_state.resident_size_metric,
+                );
             }
         }
 
@@ -1013,7 +1088,6 @@ impl<'a> TenantDownloader<'a> {
         timeline: HeatMapTimeline,
         timeline_state: SecondaryDetailTimeline,
         deadline: Instant,
-        ctx: &RequestContext,
     ) -> (Result<(), UpdateError>, Vec<HeatMapLayer>) {
         // Accumulate updates to the state
         let mut touched = Vec::new();
@@ -1044,7 +1118,12 @@ impl<'a> TenantDownloader<'a> {
             }
 
             match self
-                .download_layer(tenant_shard_id, &timeline_id, layer, ctx)
+                .download_layer(
+                    tenant_shard_id,
+                    &timeline_id,
+                    layer,
+                    timeline_state.context(),
+                )
                 .await
             {
                 Ok(Some(layer)) => touched.push(layer),
@@ -1155,13 +1234,16 @@ impl<'a> TenantDownloader<'a> {
         tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count());
 
         let (result, touched) = self
-            .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx)
+            .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline)
             .await;
 
         // Write updates to state to record layers we just downloaded or touched, irrespective of whether the overall result was successful
         {
             let mut detail = self.secondary_state.detail.lock().unwrap();
-            let timeline_detail = detail.timelines.entry(timeline_id).or_default();
+            let timeline_detail = detail.timelines.entry(timeline_id).or_insert_with(|| {
+                let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline_id);
+                SecondaryDetailTimeline::empty(ctx)
+            });
 
             tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
             touched.into_iter().for_each(|t| {
@@ -1295,10 +1377,12 @@ async fn init_timeline_state(
     last_heatmap: Option<&HeatMapTimeline>,
     heatmap: &HeatMapTimeline,
     resident_metric: &UIntGauge,
+    ctx: &RequestContext,
 ) -> SecondaryDetailTimeline {
-    let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
-    let mut detail = SecondaryDetailTimeline::default();
+    let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &heatmap.timeline_id);
+    let mut detail = SecondaryDetailTimeline::empty(ctx);
 
+    let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
     let mut dir = match tokio::fs::read_dir(&timeline_path).await {
         Ok(d) => d,
         Err(e) => {
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 3749df2229..c73a592d98 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -1099,3 +1099,70 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
     # Warm up the current secondary.
     ps_attached.http_client().tenant_secondary_download(tenant_id, wait_ms=100)
     wait_until(lambda: all_layers_downloaded(ps_secondary, expected_locally))
+
+
+@run_only_on_default_postgres("PG version is not interesting here")
+@pytest.mark.parametrize("action", ["delete_timeline", "detach"])
+def test_io_metrics_match_secondary_timeline_lifecycle(
+    neon_env_builder: NeonEnvBuilder, action: str
+):
+    """
+    Check that IO metrics for secondary timelines are de-registered when the timeline
+    is removed
+    """
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    parent_timeline_id = TimelineId.generate()
+
+    # We do heatmap uploads and pulls manually
+    tenant_conf = {"heatmap_period": "0s"}
+    env.create_tenant(
+        tenant_id, parent_timeline_id, conf=tenant_conf, placement_policy='{"Attached":1}'
+    )
+
+    child_timeline_id = env.create_branch("foo", tenant_id)
+
+    attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
+    ps_attached = env.get_pageserver(attached_to_id)
+    ps_secondary = next(p for p in env.pageservers if p != ps_attached)
+
+    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+    status, _ = ps_secondary.http_client().tenant_secondary_download(tenant_id, wait_ms=5000)
+    assert status == 200
+
+    labels = {
+        "operation": "write",
+        "tenant_id": str(tenant_id),
+        "timeline_id": str(child_timeline_id),
+    }
+    bytes_written = (
+        ps_secondary.http_client()
+        .get_metrics()
+        .query_one("pageserver_io_operations_bytes_total", labels)
+        .value
+    )
+
+    assert bytes_written == 0
+
+    if action == "delete_timeline":
+        env.storage_controller.pageserver_api().timeline_delete(tenant_id, child_timeline_id)
+        ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+        status, _ = ps_secondary.http_client().tenant_secondary_download(tenant_id, wait_ms=5000)
+        assert status == 200
+    elif action == "detach":
+        env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}})
+        env.storage_controller.reconcile_until_idle()
+    else:
+        raise Exception("Unexpected action")
+
+    assert (
+        len(
+            ps_secondary.http_client()
+            .get_metrics()
+            .query_all("pageserver_io_operations_bytes_total", labels)
+        )
+        == 0
+    )

From 1ef4258f290b0401113fcabb800a4ef729cb6649 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 4 Apr 2025 14:41:28 +0100
Subject: [PATCH 050/140] pageserver: add tenant level performance tracing
 sampling ratio (#11433)

## Problem

https://github.com/neondatabase/neon/pull/11140 introduces performance
tracing with OTEL
and a pageserver config which configures the sampling ratio of get page
requests.

Enabling a non-zero sampling ratio on a per region basis is too
aggressive and comes with perf
impact that isn't very well understood yet.

## Summary of changes

Add a `sampling_ratio` tenant level config which overrides the
pageserver level config.
Note that we do not cache the config and load it on every get page
request such that changes propagate
timely.

Note that I've had to remove the `SHARD_SELECTION` span to get this to
work. The tracing library doesn't
expose a neat way to drop a span if one realises it's not needed at
runtime.

Closes https://github.com/neondatabase/neon/issues/11392
---
 control_plane/src/pageserver.rs               |  5 ++
 libs/pageserver_api/src/config.rs             |  6 +-
 libs/pageserver_api/src/models.rs             | 10 +++
 libs/tracing-utils/src/perf_span.rs           | 11 +--
 pageserver/src/context.rs                     | 13 ----
 pageserver/src/page_service.rs                | 73 +++++--------------
 pageserver/src/tenant/timeline.rs             | 25 +++++++
 .../regress/test_attach_tenant_config.py      |  4 +
 8 files changed, 70 insertions(+), 77 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index b39acbca4d..591eb3728b 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -545,6 +545,11 @@ impl PageServerNode {
                 .map(|x| x.parse::<u64>())
                 .transpose()
                 .context("Failed to parse 'gc_compaction_ratio_percent' as integer")?,
+            sampling_ratio: settings
+                .remove("sampling_ratio")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("Falied to parse 'sampling_ratio'")?,
         };
         if !settings.is_empty() {
             bail!("Unrecognized tenant settings: {settings:?}")
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 66a02b87b0..d0225c8918 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -192,7 +192,7 @@ pub enum GetVectoredConcurrentIo {
     SidecarTask,
 }
 
-#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct Ratio {
     pub numerator: usize,
     pub denominator: usize,
@@ -416,6 +416,9 @@ pub struct TenantConfigToml {
     /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN)
     /// is above this ratio, gc-compaction will be triggered.
     pub gc_compaction_ratio_percent: u64,
+    /// Tenant level performance sampling ratio override. Controls the ratio of get page requests
+    /// that will get perf sampling for the tenant.
+    pub sampling_ratio: Option<Ratio>,
 }
 
 pub mod defaults {
@@ -702,6 +705,7 @@ impl Default for TenantConfigToml {
             gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
             gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
             gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
+            sampling_ratio: None,
         }
     }
 }
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index f2dd3a0ebf..16d9433973 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -23,6 +23,7 @@ use utils::lsn::Lsn;
 use utils::postgres_client::PostgresClientProtocol;
 use utils::{completion, serde_system_time};
 
+use crate::config::Ratio;
 use crate::key::{CompactKey, Key};
 use crate::reltag::RelTag;
 use crate::shard::{ShardCount, ShardStripeSize, TenantShardId};
@@ -568,6 +569,8 @@ pub struct TenantConfigPatch {
     pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub gc_compaction_ratio_percent: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub sampling_ratio: FieldPatch<Option<Ratio>>,
 }
 
 /// Like [`crate::config::TenantConfigToml`], but preserves the information
@@ -688,6 +691,9 @@ pub struct TenantConfig {
 
     #[serde(skip_serializing_if = "Option::is_none")]
     pub gc_compaction_ratio_percent: Option<u64>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub sampling_ratio: Option<Option<Ratio>>,
 }
 
 impl TenantConfig {
@@ -730,6 +736,7 @@ impl TenantConfig {
             mut gc_compaction_enabled,
             mut gc_compaction_initial_threshold_kb,
             mut gc_compaction_ratio_percent,
+            mut sampling_ratio,
         } = self;
 
         patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -824,6 +831,7 @@ impl TenantConfig {
         patch
             .gc_compaction_ratio_percent
             .apply(&mut gc_compaction_ratio_percent);
+        patch.sampling_ratio.apply(&mut sampling_ratio);
 
         Ok(Self {
             checkpoint_distance,
@@ -860,6 +868,7 @@ impl TenantConfig {
             gc_compaction_enabled,
             gc_compaction_initial_threshold_kb,
             gc_compaction_ratio_percent,
+            sampling_ratio,
         })
     }
 
@@ -961,6 +970,7 @@ impl TenantConfig {
             gc_compaction_ratio_percent: self
                 .gc_compaction_ratio_percent
                 .unwrap_or(global_conf.gc_compaction_ratio_percent),
+            sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio),
         }
     }
 }
diff --git a/libs/tracing-utils/src/perf_span.rs b/libs/tracing-utils/src/perf_span.rs
index f2ca76a816..16f713c67e 100644
--- a/libs/tracing-utils/src/perf_span.rs
+++ b/libs/tracing-utils/src/perf_span.rs
@@ -28,7 +28,7 @@ use core::{
     task::{Context, Poll},
 };
 use pin_project_lite::pin_project;
-use tracing::{Dispatch, field, span::Span};
+use tracing::{Dispatch, span::Span};
 
 #[derive(Debug, Clone)]
 pub struct PerfSpan {
@@ -49,15 +49,6 @@ impl PerfSpan {
         }
     }
 
-    pub fn record<Q: field::AsField + ?Sized, V: field::Value>(
-        &self,
-        field: &Q,
-        value: V,
-    ) -> &Self {
-        self.inner.record(field, value);
-        self
-    }
-
     pub fn enter(&self) -> PerfSpanEntered {
         if let Some(ref id) = self.inner.id() {
             self.dispatch.enter(id);
diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs
index 279d2daf75..481fdb4ea2 100644
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -572,19 +572,6 @@ impl RequestContext {
         }
     }
 
-    pub(crate) fn perf_span_record<
-        Q: tracing::field::AsField + ?Sized,
-        V: tracing::field::Value,
-    >(
-        &self,
-        field: &Q,
-        value: V,
-    ) {
-        if let Some(span) = &self.perf_span {
-            span.record(field, value);
-        }
-    }
-
     pub(crate) fn has_perf_span(&self) -> bool {
         self.perf_span.is_some()
     }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 3ebd6d8506..f9bf45bb71 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -18,7 +18,7 @@ use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
     PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
-    PageServiceProtocolPipelinedExecutionStrategy, Tracing,
+    PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::models::{
@@ -37,7 +37,6 @@ use postgres_ffi::BLCKSZ;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use pq_proto::framed::ConnectionError;
 use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor};
-use rand::Rng;
 use strum_macros::IntoStaticStr;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter};
 use tokio::task::JoinHandle;
@@ -755,7 +754,6 @@ impl PageServerHandler {
         tenant_id: TenantId,
         timeline_id: TimelineId,
         timeline_handles: &mut TimelineHandles,
-        tracing_config: Option<&Tracing>,
         cancel: &CancellationToken,
         ctx: &RequestContext,
         protocol_version: PagestreamProtocolVersion,
@@ -916,47 +914,8 @@ impl PageServerHandler {
 
                 let key = rel_block_to_key(req.rel, req.blkno);
 
-                let sampled = match tracing_config {
-                    Some(conf) => {
-                        let ratio = &conf.sampling_ratio;
-
-                        if ratio.numerator == 0 {
-                            false
-                        } else {
-                            rand::thread_rng().gen_range(0..ratio.denominator) < ratio.numerator
-                        }
-                    }
-                    None => false,
-                };
-
-                let ctx = if sampled {
-                    RequestContextBuilder::from(ctx)
-                        .root_perf_span(|| {
-                            info_span!(
-                            target: PERF_TRACE_TARGET,
-                            "GET_PAGE",
-                            tenant_id = %tenant_id,
-                            shard_id = field::Empty,
-                            timeline_id = %timeline_id,
-                            lsn = %req.hdr.request_lsn,
-                            request_id = %req.hdr.reqid,
-                            key = %key,
-                            )
-                        })
-                        .attached_child()
-                } else {
-                    ctx.attached_child()
-                };
-
                 let res = timeline_handles
                     .get(tenant_id, timeline_id, ShardSelector::Page(key))
-                    .maybe_perf_instrument(&ctx, |current_perf_span| {
-                        info_span!(
-                            target: PERF_TRACE_TARGET,
-                            parent: current_perf_span,
-                            "SHARD_SELECTION",
-                        )
-                    })
                     .await;
 
                 let shard = match res {
@@ -987,6 +946,25 @@ impl PageServerHandler {
                     }
                 };
 
+                let ctx = if shard.is_get_page_request_sampled() {
+                    RequestContextBuilder::from(ctx)
+                        .root_perf_span(|| {
+                            info_span!(
+                            target: PERF_TRACE_TARGET,
+                            "GET_PAGE",
+                            tenant_id = %tenant_id,
+                            shard_id = %shard.get_shard_identity().shard_slug(),
+                            timeline_id = %timeline_id,
+                            lsn = %req.hdr.request_lsn,
+                            request_id = %req.hdr.reqid,
+                            key = %key,
+                            )
+                        })
+                        .attached_child()
+                } else {
+                    ctx.attached_child()
+                };
+
                 // This ctx travels as part of the BatchedFeMessage through
                 // batching into the request handler.
                 // The request handler needs to do some per-request work
@@ -1001,12 +979,6 @@ impl PageServerHandler {
                 // request handler log messages contain the request-specific fields.
                 let span = mkspan!(shard.tenant_shard_id.shard_slug());
 
-                // Enrich the perf span with shard_id now that shard routing is done.
-                ctx.perf_span_record(
-                    "shard_id",
-                    tracing::field::display(shard.get_shard_identity().shard_slug()),
-                );
-
                 let timer = record_op_start_and_throttle(
                     &shard,
                     metrics::SmgrQueryType::GetPageAtLsn,
@@ -1602,7 +1574,6 @@ impl PageServerHandler {
         IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
     {
         let cancel = self.cancel.clone();
-        let tracing_config = self.conf.tracing.clone();
 
         let err = loop {
             let msg = Self::pagestream_read_message(
@@ -1610,7 +1581,6 @@ impl PageServerHandler {
                 tenant_id,
                 timeline_id,
                 &mut timeline_handles,
-                tracing_config.as_ref(),
                 &cancel,
                 ctx,
                 protocol_version,
@@ -1744,8 +1714,6 @@ impl PageServerHandler {
         // Batcher
         //
 
-        let tracing_config = self.conf.tracing.clone();
-
         let cancel_batcher = self.cancel.child_token();
         let (mut batch_tx, mut batch_rx) = spsc_fold::channel();
         let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| {
@@ -1759,7 +1727,6 @@ impl PageServerHandler {
                         tenant_id,
                         timeline_id,
                         &mut timeline_handles,
-                        tracing_config.as_ref(),
                         &cancel_batcher,
                         &ctx,
                         protocol_version,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 74e97653d2..6ca3704bc1 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2476,6 +2476,31 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
     }
 
+    /// Checks if a get page request should get perf tracing
+    ///
+    /// The configuration priority is: tenant config override, default tenant config,
+    /// pageserver config.
+    pub(crate) fn is_get_page_request_sampled(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load();
+        let ratio = tenant_conf
+            .tenant_conf
+            .sampling_ratio
+            .flatten()
+            .or(self.conf.default_tenant_conf.sampling_ratio)
+            .or(self.conf.tracing.as_ref().map(|t| t.sampling_ratio));
+
+        match ratio {
+            Some(r) => {
+                if r.numerator == 0 {
+                    false
+                } else {
+                    rand::thread_rng().gen_range(0..r.denominator) < r.numerator
+                }
+            }
+            None => false,
+        }
+    }
+
     fn get_checkpoint_distance(&self) -> u64 {
         let tenant_conf = self.tenant_conf.load();
         tenant_conf
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 22dfcbda92..5021cc4b17 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -190,6 +190,10 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "gc_compaction_initial_threshold_kb": 1024000,
         "gc_compaction_ratio_percent": 200,
         "image_creation_preempt_threshold": 5,
+        "sampling_ratio": {
+            "numerator": 0,
+            "denominator": 10,
+        },
     }
 
     vps_http = env.storage_controller.pageserver_api()

From 8e1b5a9727e27e8d2188030750d56bae4bf5931d Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 4 Apr 2025 15:09:15 +0100
Subject: [PATCH 051/140] Fix Postgres build on macOS (#11442)

## Problem
Postgres build fails with the following error on macOS:

```
/Users/bayandin/work/neon//vendor/postgres-v14/src/port/snprintf.c:424:27: error: 'strchrnul' is only available on macOS 15.4 or newer [-Werror,-Wunguarded-availability-new]
  424 |                         const char *next_pct = strchrnul(format + 1, '%');
      |                                                ^~~~~~~~~
/Users/bayandin/work/neon//vendor/postgres-v14/src/port/snprintf.c:376:14: note: 'strchrnul' has been marked as being introduced in macOS 15.4 here, but the deployment target is macOS 15.0.0
  376 | extern char *strchrnul(const char *s, int c);
      |              ^
/Users/bayandin/work/neon//vendor/postgres-v14/src/port/snprintf.c:424:27: note: enclose 'strchrnul' in a __builtin_available check to silence this warning
  424 |                         const char *next_pct = strchrnul(format + 1, '%');
      |                                                ^~~~~~~~~
  425 |
  426 |                         /* Dump literal data we just scanned over */
  427 |                         dostr(format, next_pct - format, target);
  428 |                         if (target->failed)
  429 |                                 break;
  430 |
  431 |                         if (*next_pct == '\0')
  432 |                                 break;
  433 |                         format = next_pct;
      |
1 error generated.
```

## Summary of changes
- Update Postgres fork to include changes from
https://github.com/postgres/postgres/commit/6da2ba1d8a031984eb016fed6741bb2ac945f19d

Corresponding Postgres PRs:
- https://github.com/neondatabase/postgres/pull/608
- https://github.com/neondatabase/postgres/pull/609
- https://github.com/neondatabase/postgres/pull/610
- https://github.com/neondatabase/postgres/pull/611
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/postgres-v17   | 2 +-
 vendor/revisions.json | 8 ++++----
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index bce3e48d8a..8cca70c22e 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit bce3e48d8a72e70e72dfee1b7421fecd0f1b00ac
+Subproject commit 8cca70c22e2894dd4645f9a940086ac437b0a11b
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 4ac24a747c..23708b3aca 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 4ac24a747cd897119ce9b20547b3b04eba2cacbd
+Subproject commit 23708b3aca9adf163aa0973eb63d9afc0e4a04c3
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 26c7d3f6de..746bd9ffe5 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 26c7d3f6de6f361c8923bb80d7563853b4a04958
+Subproject commit 746bd9ffe5c29bce030eaea1031054057f3c5d45
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 7ec41bf6cd..c9e4ff5a38 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 7ec41bf6cd92a4af751272145fdd590270c491da
+Subproject commit c9e4ff5a38907acd71107634055bf2609aba43a5
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 0f581dc79e..2abfbffccb 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.4",
-    "7ec41bf6cd92a4af751272145fdd590270c491da"
+    "c9e4ff5a38907acd71107634055bf2609aba43a5"
   ],
   "v16": [
     "16.8",
-    "26c7d3f6de6f361c8923bb80d7563853b4a04958"
+    "746bd9ffe5c29bce030eaea1031054057f3c5d45"
   ],
   "v15": [
     "15.12",
-    "4ac24a747cd897119ce9b20547b3b04eba2cacbd"
+    "23708b3aca9adf163aa0973eb63d9afc0e4a04c3"
   ],
   "v14": [
     "14.17",
-    "bce3e48d8a72e70e72dfee1b7421fecd0f1b00ac"
+    "8cca70c22e2894dd4645f9a940086ac437b0a11b"
   ]
 }

From 295be03a33dba9778244234792e4d176a6906b86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= <jc@neon.tech>
Date: Fri, 4 Apr 2025 16:56:41 +0200
Subject: [PATCH 052/140] impr(ci): send clearer notifications to slack when
 retrying container image pushes (#11447)

## Problem
We've started sending slack notifications for failed container image
pushes that are being retried. There are more messages coming in than
expected, so clicking through the link to see what image failed is
happening more often than we hoped.

## Summary of changes
- Make slack notifications clearer, including whether the job succeeded
and what retries have happened.
- Log failures/retries in step more clearly, so that you can easily see
when something fails.
---
 .github/scripts/push_with_image_map.py            | 12 ++++++++++--
 .github/workflows/_push-to-container-registry.yml | 13 ++++++++++---
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/.github/scripts/push_with_image_map.py b/.github/scripts/push_with_image_map.py
index 53f83379ae..85e2eb1937 100644
--- a/.github/scripts/push_with_image_map.py
+++ b/.github/scripts/push_with_image_map.py
@@ -2,6 +2,9 @@ import json
 import os
 import subprocess
 
+RED = "\033[91m"
+RESET = "\033[0m"
+
 image_map = os.getenv("IMAGE_MAP")
 if not image_map:
     raise ValueError("IMAGE_MAP environment variable is not set")
@@ -29,9 +32,14 @@ while len(pending) > 0:
     result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
 
     if result.returncode != 0:
-        failures.append((" ".join(cmd), result.stdout))
+        failures.append((" ".join(cmd), result.stdout, target))
         pending.append((source, target))
+        print(
+            f"{RED}[RETRY]{RESET} Push failed for {target}. Retrying... (failure count: {len(failures)})"
+        )
+        print(result.stdout)
 
 if len(failures) > 0 and (github_output := os.getenv("GITHUB_OUTPUT")):
+    failed_targets = [target for _, _, target in failures]
     with open(github_output, "a") as f:
-        f.write("slack_notify=true\n")
+        f.write(f"push_failures={json.dumps(failed_targets)}\n")
diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml
index 9b3ad0fdbb..7d3a11409b 100644
--- a/.github/workflows/_push-to-container-registry.yml
+++ b/.github/workflows/_push-to-container-registry.yml
@@ -110,12 +110,19 @@ jobs:
           IMAGE_MAP: ${{ inputs.image-map }}
 
       - name: Notify Slack if container image pushing fails
-        if: steps.push.outputs.slack_notify == 'true' || failure()
+        if: steps.push.outputs.push_failures || failure()
         uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
         with:
           method: chat.postMessage
           token: ${{ secrets.SLACK_BOT_TOKEN }}
           payload: |
             channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }}
-            text: |
-              Pushing container images failed in <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+            text: >
+              *Container image pushing ${{
+                steps.push.outcome == 'failure' && 'failed completely' || 'succeeded with some retries'
+              }}* in
+              <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+
+              ${{ steps.push.outputs.push_failures && format(
+                '*Failed targets:*\n• {0}', join(fromJson(steps.push.outputs.push_failures), '\n• ')
+              ) || '' }}

From 6ee84d985a9cccc128aed7aceccde2a210c3110a Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 4 Apr 2025 17:13:54 +0200
Subject: [PATCH 053/140] impr(perf tracing): ability to correlate with
 page_service logs (#11398)

# Problem

Current perf tracing fields do not allow answering the question what a
specific Postgres backend was waiting for.

# Background

For Pageserver logs, we set the backend PID as the libpq
`application_name` on the compute side, and funnel that into the a
tracing field for the spans that emit to the global tracing subscriber.

# Solution

Funnel `application_name`, and the other fields that we use in the
logging spans, into the root span for perf tracing.

# Refs

- fixes https://github.com/neondatabase/neon/issues/11393
- stacked atop https://github.com/neondatabase/neon/pull/11433
- epic: https://github.com/neondatabase/neon/issues/9873
---
 pageserver/src/page_service.rs | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index f9bf45bb71..7e3991dbdc 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -247,6 +247,15 @@ pub async fn libpq_listener_main(
 
 type ConnectionHandlerResult = anyhow::Result<()>;
 
+/// Perf root spans start at the per-request level, after shard routing.
+/// This struct carries connection-level information to the root perf span definition.
+#[derive(Clone)]
+struct ConnectionPerfSpanFields {
+    peer_addr: String,
+    application_name: Option<String>,
+    compute_mode: Option<String>,
+}
+
 #[instrument(skip_all, fields(peer_addr, application_name, compute_mode))]
 #[allow(clippy::too_many_arguments)]
 async fn page_service_conn_main(
@@ -271,6 +280,12 @@ async fn page_service_conn_main(
     let socket_fd = socket.as_raw_fd();
 
     let peer_addr = socket.peer_addr().context("get peer address")?;
+
+    let perf_span_fields = ConnectionPerfSpanFields {
+        peer_addr: peer_addr.to_string(),
+        application_name: None, // filled in later
+        compute_mode: None,     // filled in later
+    };
     tracing::Span::current().record("peer_addr", field::display(peer_addr));
 
     // setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements:
@@ -314,6 +329,7 @@ async fn page_service_conn_main(
         tenant_manager,
         auth,
         pipelining_config,
+        perf_span_fields,
         connection_ctx,
         cancel.clone(),
         gate_guard,
@@ -358,6 +374,8 @@ struct PageServerHandler {
     /// `process_query` creates a child context from this one.
     connection_ctx: RequestContext,
 
+    perf_span_fields: ConnectionPerfSpanFields,
+
     cancel: CancellationToken,
 
     /// None only while pagestream protocol is being processed.
@@ -703,11 +721,13 @@ impl BatchedFeMessage {
 }
 
 impl PageServerHandler {
+    #[allow(clippy::too_many_arguments)]
     pub fn new(
         conf: &'static PageServerConf,
         tenant_manager: Arc<TenantManager>,
         auth: Option<Arc<SwappableJwtAuth>>,
         pipelining_config: PageServicePipeliningConfig,
+        perf_span_fields: ConnectionPerfSpanFields,
         connection_ctx: RequestContext,
         cancel: CancellationToken,
         gate_guard: GateGuard,
@@ -717,6 +737,7 @@ impl PageServerHandler {
             auth,
             claims: None,
             connection_ctx,
+            perf_span_fields,
             timeline_handles: Some(TimelineHandles::new(tenant_manager)),
             cancel,
             pipelining_config,
@@ -754,6 +775,7 @@ impl PageServerHandler {
         tenant_id: TenantId,
         timeline_id: TimelineId,
         timeline_handles: &mut TimelineHandles,
+        conn_perf_span_fields: &ConnectionPerfSpanFields,
         cancel: &CancellationToken,
         ctx: &RequestContext,
         protocol_version: PagestreamProtocolVersion,
@@ -952,6 +974,9 @@ impl PageServerHandler {
                             info_span!(
                             target: PERF_TRACE_TARGET,
                             "GET_PAGE",
+                            peer_addr = conn_perf_span_fields.peer_addr,
+                            application_name = conn_perf_span_fields.application_name,
+                            compute_mode = conn_perf_span_fields.compute_mode,
                             tenant_id = %tenant_id,
                             shard_id = %shard.get_shard_identity().shard_slug(),
                             timeline_id = %timeline_id,
@@ -1581,6 +1606,7 @@ impl PageServerHandler {
                 tenant_id,
                 timeline_id,
                 &mut timeline_handles,
+                &self.perf_span_fields,
                 &cancel,
                 ctx,
                 protocol_version,
@@ -1714,6 +1740,8 @@ impl PageServerHandler {
         // Batcher
         //
 
+        let perf_span_fields = self.perf_span_fields.clone();
+
         let cancel_batcher = self.cancel.child_token();
         let (mut batch_tx, mut batch_rx) = spsc_fold::channel();
         let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| {
@@ -1727,6 +1755,7 @@ impl PageServerHandler {
                         tenant_id,
                         timeline_id,
                         &mut timeline_handles,
+                        &perf_span_fields,
                         &cancel_batcher,
                         &ctx,
                         protocol_version,
@@ -2669,12 +2698,14 @@ where
 
         if let FeStartupPacket::StartupMessage { params, .. } = sm {
             if let Some(app_name) = params.get("application_name") {
+                self.perf_span_fields.application_name = Some(app_name.to_string());
                 Span::current().record("application_name", field::display(app_name));
             }
             if let Some(options) = params.get("options") {
                 let (config, _) = parse_options(options);
                 for (key, value) in config {
                     if key == "neon.compute_mode" {
+                        self.perf_span_fields.compute_mode = Some(value.clone());
                         Span::current().record("compute_mode", field::display(value));
                     }
                 }

From 4f94751b75b55b695688ae2a68688c925b2f6e92 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 4 Apr 2025 19:30:58 +0200
Subject: [PATCH 054/140] pageserver config: ignore+warn about unknown fields
 (instead of `deny_unknown_fields`) (#11275)

# Refs
- refs https://github.com/neondatabase/neon/issues/8915
- discussion thread:
https://neondb.slack.com/archives/C033RQ5SPDH/p1742406381132599
- stacked atop https://github.com/neondatabase/neon/pull/11298
- corresponding internal docs update that illustrates how this PR
removes friction: https://github.com/neondatabase/docs/pull/404

# Problem

Rejecting `pageserver.toml`s with unknown fields adds friction,
especially when using `pageserver.toml` fields as feature flags that
need to be decommissioned.

See the added paragraphs on `pageserver_api::models::ConfigToml` for
details on what kind of friction it causes.

Also read the corresponding internal docs update linked above to see a
more imperative guide for using `pageserver.toml` flags as feature
flags.

# Solution

## Ignoring unknown fields

Ignoring is the serde default behavior.

So, just remove `serde(deny_unknown_fields)` from all structs in
`pageserver_api::config::ConfigToml`
`pageserver_api::config::TenantConfigToml`.

I went through all the child fields and verified they don't use
`deny_unknown_fields` either, including those shared with
`pageserver_api::models`.

## Warning about unknown fields

We still want to warn about unknown fields to
- be informed about typos in the config template
- be reminded about feature-flag style configs that have been cleaned up
in code but not yet in config templates

We tried `serde_ignore` (cf draft #11319) but it doesn't work with
`serde(flatten)`.

The solution we arrived at is to compare the on-disk TOML with the TOML
that we produce if we serialize the `ConfigToml` again.
Any key specified in the on-disk TOML but not present in the serialized
TOML is flagged as an ignored key.
The mechanism to do it is a tiny recursive decent visitor on the
`toml_edit::DocumentMut`.

# Future Work

Invalid config _values_ in known fields will continue to fail pageserver
startup.
See
- https://github.com/neondatabase/cloud/issues/24349
for current worst case impact to deployments & ideas to improve.
---
 libs/pageserver_api/src/config.rs             |  53 +++++-
 libs/pageserver_api/src/models.rs             |   2 +-
 pageserver/src/bin/pageserver.rs              |  55 ++++--
 pageserver/src/config.rs                      |  81 +-------
 pageserver/src/config/ignored_fields.rs       | 179 ++++++++++++++++++
 pageserver/src/metrics.rs                     |  27 ++-
 test_runner/fixtures/neon_fixtures.py         |  15 +-
 test_runner/regress/test_compatibility.py     |   2 +-
 test_runner/regress/test_pageserver_config.py |  56 ++++++
 .../test_pageserver_getpage_throttle.py       |   4 +
 10 files changed, 372 insertions(+), 102 deletions(-)
 create mode 100644 pageserver/src/config/ignored_fields.rs
 create mode 100644 test_runner/regress/test_pageserver_config.py

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index d0225c8918..8f56d60a4a 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -51,9 +51,54 @@ pub struct NodeMetadata {
 /// If there cannot be a static default value because we need to make runtime
 /// checks to determine the default, make it an `Option` (which defaults to None).
 /// The runtime check should be done in the consuming crate, i.e., `pageserver`.
+///
+/// Unknown fields are silently ignored during deserialization.
+/// The alternative, which we used in the past, was to set `deny_unknown_fields`,
+/// which fails deserialization, and hence pageserver startup, if there is an unknown field.
+/// The reason we don't do that anymore is that it complicates
+/// usage of config fields for feature flagging, which we commonly do for
+/// region-by-region rollouts.
+/// The complications mainly arise because the `pageserver.toml` contents on a
+/// prod server have a separate lifecycle from the pageserver binary.
+/// For instance, `pageserver.toml` contents today are defined in the internal
+/// infra repo, and thus introducing a new config field to pageserver and
+/// rolling it out to prod servers are separate commits in separate repos
+/// that can't be made or rolled back atomically.
+/// Rollbacks in particular pose a risk with deny_unknown_fields because
+/// the old pageserver binary may reject a new config field, resulting in
+/// an outage unless the person doing the pageserver rollback remembers
+/// to also revert the commit that added the config field in to the
+/// `pageserver.toml` templates in the internal infra repo.
+/// (A pre-deploy config check would eliminate this risk during rollbacks,
+///  cf [here](https://github.com/neondatabase/cloud/issues/24349).)
+/// In addition to this compatibility problem during emergency rollbacks,
+/// deny_unknown_fields adds further complications when decomissioning a feature
+/// flag: with deny_unknown_fields, we can't remove a flag from the [`ConfigToml`]
+/// until all prod servers' `pageserver.toml` files have been updated to a version
+/// that doesn't specify the flag. Otherwise new software would fail to start up.
+/// This adds the requirement for an intermediate step where the new config field
+/// is accepted but ignored, prolonging the decomissioning process by an entire
+/// release cycle.
+/// By contrast  with unknown fields silently ignored, decomissioning a feature
+/// flag is a one-step process: we can skip the intermediate step and straight
+/// remove the field from the [`ConfigToml`]. We leave the field in the
+/// `pageserver.toml` files on prod servers until we reach certainty that we
+/// will not roll back to old software whose behavior was dependent on config.
+/// Then we can remove the field from the templates in the internal infra repo.
+/// This process is [documented internally](
+/// https://docs.neon.build/storage/pageserver_configuration.html).
+///
+/// Note that above relaxed compatbility for the config format does NOT APPLY
+/// TO THE STORAGE FORMAT. As general guidance, when introducing storage format
+/// changes, ensure that the potential rollback target version will be compatible
+/// with the new format. This must hold regardless of what flags are set in in the `pageserver.toml`:
+/// any format version that exists in an environment must be compatible with the software that runs there.
+/// Use a pageserver.toml flag only to gate whether software _writes_ the new format.
+/// For more compatibility considerations, refer to [internal docs](
+/// https://docs.neon.build/storage/compat.html?highlight=compat#format-versions--compatibility)
 #[serde_as]
 #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
-#[serde(default, deny_unknown_fields)]
+#[serde(default)]
 pub struct ConfigToml {
     // types mapped 1:1 into the runtime PageServerConfig type
     pub listen_pg_addr: String,
@@ -138,7 +183,6 @@ pub struct ConfigToml {
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-#[serde(deny_unknown_fields)]
 pub struct DiskUsageEvictionTaskConfig {
     pub max_usage_pct: utils::serde_percent::Percent,
     pub min_avail_bytes: u64,
@@ -153,13 +197,11 @@ pub struct DiskUsageEvictionTaskConfig {
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case")]
-#[serde(deny_unknown_fields)]
 pub enum PageServicePipeliningConfig {
     Serial,
     Pipelined(PageServicePipeliningConfigPipelined),
 }
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-#[serde(deny_unknown_fields)]
 pub struct PageServicePipeliningConfigPipelined {
     /// Causes runtime errors if larger than max get_vectored batch size.
     pub max_batch_size: NonZeroUsize,
@@ -175,7 +217,6 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case")]
-#[serde(deny_unknown_fields)]
 pub enum GetVectoredConcurrentIo {
     /// The read path is fully sequential: layers are visited
     /// one after the other and IOs are issued and waited upon
@@ -294,7 +335,7 @@ pub struct MaxVectoredReadBytes(pub NonZeroUsize);
 
 /// Tenant-level configuration values, used for various purposes.
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-#[serde(deny_unknown_fields, default)]
+#[serde(default)]
 pub struct TenantConfigToml {
     // Flush out an inmemory layer, if it's holding WAL older than this
     // This puts a backstop on how much WAL needs to be re-digested if the
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 16d9433973..bdee46f1b1 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1104,7 +1104,7 @@ pub struct CompactionAlgorithmSettings {
 }
 
 #[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
-#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+#[serde(tag = "mode", rename_all = "kebab-case")]
 pub enum L0FlushConfig {
     #[serde(rename_all = "snake_case")]
     Direct { max_concurrency: NonZeroUsize },
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index a575904efa..9a8494292d 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -16,7 +16,7 @@ use http_utils::tls_certs::ReloadingCertificateResolver;
 use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
 use metrics::set_build_info_metric;
 use nix::sys::socket::{setsockopt, sockopt};
-use pageserver::config::{PageServerConf, PageserverIdentity};
+use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
 use pageserver::controller_upcall_client::StorageControllerUpcallClient;
 use pageserver::deletion_queue::DeletionQueue;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
@@ -98,7 +98,7 @@ fn main() -> anyhow::Result<()> {
     env::set_current_dir(&workdir)
         .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?;
 
-    let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;
+    let (conf, ignored) = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;
 
     // Initialize logging.
     //
@@ -144,7 +144,17 @@ fn main() -> anyhow::Result<()> {
         &[("node_id", &conf.id.to_string())],
     );
 
-    // after setting up logging, log the effective IO engine choice and read path implementations
+    // Warn about ignored config items; see pageserver_api::config::ConfigToml
+    // doc comment for rationale why we prefer this over serde(deny_unknown_fields).
+    {
+        let ignored_fields::Paths { paths } = &ignored;
+        for path in paths {
+            warn!(?path, "ignoring unknown configuration item");
+        }
+    }
+
+    // Log configuration items for feature-flag-like config
+    // (maybe we should automate this with a visitor?).
     info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
     info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
     info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
@@ -207,7 +217,7 @@ fn main() -> anyhow::Result<()> {
     tracing::info!("Initializing page_cache...");
     page_cache::init(conf.page_cache_size);
 
-    start_pageserver(launch_ts, conf, otel_guard).context("Failed to start pageserver")?;
+    start_pageserver(launch_ts, conf, ignored, otel_guard).context("Failed to start pageserver")?;
 
     scenario.teardown();
     Ok(())
@@ -217,7 +227,7 @@ fn initialize_config(
     identity_file_path: &Utf8Path,
     cfg_file_path: &Utf8Path,
     workdir: &Utf8Path,
-) -> anyhow::Result<&'static PageServerConf> {
+) -> anyhow::Result<(&'static PageServerConf, ignored_fields::Paths)> {
     // The deployment orchestrator writes out an indentity file containing the node id
     // for all pageservers. This file is the source of truth for the node id. In order
     // to allow for rolling back pageserver releases, the node id is also included in
@@ -246,16 +256,36 @@ fn initialize_config(
 
     let config_file_contents =
         std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?;
-    let config_toml = serde_path_to_error::deserialize(
-        toml_edit::de::Deserializer::from_str(&config_file_contents)
-            .context("build toml deserializer")?,
-    )
-    .context("deserialize config toml")?;
 
+    // Deserialize the config file contents into a ConfigToml.
+    let config_toml: pageserver_api::config::ConfigToml = {
+        let deserializer = toml_edit::de::Deserializer::from_str(&config_file_contents)
+            .context("build toml deserializer")?;
+        let mut path_to_error_track = serde_path_to_error::Track::new();
+        let deserializer =
+            serde_path_to_error::Deserializer::new(deserializer, &mut path_to_error_track);
+        serde::Deserialize::deserialize(deserializer).context("deserialize config toml")?
+    };
+
+    // Find unknown fields by re-serializing the parsed ConfigToml and comparing it to the on-disk file.
+    // Any fields that are only in the on-disk version are unknown.
+    // (The assumption here is that the ConfigToml doesn't to skip_serializing_if.)
+    // (Make sure to read the ConfigToml doc comment on why we only want to warn about, but not fail startup, on unknown fields).
+    let ignored = {
+        let ondisk_toml = config_file_contents
+            .parse::<toml_edit::DocumentMut>()
+            .context("parse original config as toml document")?;
+        let parsed_toml = toml_edit::ser::to_document(&config_toml)
+            .context("re-serialize config to toml document")?;
+        pageserver::config::ignored_fields::find(ondisk_toml, parsed_toml)
+    };
+
+    // Construct the runtime god object (it's called PageServerConf but actually is just global shared state).
     let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir)
         .context("runtime-validation of config toml")?;
+    let conf = Box::leak(Box::new(conf));
 
-    Ok(Box::leak(Box::new(conf)))
+    Ok((conf, ignored))
 }
 
 struct WaitForPhaseResult<F: std::future::Future + Unpin> {
@@ -306,6 +336,7 @@ fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
 fn start_pageserver(
     launch_ts: &'static LaunchTimestamp,
     conf: &'static PageServerConf,
+    ignored: ignored_fields::Paths,
     otel_guard: Option<OtelGuard>,
 ) -> anyhow::Result<()> {
     // Monotonic time for later calculating startup duration
@@ -329,7 +360,7 @@ fn start_pageserver(
         pageserver::metrics::tokio_epoll_uring::Collector::new(),
     ))
     .unwrap();
-    pageserver::preinitialize_metrics(conf);
+    pageserver::preinitialize_metrics(conf, ignored);
 
     // If any failpoints were set from FAILPOINTS environment variable,
     // print them to the log for debugging purposes
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index d9a5f8c381..ccc29e59d4 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -4,6 +4,8 @@
 //! file, or on the command line.
 //! See also `settings.md` for better description on every parameter.
 
+pub mod ignored_fields;
+
 use std::env;
 use std::num::NonZeroUsize;
 use std::sync::Arc;
@@ -560,7 +562,6 @@ impl PageServerConf {
 }
 
 #[derive(serde::Deserialize, serde::Serialize)]
-#[serde(deny_unknown_fields)]
 pub struct PageserverIdentity {
     pub id: NodeId,
 }
@@ -632,82 +633,4 @@ mod tests {
         PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
             .expect("parse_and_validate");
     }
-
-    /// If there's a typo in the pageserver config, we'd rather catch that typo
-    /// and fail pageserver startup than silently ignoring the typo, leaving whoever
-    /// made it in the believe that their config change is effective.
-    ///
-    /// The default in serde is to allow unknown fields, so, we rely
-    /// on developer+review discipline to add `deny_unknown_fields` when adding
-    /// new structs to the config, and these tests here as a regression test.
-    ///
-    /// The alternative to all of this would be to allow unknown fields in the config.
-    /// To catch them, we could have a config check tool or mgmt API endpoint that
-    /// compares the effective config with the TOML on disk and makes sure that
-    /// the on-disk TOML is a strict subset of the effective config.
-    mod unknown_fields_handling {
-        macro_rules! test {
-            ($short_name:ident, $input:expr) => {
-                #[test]
-                fn $short_name() {
-                    let input = $input;
-                    let err = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(&input)
-                        .expect_err("some_invalid_field is an invalid field");
-                    dbg!(&err);
-                    assert!(err.to_string().contains("some_invalid_field"));
-                }
-            };
-        }
-        use indoc::indoc;
-
-        test!(
-            toplevel,
-            indoc! {r#"
-                some_invalid_field = 23
-            "#}
-        );
-
-        test!(
-            toplevel_nested,
-            indoc! {r#"
-                [some_invalid_field]
-                foo = 23
-            "#}
-        );
-
-        test!(
-            disk_usage_based_eviction,
-            indoc! {r#"
-                [disk_usage_based_eviction]
-                some_invalid_field = 23
-            "#}
-        );
-
-        test!(
-            tenant_config,
-            indoc! {r#"
-                [tenant_config]
-                some_invalid_field = 23
-            "#}
-        );
-
-        test!(
-            l0_flush,
-            indoc! {r#"
-                [l0_flush]
-                mode = "direct"
-                some_invalid_field = 23
-            "#}
-        );
-
-        // TODO: fix this => https://github.com/neondatabase/neon/issues/8915
-        // test!(
-        //     remote_storage_config,
-        //     indoc! {r#"
-        //         [remote_storage_config]
-        //         local_path = "/nonexistent"
-        //         some_invalid_field = 23
-        //     "#}
-        // );
-    }
 }
diff --git a/pageserver/src/config/ignored_fields.rs b/pageserver/src/config/ignored_fields.rs
new file mode 100644
index 0000000000..68d0823604
--- /dev/null
+++ b/pageserver/src/config/ignored_fields.rs
@@ -0,0 +1,179 @@
+//! Check for fields in the on-disk config file that were ignored when
+//! deserializing [`pageserver_api::config::ConfigToml`].
+//!
+//! This could have been part of the [`pageserver_api::config`] module,
+//! but the way we identify unused fields in this module
+//! is specific to the format (TOML) and the implementation of the
+//! deserialization for that format ([`toml_edit`]).
+
+use std::collections::HashSet;
+
+use itertools::Itertools;
+
+/// Pass in the user-specified config and the re-serialized [`pageserver_api::config::ConfigToml`].
+/// The returned [`Paths`] contains the paths to the fields that were ignored by deserialization
+/// of the [`pageserver_api::config::ConfigToml`].
+pub fn find(user_specified: toml_edit::DocumentMut, reserialized: toml_edit::DocumentMut) -> Paths {
+    let user_specified = paths(user_specified);
+    let reserialized = paths(reserialized);
+    fn paths(doc: toml_edit::DocumentMut) -> HashSet<String> {
+        let mut out = Vec::new();
+        let mut visitor = PathsVisitor::new(&mut out);
+        visitor.visit_table_like(doc.as_table());
+        HashSet::from_iter(out)
+    }
+
+    let mut ignored = HashSet::new();
+
+    // O(n) because of HashSet
+    for path in user_specified {
+        if !reserialized.contains(&path) {
+            ignored.insert(path);
+        }
+    }
+
+    Paths {
+        paths: ignored
+            .into_iter()
+            // sort lexicographically for deterministic output
+            .sorted()
+            .collect(),
+    }
+}
+
+pub struct Paths {
+    pub paths: Vec<String>,
+}
+
+struct PathsVisitor<'a> {
+    stack: Vec<String>,
+    out: &'a mut Vec<String>,
+}
+
+impl<'a> PathsVisitor<'a> {
+    fn new(out: &'a mut Vec<String>) -> Self {
+        Self {
+            stack: Vec::new(),
+            out,
+        }
+    }
+
+    fn visit_table_like(&mut self, table_like: &dyn toml_edit::TableLike) {
+        for (entry, item) in table_like.iter() {
+            self.stack.push(entry.to_string());
+            self.visit_item(item);
+            self.stack.pop();
+        }
+    }
+
+    fn visit_item(&mut self, item: &toml_edit::Item) {
+        match item {
+            toml_edit::Item::None => (),
+            toml_edit::Item::Value(value) => self.visit_value(value),
+            toml_edit::Item::Table(table) => {
+                self.visit_table_like(table);
+            }
+            toml_edit::Item::ArrayOfTables(array_of_tables) => {
+                for (i, table) in array_of_tables.iter().enumerate() {
+                    self.stack.push(format!("[{i}]"));
+                    self.visit_table_like(table);
+                    self.stack.pop();
+                }
+            }
+        }
+    }
+
+    fn visit_value(&mut self, value: &toml_edit::Value) {
+        match value {
+            toml_edit::Value::String(_)
+            | toml_edit::Value::Integer(_)
+            | toml_edit::Value::Float(_)
+            | toml_edit::Value::Boolean(_)
+            | toml_edit::Value::Datetime(_) => self.out.push(self.stack.join(".")),
+            toml_edit::Value::Array(array) => {
+                for (i, value) in array.iter().enumerate() {
+                    self.stack.push(format!("[{i}]"));
+                    self.visit_value(value);
+                    self.stack.pop();
+                }
+            }
+            toml_edit::Value::InlineTable(inline_table) => self.visit_table_like(inline_table),
+        }
+    }
+}
+
+#[cfg(test)]
+pub(crate) mod tests {
+
+    fn test_impl(original: &str, parsed: &str, expect: [&str; 1]) {
+        let original: toml_edit::DocumentMut = original.parse().expect("parse original config");
+        let parsed: toml_edit::DocumentMut = parsed.parse().expect("parse re-serialized config");
+
+        let super::Paths { paths: actual } = super::find(original, parsed);
+        assert_eq!(actual, &expect);
+    }
+
+    #[test]
+    fn top_level() {
+        test_impl(
+            r#"
+                [a]
+                b = 1
+                c = 2
+                d = 3
+            "#,
+            r#"
+                [a]
+                b = 1
+                c = 2
+            "#,
+            ["a.d"],
+        );
+    }
+
+    #[test]
+    fn nested() {
+        test_impl(
+            r#"
+                [a.b.c]
+                d = 23
+            "#,
+            r#"
+                [a]
+                e = 42
+            "#,
+            ["a.b.c.d"],
+        );
+    }
+
+    #[test]
+    fn array_of_tables() {
+        test_impl(
+            r#"
+                [[a]]
+                b = 1
+                c = 2
+                d = 3
+            "#,
+            r#"
+                [[a]]
+                b = 1
+                c = 2
+            "#,
+            ["a.[0].d"],
+        );
+    }
+
+    #[test]
+    fn array() {
+        test_impl(
+            r#"
+            foo = [ {bar = 23} ]
+            "#,
+            r#"
+            foo = [ { blup = 42 }]
+            "#,
+            ["foo.[0].bar"],
+        );
+    }
+}
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index d8497288ca..0c5d8fed0b 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -30,6 +30,7 @@ use strum::{EnumCount, IntoEnumIterator as _, VariantNames};
 use strum_macros::{IntoStaticStr, VariantNames};
 use utils::id::TimelineId;
 
+use crate::config;
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext};
 use crate::pgdatadir_mapping::DatadirModificationStats;
@@ -4107,9 +4108,33 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
         .set(u64::try_from(num_threads.get()).unwrap());
 }
 
-pub fn preinitialize_metrics(conf: &'static PageServerConf) {
+static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_config_ignored_items",
+        "TOML items present in the on-disk configuration file but ignored by the pageserver config parser.\
+         The `item` label is the dot-separated path of the ignored item in the on-disk configuration file.\
+         The value for an unknown config item is always 1.\
+         There is a special label value \"\", which is 0, so that there is always a metric exposed (simplifies dashboards).",
+        &["item"]
+    )
+    .unwrap()
+});
+
+pub fn preinitialize_metrics(
+    conf: &'static PageServerConf,
+    ignored: config::ignored_fields::Paths,
+) {
     set_page_service_config_max_batch_size(&conf.page_service_pipelining);
 
+    PAGESERVER_CONFIG_IGNORED_ITEMS
+        .with_label_values(&[""])
+        .set(0);
+    for path in &ignored.paths {
+        PAGESERVER_CONFIG_IGNORED_ITEMS
+            .with_label_values(&[path])
+            .set(1);
+    }
+
     // Python tests need these and on some we do alerting.
     //
     // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 86b6043552..5694bf170e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1297,9 +1297,20 @@ class NeonEnv:
                 ps_cfg[key] = value
 
             # Create a corresponding NeonPageserver object
-            self.pageservers.append(
-                NeonPageserver(self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"])
+            ps = NeonPageserver(
+                self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"]
             )
+
+            if config.test_may_use_compatibility_snapshot_binaries:
+                # New features gated by pageserver config usually get rolled out in the
+                # test suite first, by enabling it in the `ps_cfg` abve.
+                # Compatibility tests run with old binaries that predate feature code & config.
+                # So, old binaries will warn about the flag's presence.
+                # Silence those warnings categorically.
+                log.info("test may use old binaries, ignoring warnings about unknown config items")
+                ps.allowed_errors.append(".*ignoring unknown configuration item.*")
+
+            self.pageservers.append(ps)
             cfg["pageservers"].append(ps_cfg)
 
         # Create config and a Safekeeper object for each safekeeper
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index fcc2e7006f..ee96daca33 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -101,7 +101,7 @@ if TYPE_CHECKING:
 #    export CHECK_ONDISK_DATA_COMPATIBILITY=true
 #    export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}
 #    export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install
-#    export NEON_BIN=target/release
+#    export NEON_BIN=target/${BUILD_TYPE}
 #    export POSTGRES_DISTRIB_DIR=pg_install
 #
 #    # Build previous version of binaries and store them somewhere:
diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py
new file mode 100644
index 0000000000..4035afd9aa
--- /dev/null
+++ b/test_runner/regress/test_pageserver_config.py
@@ -0,0 +1,56 @@
+import re
+
+import pytest
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.utils import run_only_on_default_postgres
+
+
+@pytest.mark.parametrize("what", ["default", "top_level", "nested"])
+@run_only_on_default_postgres(reason="does not use postgres")
+def test_unknown_config_items_handling(neon_simple_env: NeonEnv, what: str):
+    """
+    Ensure we log unknown config fields and expose a metric for alerting.
+    There are more unit tests in the Rust code for other TOML items.
+    """
+    env = neon_simple_env
+
+    def edit_fn(config) -> str | None:
+        if what == "default":
+            return None
+        elif what == "top_level":
+            config["unknown_top_level_config_item"] = 23
+            return r"unknown_top_level_config_item"
+        elif what == "nested":
+            config["remote_storage"]["unknown_config_item"] = 23
+            return r"remote_storage.unknown_config_item"
+        else:
+            raise ValueError(f"Unknown what: {what}")
+
+    def get_metric():
+        metrics = env.pageserver.http_client().get_metrics()
+        samples = metrics.query_all("pageserver_config_ignored_items")
+        by_item = {sample.labels["item"]: sample.value for sample in samples}
+        assert by_item[""] == 0, "must always contain the empty item with value 0"
+        del by_item[""]
+        return by_item
+
+    expected_ignored_item = env.pageserver.edit_config_toml(edit_fn)
+
+    if expected_ignored_item is not None:
+        expected_ignored_item_log_line_re = r".*ignoring unknown configuration item.*" + re.escape(
+            expected_ignored_item
+        )
+        env.pageserver.allowed_errors.append(expected_ignored_item_log_line_re)
+
+    if expected_ignored_item is not None:
+        assert not env.pageserver.log_contains(expected_ignored_item_log_line_re)
+        assert get_metric() == {}
+
+    # in any way, unknown config items should not fail pageserver to start
+    # TODO: extend this test with the config validator mode once we introduce it
+    # https://github.com/neondatabase/cloud/issues/24349
+    env.pageserver.restart()
+
+    if expected_ignored_item is not None:
+        assert env.pageserver.log_contains(expected_ignored_item_log_line_re)
+        assert get_metric() == {expected_ignored_item: 1}
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index e84876651c..3d7204d883 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -195,3 +195,7 @@ def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
     ps_http = env.pageserver.http_client()
     conf = ps_http.tenant_config(env.initial_tenant)
     assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
+
+    env.pageserver.allowed_errors.append(
+        r'.*ignoring unknown configuration item path="tenant_config\.timeline_get_throttle\.fair"*'
+    )

From aad410c8f13d0b28d705658e11057c1b1f49d9ed Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 4 Apr 2025 20:04:39 +0200
Subject: [PATCH 055/140] improve ondemand-download latency observability
 (#11421)

## Problem

We don't have metrics to exactly quantify the end user impact of
on-demand downloads.

Perf tracing is underway (#11140) to supply us with high-resolution
*samples*.

But it will also be useful to have some aggregate per-timeline and
per-instance metrics that definitively contain all observations.

## Summary of changes

This PR consists of independent commits that should be reviewed
independently.

However, for convenience, we're going to merge them together.

- refactor(metrics): measure_remote_op can use async traits
- impr(pageserver metrics): task_kind dimension for
remote_timeline_client latency histo
  - implements https://github.com/neondatabase/cloud/issues/26800
- refs
https://github.com/neondatabase/cloud/issues/26193#issuecomment-2769705793
- use the opportunity to rename the metric and add a _global suffix;
checked grafana export, it's only used in two personal dashboards, one
of them mine, the other by Heikki
- log on-demand download latency for expensive-to-query but precise
ground truth
- metric for wall clock time spent waiting for on-demand downloads

## Refs

- refs https://github.com/neondatabase/cloud/issues/26800
- a bunch of minor investigations / incidents into latency outliers
---
 libs/utils/src/elapsed_accum.rs               |  26 +++
 libs/utils/src/lib.rs                         |   2 +
 libs/utils/src/sync/heavier_once_cell.rs      |  18 +-
 pageserver/src/context.rs                     |  30 ++-
 pageserver/src/metrics.rs                     | 187 +++++++++++++-----
 .../src/tenant/remote_timeline_client.rs      |   4 +
 pageserver/src/tenant/storage_layer/layer.rs  |  18 +-
 test_runner/fixtures/metrics.py               |  24 ++-
 test_runner/regress/test_gc_aggressive.py     |   2 +-
 test_runner/regress/test_ondemand_download.py |   3 +-
 .../test_pageserver_metric_collection.py      |   2 +-
 11 files changed, 247 insertions(+), 69 deletions(-)
 create mode 100644 libs/utils/src/elapsed_accum.rs

diff --git a/libs/utils/src/elapsed_accum.rs b/libs/utils/src/elapsed_accum.rs
new file mode 100644
index 0000000000..efb2a34a95
--- /dev/null
+++ b/libs/utils/src/elapsed_accum.rs
@@ -0,0 +1,26 @@
+use std::time::{Duration, Instant};
+
+#[derive(Default)]
+pub struct ElapsedAccum {
+    accum: Duration,
+}
+
+impl ElapsedAccum {
+    pub fn get(&self) -> Duration {
+        self.accum
+    }
+    pub fn guard(&mut self) -> impl Drop + '_ {
+        let start = Instant::now();
+        scopeguard::guard(start, |last_wait_at| {
+            self.accum += Instant::now() - last_wait_at;
+        })
+    }
+
+    pub async fn measure<Fut, O>(&mut self, fut: Fut) -> O
+    where
+        Fut: Future<Output = O>,
+    {
+        let _guard = self.guard();
+        fut.await
+    }
+}
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 9389a27bf3..206b8bbd8f 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -93,6 +93,8 @@ pub mod try_rcu;
 
 pub mod guard_arc_swap;
 
+pub mod elapsed_accum;
+
 #[cfg(target_os = "linux")]
 pub mod linux_socket_ioctl;
 
diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs
index 8f8401b35d..5fb4c5b460 100644
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -111,9 +111,17 @@ impl<T> OnceCell<T> {
         }
     }
 
+    /// Like [`Self::get_or_init_detached_measured`], but without out parameter for time spent waiting.
+    pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
+        self.get_or_init_detached_measured(None).await
+    }
+
     /// Returns a guard to an existing initialized value, or returns an unique initialization
     /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
-    pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
+    pub async fn get_or_init_detached_measured(
+        &self,
+        mut wait_time: Option<&mut crate::elapsed_accum::ElapsedAccum>,
+    ) -> Result<Guard<'_, T>, InitPermit> {
         // It looks like OnceCell::get_or_init could be implemented using this method instead of
         // duplication. However, that makes the future be !Send due to possibly holding on to the
         // MutexGuard over an await point.
@@ -125,12 +133,16 @@ impl<T> OnceCell<T> {
                 }
                 guard.init_semaphore.clone()
             };
-
             {
                 let permit = {
                     // increment the count for the duration of queued
                     let _guard = CountWaitingInitializers::start(self);
-                    sem.acquire().await
+                    let fut = sem.acquire();
+                    if let Some(wait_time) = wait_time.as_mut() {
+                        wait_time.measure(fut).await
+                    } else {
+                        fut.await
+                    }
                 };
 
                 let Ok(permit) = permit else {
diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs
index 481fdb4ea2..04dcca4299 100644
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -89,7 +89,7 @@
 //! [`RequestContext`] argument. Functions in the middle of the call chain
 //! only need to pass it on.
 
-use std::sync::Arc;
+use std::{sync::Arc, time::Duration};
 
 use once_cell::sync::Lazy;
 use tracing::warn;
@@ -566,6 +566,34 @@ impl RequestContext {
         }
     }
 
+    pub(crate) fn ondemand_download_wait_observe(&self, duration: Duration) {
+        if duration == Duration::ZERO {
+            return;
+        }
+
+        match &self.scope {
+            Scope::Timeline { arc_arc } => arc_arc
+                .wait_ondemand_download_time
+                .observe(self.task_kind, duration),
+            _ => {
+                use once_cell::sync::Lazy;
+                use std::sync::Mutex;
+                use std::time::Duration;
+                use utils::rate_limit::RateLimit;
+                static LIMIT: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
+                let mut guard = LIMIT.lock().unwrap();
+                guard.call2(|rate_limit_stats| {
+                    warn!(
+                        %rate_limit_stats,
+                        backtrace=%std::backtrace::Backtrace::force_capture(),
+                        "ondemand downloads should always happen within timeline scope",
+                    );
+                });
+            }
+        }
+    }
+
     pub(crate) fn perf_follows_from(&self, from: &RequestContext) {
         if let (Some(span), Some(from_span)) = (&self.perf_span, &from.perf_span) {
             span.inner().follows_from(from_span.inner());
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 0c5d8fed0b..1fe51021fd 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,10 +1,8 @@
 use std::collections::HashMap;
 use std::num::NonZeroUsize;
 use std::os::fd::RawFd;
-use std::pin::Pin;
 use std::sync::atomic::AtomicU64;
 use std::sync::{Arc, Mutex};
-use std::task::{Context, Poll};
 use std::time::{Duration, Instant};
 
 use enum_map::{Enum as _, EnumMap};
@@ -23,7 +21,6 @@ use pageserver_api::config::{
 };
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
-use pin_project_lite::pin_project;
 use postgres_backend::{QueryError, is_expected_io_error};
 use pq_proto::framed::ConnectionError;
 use strum::{EnumCount, IntoEnumIterator as _, VariantNames};
@@ -500,6 +497,100 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy<IntCounter> = Lazy::n
     .expect("failed to define a metric")
 });
 
+pub(crate) mod wait_ondemand_download_time {
+    use super::*;
+    const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[
+        0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, // 10 ms - 100ms
+        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, // 100ms to 1s
+        1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, // 1s to 10s
+        10.0, 20.0, 30.0, 40.0, 50.0, 60.0, // 10s to 1m
+    ];
+
+    /// The task kinds for which we want to track wait times for on-demand downloads.
+    /// Other task kinds' wait times are accumulated in label value `unknown`.
+    pub(crate) const WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS: [TaskKind; 2] = [
+        TaskKind::PageRequestHandler,
+        TaskKind::WalReceiverConnectionHandler,
+    ];
+
+    pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL: Lazy<Vec<Histogram>> = Lazy::new(|| {
+        let histo = register_histogram_vec!(
+            "pageserver_wait_ondemand_download_seconds_global",
+            "Observations are individual tasks' wait times for on-demand downloads. \
+         If N tasks coalesce on an on-demand download, and it takes 10s, than we observe N * 10s.",
+            &["task_kind"],
+            WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS.into(),
+        )
+        .expect("failed to define a metric");
+        WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS
+            .iter()
+            .map(|task_kind| histo.with_label_values(&[task_kind.into()]))
+            .collect::<Vec<_>>()
+    });
+
+    pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_SUM: Lazy<CounterVec> = Lazy::new(|| {
+        register_counter_vec!(
+            // use a name that _could_ be evolved into a per-timeline histogram later
+            "pageserver_wait_ondemand_download_seconds_sum",
+            "Like `pageserver_wait_ondemand_download_seconds_global` but per timeline",
+            &["tenant_id", "shard_id", "timeline_id", "task_kind"],
+        )
+        .unwrap()
+    });
+
+    pub struct WaitOndemandDownloadTimeSum {
+        counters: [Counter; WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS.len()],
+    }
+
+    impl WaitOndemandDownloadTimeSum {
+        pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self {
+            let counters = WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS
+                .iter()
+                .map(|task_kind| {
+                    WAIT_ONDEMAND_DOWNLOAD_TIME_SUM
+                        .get_metric_with_label_values(&[
+                            tenant_id,
+                            shard_id,
+                            timeline_id,
+                            task_kind.into(),
+                        ])
+                        .unwrap()
+                })
+                .collect::<Vec<_>>();
+            Self {
+                counters: counters.try_into().unwrap(),
+            }
+        }
+        pub(crate) fn observe(&self, task_kind: TaskKind, duration: Duration) {
+            let maybe = WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS
+                .iter()
+                .enumerate()
+                .find(|(_, kind)| **kind == task_kind);
+            let Some((idx, _)) = maybe else {
+                return;
+            };
+            WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL[idx].observe(duration.as_secs_f64());
+            let counter = &self.counters[idx];
+            counter.inc_by(duration.as_secs_f64());
+        }
+    }
+
+    pub(crate) fn shutdown_timeline(tenant_id: &str, shard_id: &str, timeline_id: &str) {
+        for task_kind in WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS {
+            let _ = WAIT_ONDEMAND_DOWNLOAD_TIME_SUM.remove_label_values(&[
+                tenant_id,
+                shard_id,
+                timeline_id,
+                task_kind.into(),
+            ]);
+        }
+    }
+
+    pub(crate) fn preinitialize_global_metrics() {
+        Lazy::force(&WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL);
+    }
+}
+
 static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_last_record_lsn",
@@ -2315,13 +2406,18 @@ impl RemoteOpFileKind {
     }
 }
 
-pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+pub(crate) static REMOTE_TIMELINE_CLIENT_COMPLETION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
-        "pageserver_remote_operation_seconds",
-        "Time spent on remote storage operations. \
-        Grouped by tenant, timeline, operation_kind and status. \
+        "pageserver_remote_timeline_client_seconds_global",
+        "Time spent on remote timeline client operations. \
+        Grouped by task_kind, file_kind, operation_kind and status. \
+        The task_kind is \
+          - for layer downloads, populated from RequestContext (primary objective of having the label) \
+          - for index downloads, set to 'unknown' \
+          - for any upload operation, set to 'RemoteUploadTask' \
+        This keeps dimensionality at bay. \
         Does not account for time spent waiting in remote timeline client's queues.",
-        &["file_kind", "op_kind", "status"]
+        &["task_kind", "file_kind", "op_kind", "status"]
     )
     .expect("failed to define a metric")
 });
@@ -2883,6 +2979,7 @@ pub(crate) struct TimelineMetrics {
     pub storage_io_size: StorageIoSizeMetrics,
     pub wait_lsn_in_progress_micros: GlobalAndPerTenantIntCounter,
     pub wait_lsn_start_finish_counterpair: IntCounterPair,
+    pub wait_ondemand_download_time: wait_ondemand_download_time::WaitOndemandDownloadTimeSum,
     shutdown: std::sync::atomic::AtomicBool,
 }
 
@@ -3028,6 +3125,13 @@ impl TimelineMetrics {
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
 
+        let wait_ondemand_download_time =
+            wait_ondemand_download_time::WaitOndemandDownloadTimeSum::new(
+                &tenant_id,
+                &shard_id,
+                &timeline_id,
+            );
+
         TimelineMetrics {
             tenant_id,
             shard_id,
@@ -3061,6 +3165,7 @@ impl TimelineMetrics {
             wal_records_received,
             wait_lsn_in_progress_micros,
             wait_lsn_start_finish_counterpair,
+            wait_ondemand_download_time,
             shutdown: std::sync::atomic::AtomicBool::default(),
         }
     }
@@ -3253,6 +3358,8 @@ impl TimelineMetrics {
                 .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id]);
         }
 
+        wait_ondemand_download_time::shutdown_timeline(tenant_id, shard_id, timeline_id);
+
         let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
             SmgrQueryType::GetPageAtLsn.into(),
             tenant_id,
@@ -3374,13 +3481,18 @@ impl RemoteTimelineClientMetrics {
 
     pub fn remote_operation_time(
         &self,
+        task_kind: Option<TaskKind>,
         file_kind: &RemoteOpFileKind,
         op_kind: &RemoteOpKind,
         status: &'static str,
     ) -> Histogram {
-        let key = (file_kind.as_str(), op_kind.as_str(), status);
-        REMOTE_OPERATION_TIME
-            .get_metric_with_label_values(&[key.0, key.1, key.2])
+        REMOTE_TIMELINE_CLIENT_COMPLETION_LATENCY
+            .get_metric_with_label_values(&[
+                task_kind.as_ref().map(|tk| tk.into()).unwrap_or("unknown"),
+                file_kind.as_str(),
+                op_kind.as_str(),
+                status,
+            ])
             .unwrap()
     }
 
@@ -3625,54 +3737,26 @@ impl Drop for RemoteTimelineClientMetrics {
 
 /// Wrapper future that measures the time spent by a remote storage operation,
 /// and records the time and success/failure as a prometheus metric.
-pub(crate) trait MeasureRemoteOp: Sized {
-    fn measure_remote_op(
+pub(crate) trait MeasureRemoteOp<O, E>: Sized + Future<Output = Result<O, E>> {
+    async fn measure_remote_op(
         self,
+        task_kind: Option<TaskKind>, // not all caller contexts have a RequestContext / TaskKind handy
         file_kind: RemoteOpFileKind,
         op: RemoteOpKind,
         metrics: Arc<RemoteTimelineClientMetrics>,
-    ) -> MeasuredRemoteOp<Self> {
+    ) -> Result<O, E> {
         let start = Instant::now();
-        MeasuredRemoteOp {
-            inner: self,
-            file_kind,
-            op,
-            start,
-            metrics,
-        }
+        let res = self.await;
+        let duration = start.elapsed();
+        let status = if res.is_ok() { &"success" } else { &"failure" };
+        metrics
+            .remote_operation_time(task_kind, &file_kind, &op, status)
+            .observe(duration.as_secs_f64());
+        res
     }
 }
 
-impl<T: Sized> MeasureRemoteOp for T {}
-
-pin_project! {
-    pub(crate) struct MeasuredRemoteOp<F>
-    {
-        #[pin]
-        inner: F,
-        file_kind: RemoteOpFileKind,
-        op: RemoteOpKind,
-        start: Instant,
-        metrics: Arc<RemoteTimelineClientMetrics>,
-    }
-}
-
-impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
-    type Output = Result<O, E>;
-
-    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
-        let this = self.project();
-        let poll_result = this.inner.poll(cx);
-        if let Poll::Ready(ref res) = poll_result {
-            let duration = this.start.elapsed();
-            let status = if res.is_ok() { &"success" } else { &"failure" };
-            this.metrics
-                .remote_operation_time(this.file_kind, this.op, status)
-                .observe(duration.as_secs_f64());
-        }
-        poll_result
-    }
-}
+impl<Fut, O, E> MeasureRemoteOp<O, E> for Fut where Fut: Sized + Future<Output = Result<O, E>> {}
 
 pub mod tokio_epoll_uring {
     use std::collections::HashMap;
@@ -4220,4 +4304,5 @@ pub fn preinitialize_metrics(
     Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE);
 
     tenant_throttling::preinitialize_global_metrics();
+    wait_ondemand_download_time::preinitialize_global_metrics();
 }
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 579dbeb322..10a13ef1a2 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -642,6 +642,7 @@ impl RemoteTimelineClient {
             cancel,
         )
         .measure_remote_op(
+            Option::<TaskKind>::None,
             RemoteOpFileKind::Index,
             RemoteOpKind::Download,
             Arc::clone(&self.metrics),
@@ -739,6 +740,7 @@ impl RemoteTimelineClient {
                 ctx,
             )
             .measure_remote_op(
+                Some(ctx.task_kind()),
                 RemoteOpFileKind::Layer,
                 RemoteOpKind::Download,
                 Arc::clone(&self.metrics),
@@ -2175,6 +2177,7 @@ impl RemoteTimelineClient {
                         &self.cancel,
                     )
                     .measure_remote_op(
+                        Some(TaskKind::RemoteUploadTask),
                         RemoteOpFileKind::Layer,
                         RemoteOpKind::Upload,
                         Arc::clone(&self.metrics),
@@ -2191,6 +2194,7 @@ impl RemoteTimelineClient {
                         &self.cancel,
                     )
                     .measure_remote_op(
+                        Some(TaskKind::RemoteUploadTask),
                         RemoteOpFileKind::Index,
                         RemoteOpKind::Upload,
                         Arc::clone(&self.metrics),
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 39665d2cc2..b7f6e5dc77 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -975,6 +975,10 @@ impl LayerInner {
         allow_download: bool,
         ctx: &RequestContext,
     ) -> Result<Arc<DownloadedLayer>, DownloadError> {
+        let mut wait_for_download_recorder =
+            scopeguard::guard(utils::elapsed_accum::ElapsedAccum::default(), |accum| {
+                ctx.ondemand_download_wait_observe(accum.get());
+            });
         let (weak, permit) = {
             // get_or_init_detached can:
             // - be fast (mutex lock) OR uncontested semaphore permit acquire
@@ -983,7 +987,7 @@ impl LayerInner {
 
             let locked = self
                 .inner
-                .get_or_init_detached()
+                .get_or_init_detached_measured(Some(&mut wait_for_download_recorder))
                 .await
                 .map(|mut guard| guard.get_and_upgrade().ok_or(guard));
 
@@ -1013,6 +1017,7 @@ impl LayerInner {
                 Err(permit) => (None, permit),
             }
         };
+        let _guard = wait_for_download_recorder.guard();
 
         if let Some(weak) = weak {
             // only drop the weak after dropping the heavier_once_cell guard
@@ -1202,6 +1207,7 @@ impl LayerInner {
         permit: heavier_once_cell::InitPermit,
         ctx: &RequestContext,
     ) -> Result<Arc<DownloadedLayer>, remote_storage::DownloadError> {
+        let start = std::time::Instant::now();
         let result = timeline
             .remote_client
             .download_layer_file(
@@ -1213,7 +1219,8 @@ impl LayerInner {
                 ctx,
             )
             .await;
-
+        let latency = start.elapsed();
+        let latency_millis = u64::try_from(latency.as_millis()).unwrap();
         match result {
             Ok(size) => {
                 assert_eq!(size, self.desc.file_size);
@@ -1229,9 +1236,8 @@ impl LayerInner {
                     Err(e) => {
                         panic!("post-condition failed: needs_download errored: {e:?}");
                     }
-                }
-
-                tracing::info!(size=%self.desc.file_size, "on-demand download successful");
+                };
+                tracing::info!(size=%self.desc.file_size, %latency_millis, "on-demand download successful");
                 timeline
                     .metrics
                     .resident_physical_size_add(self.desc.file_size);
@@ -1260,7 +1266,7 @@ impl LayerInner {
                     return Err(e);
                 }
 
-                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
+                tracing::error!(consecutive_failures, %latency_millis, "layer file download failed: {e:#}");
 
                 let backoff = utils::backoff::exponential_backoff_duration_seconds(
                     consecutive_failures.min(u32::MAX as usize) as u32,
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 106a588711..df500544dc 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from collections import defaultdict
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 from prometheus_client.parser import text_string_to_metric_families
 
@@ -46,14 +46,26 @@ class MetricsGetter:
     def get_metrics(self) -> Metrics:
         raise NotImplementedError()
 
-    def get_metric_value(self, name: str, filter: dict[str, str] | None = None) -> float | None:
+    def get_metric_value(
+        self,
+        name: str,
+        filter: dict[str, str] | None = None,
+        aggregate: Literal["sum"] | None = None,
+    ) -> float | None:
         metrics = self.get_metrics()
         results = metrics.query_all(name, filter=filter)
         if not results:
             log.info(f'could not find metric "{name}"')
             return None
-        assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
-        return results[0].value
+        if aggregate is None:
+            assert len(results) == 1, (
+                f"metric {name} with given filters is not unique, got: {results}"
+            )
+            return results[0].value
+        elif aggregate == "sum":
+            return sum(sample.value for sample in results)
+        else:
+            raise RuntimeError(f"unknown aggregate function {aggregate}")
 
     def get_metrics_values(
         self, names: list[str], filter: dict[str, str] | None = None, absence_ok: bool = False
@@ -132,7 +144,7 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
     *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
     *histogram("pageserver_smgr_query_seconds_global"),
     *histogram("pageserver_wait_lsn_seconds"),
-    *histogram("pageserver_remote_operation_seconds"),
+    *histogram("pageserver_remote_timeline_client_seconds_global"),
     *histogram("pageserver_io_operations_seconds"),
     "pageserver_smgr_query_started_global_count_total",
     "pageserver_tenant_states_count",
@@ -143,6 +155,7 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
     counter("pageserver_tenant_throttling_wait_usecs_sum_global"),
     counter("pageserver_tenant_throttling_count_global"),
     *histogram("pageserver_tokio_epoll_uring_slots_submission_queue_depth"),
+    *histogram("pageserver_wait_ondemand_download_seconds_global"),
 )
 
 PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
@@ -180,6 +193,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
     counter("pageserver_wait_lsn_in_progress_micros"),
     counter("pageserver_wait_lsn_started_count"),
     counter("pageserver_wait_lsn_finished_count"),
+    counter("pageserver_wait_ondemand_download_seconds_sum"),
     *histogram("pageserver_page_service_batch_size"),
     *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"),
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py
index 4c196a099b..c83004583a 100644
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -126,7 +126,7 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder):
         ps_metrics = env.pageserver.http_client().get_metrics()
         total = 0.0
         for sample in ps_metrics.query_all(
-            name="pageserver_remote_operation_seconds_count",
+            name="pageserver_remote_timeline_client_seconds_global_count",
             filter={
                 "file_kind": str(file_kind),
                 "op_kind": str(op_kind),
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index b292d08b60..2590a3fe9d 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -38,12 +38,13 @@ def get_num_downloaded_layers(client: PageserverHttpClient):
     This assumes that the pageserver only has a single tenant.
     """
     value = client.get_metric_value(
-        "pageserver_remote_operation_seconds_count",
+        "pageserver_remote_timeline_client_seconds_global_count",
         {
             "file_kind": "layer",
             "op_kind": "download",
             "status": "success",
         },
+        "sum",
     )
     if value is None:
         return 0
diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py
index f80edced5c..acec0ba44a 100644
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -107,7 +107,7 @@ def test_metric_collection(
         ps_metrics = env.pageserver.http_client().get_metrics()
         total = 0.0
         for sample in ps_metrics.query_all(
-            name="pageserver_remote_operation_seconds_count",
+            name="pageserver_remote_timeline_client_seconds_global_count",
             filter={
                 "file_kind": str(file_kind),
                 "op_kind": str(op_kind),

From 2841f1ffa5555601d34a89aaab38abbc99071a8e Mon Sep 17 00:00:00 2001
From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com>
Date: Fri, 4 Apr 2025 14:21:23 -0400
Subject: [PATCH 056/140] removal of `pg_embedding` (#11440)

## Problem

The `pg_embedding` extension has been deprecated and can cause issues
with recent changes such as with
https://github.com/neondatabase/neon/issues/10973

Issue: `PG:2025-04-03 15:39:25.498 GMT
ttid=a4de5bee50225424b053dc64bac96d87/d6f3891b8f968458b3f7edea58fb3c6f
sqlstate=58P01 [15526] ERROR: could not load library
"/usr/local/lib/embedding.so": /usr/local/lib/embedding.so: undefined
symbol: SetLastWrittenLSNForRelation`

## Summary of changes

Removed `pg_embedding` extension from the compute image.
---
 compute/compute-node.Dockerfile | 35 ---------------------------------
 1 file changed, 35 deletions(-)

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index e3732e1ed2..83cbacf034 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1022,39 +1022,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
 
-#########################################################################################
-#
-# Layer "pg_embedding-build"
-# compile pg_embedding extension
-#
-#########################################################################################
-FROM build-deps AS pg_embedding-src
-ARG PG_VERSION
-
-# This is our extension, support stopped in favor of pgvector
-# TODO: deprecate it
-WORKDIR /ext-src
-RUN case "${PG_VERSION:?}" in \
-      "v14" | "v15") \
-        export PG_EMBEDDING_VERSION=0.3.5 \
-        export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
-        ;; \
-      *) \
-        echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \
-    esac && \
-    wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
-    echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
-    mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C .
-
-FROM pg-build AS pg_embedding-build
-COPY --from=pg_embedding-src /ext-src/ /ext-src/
-WORKDIR /ext-src/
-RUN  if [ -d pg_embedding-src ]; then \
-        cd pg_embedding-src && \
-        make -j $(getconf _NPROCESSORS_ONLN) && \
-        make -j $(getconf _NPROCESSORS_ONLN) install; \
-    fi
-
 #########################################################################################
 #
 # Layer "pg build with nonroot user and cargo installed"
@@ -1647,7 +1614,6 @@ COPY --from=rdkit-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=pg_embedding-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql
 COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1824,7 +1790,6 @@ COPY --from=pg_cron-src /ext-src/ /ext-src/
 COPY --from=pg_uuidv7-src /ext-src/ /ext-src/
 COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/
 COPY --from=pg_semver-src /ext-src/ /ext-src/
-#COPY --from=pg_embedding-src /ext-src/ /ext-src/
 #COPY --from=wal2json-src /ext-src/ /ext-src/
 COPY --from=pg_ivm-src /ext-src/ /ext-src/
 COPY --from=pg_partman-src /ext-src/ /ext-src/

From 417b2781d9fff7b8dbfda7e1596ef1a93096eede Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sat, 5 Apr 2025 15:00:51 +0200
Subject: [PATCH 057/140] build(deps): bump openssl from 0.10.70 to 0.10.72 in
 /test_runner/pg_clients/rust/tokio-postgres in the cargo group across 1
 directory (#11455)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 test_runner/pg_clients/rust/tokio-postgres/Cargo.lock | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
index 0b138bf167..027be03707 100644
--- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
+++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
@@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4
 
 [[package]]
 name = "addr2line"
@@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 
 [[package]]
 name = "openssl"
-version = "0.10.70"
+version = "0.10.72"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6"
+checksum = "fedfea7d58a1f73118430a55da6a286e7b044961736ce96a16a17068ea25e5da"
 dependencies = [
  "bitflags 2.6.0",
  "cfg-if",
@@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
 
 [[package]]
 name = "openssl-sys"
-version = "0.9.105"
+version = "0.9.107"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc"
+checksum = "8288979acd84749c744a9014b4382d42b8f7b2592847b5afb2ed29e5d16ede07"
 dependencies = [
  "cc",
  "libc",

From 1a87975d956a8ad17ec8b85da32a137ec4893fcc Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sun, 6 Apr 2025 18:34:13 +0300
Subject: [PATCH 058/140] Misc cleanup of #includes and comments in the neon
 extension (#11456)

Remove useless and often wrong IDENTIFICATION comments. PostgreSQL
sources have them, mostly for historical reasons, but there's no need
for us to copy that style.

Remove unnecessary #includes in header files, putting the #includes
directly in the .c files that need them. The principle is that a header
file should #include other header files if they need definitions from
them, such that each header file can be compiled on its own, but not
other #includes. (There are tools to enforce that, but this was just a
manual clean up of violations that I happened to spot.)
---
 pgxn/neon/bitmap.h                      |  2 +-
 pgxn/neon/control_plane_connector.c     |  3 ---
 pgxn/neon/extension_server.c            |  3 ---
 pgxn/neon/extension_server.h            |  3 ---
 pgxn/neon/file_cache.c                  |  6 +-----
 pgxn/neon/libpagestore.c                |  5 +----
 pgxn/neon/logical_replication_monitor.c |  4 ++--
 pgxn/neon/neon.c                        |  5 +----
 pgxn/neon/neon.h                        | 12 +++++-------
 pgxn/neon/neon_perf_counters.h          |  2 +-
 pgxn/neon/neon_walreader.c              |  1 +
 pgxn/neon/pagestore_client.h            |  9 +++------
 pgxn/neon/pagestore_smgr.c              |  5 +----
 pgxn/neon/relsize_cache.c               |  4 ----
 pgxn/neon/walproposer_compat.c          |  1 +
 15 files changed, 18 insertions(+), 47 deletions(-)

diff --git a/pgxn/neon/bitmap.h b/pgxn/neon/bitmap.h
index 0a131816ef..21efd13547 100644
--- a/pgxn/neon/bitmap.h
+++ b/pgxn/neon/bitmap.h
@@ -9,4 +9,4 @@
 #define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7))
 #define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7))
 
-#endif //NEON_BITMAP_H
+#endif							/* NEON_BITMAP_H */
diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c
index 59096a1bc8..47ed37da06 100644
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -13,9 +13,6 @@
  *        accumulate changes. On subtransaction commit, the top of the stack
  *        is merged with the table below it.
  *
- * IDENTIFICATION
- *	 contrib/neon/control_plane_connector.c
- *
  *-------------------------------------------------------------------------
  */
 
diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c
index 0331f961b4..00dcb6920e 100644
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -3,9 +3,6 @@
  * extension_server.c
  *	  Request compute_ctl to download extension files.
  *
- * IDENTIFICATION
- *	 contrib/neon/extension_server.c
- *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
diff --git a/pgxn/neon/extension_server.h b/pgxn/neon/extension_server.h
index 3e67708b85..8356d70959 100644
--- a/pgxn/neon/extension_server.h
+++ b/pgxn/neon/extension_server.h
@@ -3,9 +3,6 @@
  * extension_server.h
  *	  Request compute_ctl to download extension files.
  *
- * IDENTIFICATION
- *	 contrib/neon/extension_server.h
- *
  *-------------------------------------------------------------------------
  */
 
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 91f5eb272a..97a4c39e49 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -1,4 +1,4 @@
-/*
+/*-------------------------------------------------------------------------
  *
  * file_cache.c
  *
@@ -6,10 +6,6 @@
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- *
- * IDENTIFICATION
- *	  pgxn/neon/file_cache.c
- *
  *-------------------------------------------------------------------------
  */
 
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 11ef9af36b..60b2249461 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -6,10 +6,6 @@
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- *
- * IDENTIFICATION
- *	 contrib/neon/libpqpagestore.c
- *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
@@ -34,6 +30,7 @@
 #include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
 #include "utils/guc.h"
+#include "utils/memutils.h"
 
 #include "neon.h"
 #include "neon_perf_counters.h"
diff --git a/pgxn/neon/logical_replication_monitor.c b/pgxn/neon/logical_replication_monitor.c
index b94faafdfa..69426c2e83 100644
--- a/pgxn/neon/logical_replication_monitor.c
+++ b/pgxn/neon/logical_replication_monitor.c
@@ -1,11 +1,11 @@
+#include "postgres.h"
+
 #include <dirent.h>
 #include <limits.h>
 #include <string.h>
 #include <signal.h>
 #include <sys/stat.h>
 
-#include "postgres.h"
-
 #include "miscadmin.h"
 #include "postmaster/bgworker.h"
 #include "postmaster/interrupt.h"
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index b738b5ebd1..081025e2d5 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -1,10 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * neon.c
- *	  Utility functions to expose neon specific information to user
- *
- * IDENTIFICATION
- *	 contrib/neon/neon.c
+ *	  Main entry point into the neon exension
  *
  *-------------------------------------------------------------------------
  */
diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h
index c9beb8c318..e2fa136e37 100644
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -3,15 +3,13 @@
  * neon.h
  *	  Functions used in the initialization of this extension.
  *
- * IDENTIFICATION
- *	 contrib/neon/neon.h
- *
  *-------------------------------------------------------------------------
  */
 
 #ifndef NEON_H
 #define NEON_H
-#include "access/xlogreader.h"
+
+#include "access/xlogdefs.h"
 #include "utils/wait_event.h"
 
 /* GUCs */
@@ -58,8 +56,8 @@ extern void SetNeonCurrentClusterSize(uint64 size);
 extern uint64 GetNeonCurrentClusterSize(void);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
 
-extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
-extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
-PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
+extern PGDLLEXPORT void WalProposerSync(int argc, char *argv[]);
+extern PGDLLEXPORT void WalProposerMain(Datum main_arg);
+extern PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
 
 #endif							/* NEON_H */
diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h
index 8edc658a30..5f5330bb69 100644
--- a/pgxn/neon/neon_perf_counters.h
+++ b/pgxn/neon/neon_perf_counters.h
@@ -12,8 +12,8 @@
 #include "storage/procnumber.h"
 #else
 #include "storage/backendid.h"
-#include "storage/proc.h"
 #endif
+#include "storage/proc.h"
 
 static const uint64 io_wait_bucket_thresholds[] = {
 	       2,        3,        6,        10,  /* 0 us   - 10 us */
diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c
index 5854a7ef0f..be2c4ddf79 100644
--- a/pgxn/neon/neon_walreader.c
+++ b/pgxn/neon/neon_walreader.c
@@ -20,6 +20,7 @@
 #include "access/xlogreader.h"
 #include "libpq/pqformat.h"
 #include "storage/fd.h"
+#include "utils/memutils.h"
 #include "utils/wait_event.h"
 
 #include "libpq-fe.h"
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 68f7430343..d90dad6cc5 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -8,8 +8,8 @@
  *
  *-------------------------------------------------------------------------
  */
-#ifndef pageserver_h
-#define pageserver_h
+#ifndef PAGESTORE_CLIENT_h
+#define PAGESTORE_CLIENT_h
 
 #include "neon_pgversioncompat.h"
 
@@ -17,11 +17,8 @@
 #include "access/xlogdefs.h"
 #include RELFILEINFO_HDR
 #include "lib/stringinfo.h"
-#include "libpq/pqformat.h"
 #include "storage/block.h"
 #include "storage/buf_internals.h"
-#include "storage/smgr.h"
-#include "utils/memutils.h"
 
 #define MAX_SHARDS 128
 #define MAX_PAGESERVER_CONNSTRING_SIZE 256
@@ -326,4 +323,4 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	return lfc_writev(rinfo, forkNum, blkno, &buffer, 1);
 }
 
-#endif
+#endif							/* PAGESTORE_CLIENT_H */
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 0baa23cc30..0ed00061cb 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -37,10 +37,6 @@
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- *
- * IDENTIFICATION
- *	  contrib/neon/pagestore_smgr.c
- *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
@@ -55,6 +51,7 @@
 #include "catalog/pg_class.h"
 #include "common/hashfn.h"
 #include "executor/instrument.h"
+#include "libpq/pqformat.h"
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
 #include "postmaster/interrupt.h"
diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c
index 2a4c2dc799..60ca1675d9 100644
--- a/pgxn/neon/relsize_cache.c
+++ b/pgxn/neon/relsize_cache.c
@@ -6,10 +6,6 @@
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- *
- * IDENTIFICATION
- *	  contrib/neon/relsize_cache.c
- *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c
index a986160224..b9460feb21 100644
--- a/pgxn/neon/walproposer_compat.c
+++ b/pgxn/neon/walproposer_compat.c
@@ -7,6 +7,7 @@
 
 #include <stdio.h>
 
+#include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "utils/datetime.h"
 #include "walproposer.h"

From ad9655bb0113c4c7ccbc9ab9c33bb5797f935ec7 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Sun, 6 Apr 2025 21:30:21 +0200
Subject: [PATCH 059/140] Fix the errors in pg_regress test running on the
 staging. (#11432)

## Problem
The shared libraries preloaded by default interfered with the
`pg_regress` tests on staging, causing wrong results
## Summary of changes
The projects used for these tests are now free from unnecessary
extensions. Some changes were made in patches.
---
 compute/patches/cloud_regress_pg16.patch | 81 ++++++++----------------
 compute/patches/cloud_regress_pg17.patch | 55 ++++++++--------
 2 files changed, 53 insertions(+), 83 deletions(-)

diff --git a/compute/patches/cloud_regress_pg16.patch b/compute/patches/cloud_regress_pg16.patch
index 3f0bb84ae7..ae415a5412 100644
--- a/compute/patches/cloud_regress_pg16.patch
+++ b/compute/patches/cloud_regress_pg16.patch
@@ -202,10 +202,10 @@ index cf0b80d616..e8e2a14a4a 100644
  COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment';
  ERROR:  must be owner of relation constraint_comments_tbl
 diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
-index 442e7aff2b..525f732b03 100644
+index d785f92561..16377e5ac9 100644
 --- a/src/test/regress/expected/conversion.out
 +++ b/src/test/regress/expected/conversion.out
-@@ -8,7 +8,7 @@
+@@ -15,7 +15,7 @@ SELECT FROM test_enc_setup();
  CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
      AS :'regresslib', 'test_enc_conversion'
      LANGUAGE C STRICT;
@@ -587,16 +587,15 @@ index f551624afb..57f1e432d4 100644
  SELECT *
     INTO TABLE ramp
 diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out
-index 454db91ec0..01378d7081 100644
+index 4cbdbdf84d..573362850e 100644
 --- a/src/test/regress/expected/database.out
 +++ b/src/test/regress/expected/database.out
-@@ -1,8 +1,7 @@
+@@ -1,8 +1,6 @@
  CREATE DATABASE regression_tbd
  	ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0;
  ALTER DATABASE regression_tbd RENAME TO regression_utf8;
 -ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace;
 -ALTER DATABASE regression_utf8 RESET TABLESPACE;
-+WARNING:  you need to manually restart any running background workers after this command
  ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123;
  -- Test PgDatabaseToastTable.  Doing this with GRANT would be slow.
  BEGIN;
@@ -700,7 +699,7 @@ index 6ed50fdcfa..caa00a345d 100644
  COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
  CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator;
 diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out
-index 6b8c2f2414..8e13b7fa46 100644
+index 84745b9f60..4883c12351 100644
 --- a/src/test/regress/expected/foreign_key.out
 +++ b/src/test/regress/expected/foreign_key.out
 @@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
@@ -1112,7 +1111,7 @@ index 8475231735..0653946337 100644
  DROP ROLE regress_passwd_sha_len1;
  DROP ROLE regress_passwd_sha_len2;
 diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
-index 5b9dba7b32..cc408dad42 100644
+index 620fbe8c52..0570102357 100644
 --- a/src/test/regress/expected/privileges.out
 +++ b/src/test/regress/expected/privileges.out
 @@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3
@@ -1174,8 +1173,8 @@ index 5b9dba7b32..cc408dad42 100644
 +CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2;
  ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
  GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1;
- SET SESSION AUTHORIZATION regress_priv_user1;
-@@ -239,12 +239,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
+ SET SESSION AUTHORIZATION regress_priv_user3;
+@@ -246,12 +246,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
  ERROR:  permission denied to grant privileges as role "regress_priv_role"
  DETAIL:  The grantor must have the ADMIN option on role "regress_priv_role".
  GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE;
@@ -1192,7 +1191,7 @@ index 5b9dba7b32..cc408dad42 100644
  DROP ROLE regress_priv_role;
  SET SESSION AUTHORIZATION regress_priv_user1;
  SELECT session_user, current_user;
-@@ -1776,7 +1780,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+@@ -1783,7 +1787,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
  
  -- security-restricted operations
  \c -
@@ -1201,7 +1200,7 @@ index 5b9dba7b32..cc408dad42 100644
  -- Check that index expressions and predicates are run as the table's owner
  -- A dummy index function checking current_user
  CREATE FUNCTION sro_ifun(int) RETURNS int AS $$
-@@ -2668,8 +2672,8 @@ drop cascades to function testns.priv_testagg(integer)
+@@ -2675,8 +2679,8 @@ drop cascades to function testns.priv_testagg(integer)
  drop cascades to function testns.priv_testproc(integer)
  -- Change owner of the schema & and rename of new schema owner
  \c -
@@ -1212,7 +1211,7 @@ index 5b9dba7b32..cc408dad42 100644
  SET SESSION ROLE regress_schemauser1;
  CREATE SCHEMA testns;
  SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid;
-@@ -2792,7 +2796,7 @@ DROP USER regress_priv_user7;
+@@ -2799,7 +2803,7 @@ DROP USER regress_priv_user7;
  DROP USER regress_priv_user8; -- does not exist
  ERROR:  role "regress_priv_user8" does not exist
  -- permissions with LOCK TABLE
@@ -1221,7 +1220,7 @@ index 5b9dba7b32..cc408dad42 100644
  CREATE TABLE lock_table (a int);
  -- LOCK TABLE and SELECT permission
  GRANT SELECT ON lock_table TO regress_locktable_user;
-@@ -2874,7 +2878,7 @@ DROP USER regress_locktable_user;
+@@ -2881,7 +2885,7 @@ DROP USER regress_locktable_user;
  -- pg_backend_memory_contexts.
  -- switch to superuser
  \c -
@@ -1230,7 +1229,7 @@ index 5b9dba7b32..cc408dad42 100644
  SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
   has_table_privilege 
  ---------------------
-@@ -2918,10 +2922,10 @@ RESET ROLE;
+@@ -2925,10 +2929,10 @@ RESET ROLE;
  -- clean up
  DROP ROLE regress_readallstats;
  -- test role grantor machinery
@@ -1245,7 +1244,7 @@ index 5b9dba7b32..cc408dad42 100644
  GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
  GRANT regress_group_direct_manager TO regress_group_indirect_manager;
  SET SESSION AUTHORIZATION regress_group_direct_manager;
-@@ -2950,9 +2954,9 @@ DROP ROLE regress_group_direct_manager;
+@@ -2957,9 +2961,9 @@ DROP ROLE regress_group_direct_manager;
  DROP ROLE regress_group_indirect_manager;
  DROP ROLE regress_group_member;
  -- test SET and INHERIT options with object ownership changes
@@ -1841,7 +1840,7 @@ index 09a255649b..15895f0c53 100644
  CREATE TABLE ruletest_t2 (x int);
  CREATE VIEW ruletest_v1 WITH (security_invoker=true) AS
 diff --git a/src/test/regress/expected/security_label.out b/src/test/regress/expected/security_label.out
-index a8e01a6220..5a9cef4ede 100644
+index a8e01a6220..83543b250a 100644
 --- a/src/test/regress/expected/security_label.out
 +++ b/src/test/regress/expected/security_label.out
 @@ -6,8 +6,8 @@ SET client_min_messages TO 'warning';
@@ -1855,34 +1854,6 @@ index a8e01a6220..5a9cef4ede 100644
  CREATE TABLE seclabel_tbl1 (a int, b text);
  CREATE TABLE seclabel_tbl2 (x int, y text);
  CREATE VIEW seclabel_view1 AS SELECT * FROM seclabel_tbl2;
-@@ -19,21 +19,21 @@ ALTER TABLE seclabel_tbl2 OWNER TO regress_seclabel_user2;
- -- Test of SECURITY LABEL statement without a plugin
- --
- SECURITY LABEL ON TABLE seclabel_tbl1 IS 'classified';			-- fail
--ERROR:  no security label providers have been loaded
-+ERROR:  must specify provider when multiple security label providers have been loaded
- SECURITY LABEL FOR 'dummy' ON TABLE seclabel_tbl1 IS 'classified';		-- fail
- ERROR:  security label provider "dummy" is not loaded
- SECURITY LABEL ON TABLE seclabel_tbl1 IS '...invalid label...';		-- fail
--ERROR:  no security label providers have been loaded
-+ERROR:  must specify provider when multiple security label providers have been loaded
- SECURITY LABEL ON TABLE seclabel_tbl3 IS 'unclassified';			-- fail
--ERROR:  no security label providers have been loaded
-+ERROR:  must specify provider when multiple security label providers have been loaded
- SECURITY LABEL ON ROLE regress_seclabel_user1 IS 'classified';			-- fail
--ERROR:  no security label providers have been loaded
-+ERROR:  must specify provider when multiple security label providers have been loaded
- SECURITY LABEL FOR 'dummy' ON ROLE regress_seclabel_user1 IS 'classified';		-- fail
- ERROR:  security label provider "dummy" is not loaded
- SECURITY LABEL ON ROLE regress_seclabel_user1 IS '...invalid label...';		-- fail
--ERROR:  no security label providers have been loaded
-+ERROR:  must specify provider when multiple security label providers have been loaded
- SECURITY LABEL ON ROLE regress_seclabel_user3 IS 'unclassified';			-- fail
--ERROR:  no security label providers have been loaded
-+ERROR:  must specify provider when multiple security label providers have been loaded
- -- clean up objects
- DROP FUNCTION seclabel_four();
- DROP DOMAIN seclabel_domain;
 diff --git a/src/test/regress/expected/select_into.out b/src/test/regress/expected/select_into.out
 index b79fe9a1c0..e29fab88ab 100644
 --- a/src/test/regress/expected/select_into.out
@@ -2413,10 +2384,10 @@ index e3e3bea709..fa86ddc326 100644
  COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment';
  COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS 'no, another comment';
 diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
-index 9a65fca91f..58431a3056 100644
+index b567a1a572..4d1ac2e631 100644
 --- a/src/test/regress/sql/conversion.sql
 +++ b/src/test/regress/sql/conversion.sql
-@@ -12,7 +12,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r
+@@ -17,7 +17,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r
      AS :'regresslib', 'test_enc_conversion'
      LANGUAGE C STRICT;
  
@@ -2780,7 +2751,7 @@ index ae6841308b..47bc792e30 100644
  
  SELECT *
 diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql
-index 0367c0e37a..a23b98c4bd 100644
+index 46ad263478..eb05584ed5 100644
 --- a/src/test/regress/sql/database.sql
 +++ b/src/test/regress/sql/database.sql
 @@ -1,8 +1,6 @@
@@ -2893,7 +2864,7 @@ index aa147b14a9..370e0dd570 100644
  CREATE FOREIGN DATA WRAPPER dummy;
  COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
 diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql
-index 45c7a534cb..32dd26b8cd 100644
+index 9f4210b26e..620d3fc87e 100644
 --- a/src/test/regress/sql/foreign_key.sql
 +++ b/src/test/regress/sql/foreign_key.sql
 @@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
@@ -3246,7 +3217,7 @@ index 53e86b0b6c..0303fdfe96 100644
  -- Check that the invalid secrets were re-hashed. A re-hashed secret
  -- should not contain the original salt.
 diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
-index 249df17a58..b258e7f26a 100644
+index 259f1aedd1..6e1a3d17b7 100644
 --- a/src/test/regress/sql/privileges.sql
 +++ b/src/test/regress/sql/privileges.sql
 @@ -24,18 +24,18 @@ RESET client_min_messages;
@@ -3308,7 +3279,7 @@ index 249df17a58..b258e7f26a 100644
  
  ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
  
-@@ -1157,7 +1157,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+@@ -1160,7 +1160,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
  
  -- security-restricted operations
  \c -
@@ -3317,7 +3288,7 @@ index 249df17a58..b258e7f26a 100644
  
  -- Check that index expressions and predicates are run as the table's owner
  
-@@ -1653,8 +1653,8 @@ DROP SCHEMA testns CASCADE;
+@@ -1656,8 +1656,8 @@ DROP SCHEMA testns CASCADE;
  -- Change owner of the schema & and rename of new schema owner
  \c -
  
@@ -3328,7 +3299,7 @@ index 249df17a58..b258e7f26a 100644
  
  SET SESSION ROLE regress_schemauser1;
  CREATE SCHEMA testns;
-@@ -1748,7 +1748,7 @@ DROP USER regress_priv_user8; -- does not exist
+@@ -1751,7 +1751,7 @@ DROP USER regress_priv_user8; -- does not exist
  
  
  -- permissions with LOCK TABLE
@@ -3337,7 +3308,7 @@ index 249df17a58..b258e7f26a 100644
  CREATE TABLE lock_table (a int);
  
  -- LOCK TABLE and SELECT permission
-@@ -1836,7 +1836,7 @@ DROP USER regress_locktable_user;
+@@ -1839,7 +1839,7 @@ DROP USER regress_locktable_user;
  -- switch to superuser
  \c -
  
@@ -3346,7 +3317,7 @@ index 249df17a58..b258e7f26a 100644
  
  SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
  SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
-@@ -1856,10 +1856,10 @@ RESET ROLE;
+@@ -1859,10 +1859,10 @@ RESET ROLE;
  DROP ROLE regress_readallstats;
  
  -- test role grantor machinery
@@ -3361,7 +3332,7 @@ index 249df17a58..b258e7f26a 100644
  
  GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
  GRANT regress_group_direct_manager TO regress_group_indirect_manager;
-@@ -1881,9 +1881,9 @@ DROP ROLE regress_group_indirect_manager;
+@@ -1884,9 +1884,9 @@ DROP ROLE regress_group_indirect_manager;
  DROP ROLE regress_group_member;
  
  -- test SET and INHERIT options with object ownership changes
diff --git a/compute/patches/cloud_regress_pg17.patch b/compute/patches/cloud_regress_pg17.patch
index e57447a2c6..4f10f8563a 100644
--- a/compute/patches/cloud_regress_pg17.patch
+++ b/compute/patches/cloud_regress_pg17.patch
@@ -202,10 +202,10 @@ index cf0b80d616..e8e2a14a4a 100644
  COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment';
  ERROR:  must be owner of relation constraint_comments_tbl
 diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
-index 442e7aff2b..525f732b03 100644
+index d785f92561..16377e5ac9 100644
 --- a/src/test/regress/expected/conversion.out
 +++ b/src/test/regress/expected/conversion.out
-@@ -8,7 +8,7 @@
+@@ -15,7 +15,7 @@ SELECT FROM test_enc_setup();
  CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
      AS :'regresslib', 'test_enc_conversion'
      LANGUAGE C STRICT;
@@ -587,16 +587,15 @@ index f551624afb..57f1e432d4 100644
  SELECT *
     INTO TABLE ramp
 diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out
-index 454db91ec0..01378d7081 100644
+index 4cbdbdf84d..573362850e 100644
 --- a/src/test/regress/expected/database.out
 +++ b/src/test/regress/expected/database.out
-@@ -1,8 +1,7 @@
+@@ -1,8 +1,6 @@
  CREATE DATABASE regression_tbd
  	ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0;
  ALTER DATABASE regression_tbd RENAME TO regression_utf8;
 -ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace;
 -ALTER DATABASE regression_utf8 RESET TABLESPACE;
-+WARNING:  you need to manually restart any running background workers after this command
  ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123;
  -- Test PgDatabaseToastTable.  Doing this with GRANT would be slow.
  BEGIN;
@@ -700,7 +699,7 @@ index 6ed50fdcfa..caa00a345d 100644
  COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
  CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator;
 diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out
-index 69994c98e3..129abcfbe8 100644
+index fe6a1015f2..614b387b7d 100644
 --- a/src/test/regress/expected/foreign_key.out
 +++ b/src/test/regress/expected/foreign_key.out
 @@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
@@ -1147,7 +1146,7 @@ index 924d6e001d..7fdda73439 100644
  DROP ROLE regress_passwd_sha_len1;
  DROP ROLE regress_passwd_sha_len2;
 diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
-index 1296da0d57..f43fffa44c 100644
+index e8c668e0a1..03be5c2120 100644
 --- a/src/test/regress/expected/privileges.out
 +++ b/src/test/regress/expected/privileges.out
 @@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3
@@ -1209,8 +1208,8 @@ index 1296da0d57..f43fffa44c 100644
 +CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2;
  ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
  GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1;
- SET SESSION AUTHORIZATION regress_priv_user1;
-@@ -239,12 +239,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
+ SET SESSION AUTHORIZATION regress_priv_user3;
+@@ -246,12 +246,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
  ERROR:  permission denied to grant privileges as role "regress_priv_role"
  DETAIL:  The grantor must have the ADMIN option on role "regress_priv_role".
  GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE;
@@ -1227,7 +1226,7 @@ index 1296da0d57..f43fffa44c 100644
  DROP ROLE regress_priv_role;
  SET SESSION AUTHORIZATION regress_priv_user1;
  SELECT session_user, current_user;
-@@ -1776,7 +1780,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+@@ -1783,7 +1787,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
  
  -- security-restricted operations
  \c -
@@ -1236,7 +1235,7 @@ index 1296da0d57..f43fffa44c 100644
  -- Check that index expressions and predicates are run as the table's owner
  -- A dummy index function checking current_user
  CREATE FUNCTION sro_ifun(int) RETURNS int AS $$
-@@ -2668,8 +2672,8 @@ drop cascades to function testns.priv_testagg(integer)
+@@ -2675,8 +2679,8 @@ drop cascades to function testns.priv_testagg(integer)
  drop cascades to function testns.priv_testproc(integer)
  -- Change owner of the schema & and rename of new schema owner
  \c -
@@ -1247,7 +1246,7 @@ index 1296da0d57..f43fffa44c 100644
  SET SESSION ROLE regress_schemauser1;
  CREATE SCHEMA testns;
  SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid;
-@@ -2792,7 +2796,7 @@ DROP USER regress_priv_user7;
+@@ -2799,7 +2803,7 @@ DROP USER regress_priv_user7;
  DROP USER regress_priv_user8; -- does not exist
  ERROR:  role "regress_priv_user8" does not exist
  -- permissions with LOCK TABLE
@@ -1256,7 +1255,7 @@ index 1296da0d57..f43fffa44c 100644
  CREATE TABLE lock_table (a int);
  -- LOCK TABLE and SELECT permission
  GRANT SELECT ON lock_table TO regress_locktable_user;
-@@ -2888,7 +2892,7 @@ DROP USER regress_locktable_user;
+@@ -2895,7 +2899,7 @@ DROP USER regress_locktable_user;
  -- pg_backend_memory_contexts.
  -- switch to superuser
  \c -
@@ -1265,7 +1264,7 @@ index 1296da0d57..f43fffa44c 100644
  SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
   has_table_privilege 
  ---------------------
-@@ -2932,10 +2936,10 @@ RESET ROLE;
+@@ -2939,10 +2943,10 @@ RESET ROLE;
  -- clean up
  DROP ROLE regress_readallstats;
  -- test role grantor machinery
@@ -1280,7 +1279,7 @@ index 1296da0d57..f43fffa44c 100644
  GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
  GRANT regress_group_direct_manager TO regress_group_indirect_manager;
  SET SESSION AUTHORIZATION regress_group_direct_manager;
-@@ -2964,9 +2968,9 @@ DROP ROLE regress_group_direct_manager;
+@@ -2971,9 +2975,9 @@ DROP ROLE regress_group_direct_manager;
  DROP ROLE regress_group_indirect_manager;
  DROP ROLE regress_group_member;
  -- test SET and INHERIT options with object ownership changes
@@ -1293,7 +1292,7 @@ index 1296da0d57..f43fffa44c 100644
  CREATE SCHEMA regress_roleoption;
  GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC;
  GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE;
-@@ -2995,9 +2999,9 @@ DROP ROLE regress_roleoption_protagonist;
+@@ -3002,9 +3006,9 @@ DROP ROLE regress_roleoption_protagonist;
  DROP ROLE regress_roleoption_donor;
  DROP ROLE regress_roleoption_recipient;
  -- MAINTAIN
@@ -2433,10 +2432,10 @@ index e3e3bea709..fa86ddc326 100644
  COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment';
  COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS 'no, another comment';
 diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
-index 9a65fca91f..58431a3056 100644
+index b567a1a572..4d1ac2e631 100644
 --- a/src/test/regress/sql/conversion.sql
 +++ b/src/test/regress/sql/conversion.sql
-@@ -12,7 +12,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r
+@@ -17,7 +17,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r
      AS :'regresslib', 'test_enc_conversion'
      LANGUAGE C STRICT;
  
@@ -2800,7 +2799,7 @@ index ae6841308b..47bc792e30 100644
  
  SELECT *
 diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql
-index 0367c0e37a..a23b98c4bd 100644
+index 46ad263478..eb05584ed5 100644
 --- a/src/test/regress/sql/database.sql
 +++ b/src/test/regress/sql/database.sql
 @@ -1,8 +1,6 @@
@@ -2913,7 +2912,7 @@ index aa147b14a9..370e0dd570 100644
  CREATE FOREIGN DATA WRAPPER dummy;
  COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
 diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql
-index 2e710e419c..89cd481a54 100644
+index 8c4e4c7c83..e946cd2119 100644
 --- a/src/test/regress/sql/foreign_key.sql
 +++ b/src/test/regress/sql/foreign_key.sql
 @@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
@@ -3301,7 +3300,7 @@ index bb82aa4aa2..dd8a05e24d 100644
  -- Check that the invalid secrets were re-hashed. A re-hashed secret
  -- should not contain the original salt.
 diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
-index 5880bc018d..27aa952b18 100644
+index b7e1cb6cdd..6e5a2217f1 100644
 --- a/src/test/regress/sql/privileges.sql
 +++ b/src/test/regress/sql/privileges.sql
 @@ -24,18 +24,18 @@ RESET client_min_messages;
@@ -3363,7 +3362,7 @@ index 5880bc018d..27aa952b18 100644
  
  ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
  
-@@ -1157,7 +1157,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+@@ -1160,7 +1160,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
  
  -- security-restricted operations
  \c -
@@ -3372,7 +3371,7 @@ index 5880bc018d..27aa952b18 100644
  
  -- Check that index expressions and predicates are run as the table's owner
  
-@@ -1653,8 +1653,8 @@ DROP SCHEMA testns CASCADE;
+@@ -1656,8 +1656,8 @@ DROP SCHEMA testns CASCADE;
  -- Change owner of the schema & and rename of new schema owner
  \c -
  
@@ -3383,7 +3382,7 @@ index 5880bc018d..27aa952b18 100644
  
  SET SESSION ROLE regress_schemauser1;
  CREATE SCHEMA testns;
-@@ -1748,7 +1748,7 @@ DROP USER regress_priv_user8; -- does not exist
+@@ -1751,7 +1751,7 @@ DROP USER regress_priv_user8; -- does not exist
  
  
  -- permissions with LOCK TABLE
@@ -3392,7 +3391,7 @@ index 5880bc018d..27aa952b18 100644
  CREATE TABLE lock_table (a int);
  
  -- LOCK TABLE and SELECT permission
-@@ -1851,7 +1851,7 @@ DROP USER regress_locktable_user;
+@@ -1854,7 +1854,7 @@ DROP USER regress_locktable_user;
  -- switch to superuser
  \c -
  
@@ -3401,7 +3400,7 @@ index 5880bc018d..27aa952b18 100644
  
  SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
  SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
-@@ -1871,10 +1871,10 @@ RESET ROLE;
+@@ -1874,10 +1874,10 @@ RESET ROLE;
  DROP ROLE regress_readallstats;
  
  -- test role grantor machinery
@@ -3416,7 +3415,7 @@ index 5880bc018d..27aa952b18 100644
  
  GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
  GRANT regress_group_direct_manager TO regress_group_indirect_manager;
-@@ -1896,9 +1896,9 @@ DROP ROLE regress_group_indirect_manager;
+@@ -1899,9 +1899,9 @@ DROP ROLE regress_group_indirect_manager;
  DROP ROLE regress_group_member;
  
  -- test SET and INHERIT options with object ownership changes
@@ -3429,7 +3428,7 @@ index 5880bc018d..27aa952b18 100644
  CREATE SCHEMA regress_roleoption;
  GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC;
  GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE;
-@@ -1926,9 +1926,9 @@ DROP ROLE regress_roleoption_donor;
+@@ -1929,9 +1929,9 @@ DROP ROLE regress_roleoption_donor;
  DROP ROLE regress_roleoption_recipient;
  
  -- MAINTAIN

From b2a670c765f36d50495f3569c1f7511950f553d3 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 7 Apr 2025 14:04:36 +0300
Subject: [PATCH 060/140] refactor: Use same prototype for neon_read_at_lsn on
 all PG versions (#11457)

The 'neon_read' function needs to have a different prototype on PG < 16,
because it's part of the smgr interface. But neon_read_at_lsn doesn't
have that restriction.
---
 pgxn/neon/pagestore_client.h    |  5 -----
 pgxn/neon/pagestore_smgr.c      | 10 ----------
 pgxn/neon_test_utils/neontest.c |  5 -----
 3 files changed, 20 deletions(-)

diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index d90dad6cc5..a2e3d57e47 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -274,13 +274,8 @@ typedef struct
 	XLogRecPtr effective_request_lsn;
 } neon_request_lsns;
 
-#if PG_MAJORVERSION_NUM < 16
-extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-										 neon_request_lsns request_lsns, char *buffer);
-#else
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
 										 neon_request_lsns request_lsns, void *buffer);
-#endif
 extern int64 neon_dbsize(Oid dbNode);
 
 /* utils for neon relsize cache */
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 0ed00061cb..a295304a58 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -3154,13 +3154,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
  * The offsets in request_lsns, buffers, and mask are linked.
  */
 static void
-#if PG_MAJORVERSION_NUM < 16
-neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns,
-				  char **buffers, BlockNumber nblocks, const bits8 *mask)
-#else
 neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns,
 				  void **buffers, BlockNumber nblocks, const bits8 *mask)
-#endif
 {
 	NeonResponse *resp;
 	uint64		ring_index;
@@ -3356,13 +3351,8 @@ Retry:
  * To avoid breaking tests in the runtime please keep function signature in sync.
  */
 void
-#if PG_MAJORVERSION_NUM < 16
-neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 neon_request_lsns request_lsns, char *buffer)
-#else
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 neon_request_lsns request_lsns, void *buffer)
-#endif
 {
 	neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
 }
diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c
index 0b5499ca53..d37412f674 100644
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -50,13 +50,8 @@ PG_FUNCTION_INFO_V1(trigger_segfault);
  * Linkage to functions in neon module.
  * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c
  */
-#if PG_MAJORVERSION_NUM < 16
-typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-									   neon_request_lsns request_lsns, char *buffer);
-#else
 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 									   neon_request_lsns request_lsns, void *buffer);
-#endif
 
 static neon_read_at_lsn_type neon_read_at_lsn_ptr;
 

From aa88279681df1afb755d8f926f4384846a2f8e91 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 7 Apr 2025 14:23:40 +0200
Subject: [PATCH 061/140] fix(storcon/http): node status API returns serialized
 runtime object (#11461)

The Serialize impl on the `Node` type is for the `/debug` endpoint only.
Committed APIs should use the `NodeDescribeResponse`.

Refs
- fixes https://github.com/neondatabase/neon/issues/11326
- found while working on admin UI change
https://github.com/neondatabase/cloud/pull/26207
---
 storage_controller/src/http.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 0caf6e3766..0d1dc8f8ee 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -899,7 +899,7 @@ async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiErr
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
-    let node_status = state.service.get_node(node_id).await?;
+    let node_status = state.service.get_node(node_id).await?.describe();
 
     json_response(StatusCode::OK, node_status)
 }

From 85a515c1762eec7f7780751912304b96e4626782 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 7 Apr 2025 14:33:56 +0100
Subject: [PATCH 062/140] update tokio for RUSTSEC-2025-0023 (#11464)

---
 Cargo.lock | 4 ++--
 Cargo.toml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 03a376cdae..dbbf2c3357 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7116,9 +7116,9 @@ dependencies = [
 
 [[package]]
 name = "tokio"
-version = "1.43.0"
+version = "1.43.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e"
+checksum = "492a604e2fd7f814268a378409e6c92b5525d747d10db9a229723f55a417958c"
 dependencies = [
  "backtrace",
  "bytes",
diff --git a/Cargo.toml b/Cargo.toml
index 3fb9229da8..1f605681db 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -183,7 +183,7 @@ test-context = "0.3"
 thiserror = "1.0"
 tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
-tokio = { version = "1.41", features = ["macros"] }
+tokio = { version = "1.43.1", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"

From 8eb701d706e80ce127d5baec93836ce1e84d87aa Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Mon, 7 Apr 2025 16:56:55 +0300
Subject: [PATCH 063/140] Save FSM/VM pages on normal shutdown (#11449)

## Problem

See https://neondb.slack.com/archives/C03QLRH7PPD/p1743746717119179

We wallow FSM/VM pages when they are written to disk to persist them in
PS.
But it is not happen during shutdown checkpoint, because writing to WAL
during checkpoint cause Postgres panic.

## Summary of changes

Move `CheckPointBuffers` call to `PreCheckPointGuts`

Postgres PRs:
https://github.com/neondatabase/postgres/pull/615
https://github.com/neondatabase/postgres/pull/614
https://github.com/neondatabase/postgres/pull/613
https://github.com/neondatabase/postgres/pull/612

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/pagestore_smgr.c | 1 -
 vendor/postgres-v14        | 2 +-
 vendor/postgres-v15        | 2 +-
 vendor/postgres-v16        | 2 +-
 vendor/postgres-v17        | 2 +-
 vendor/revisions.json      | 8 ++++----
 6 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index a295304a58..eb8df11923 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1900,7 +1900,6 @@ neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		log_pages = true;
 	}
 	else if (XLogInsertAllowed() &&
-			 !ShutdownRequestPending &&
 			 (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
 	{
 		log_pages = true;
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 8cca70c22e..a0391901a2 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 8cca70c22e2894dd4645f9a940086ac437b0a11b
+Subproject commit a0391901a2af13aa029b905272a5b2024133c926
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 23708b3aca..aeb292eeac 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 23708b3aca9adf163aa0973eb63d9afc0e4a04c3
+Subproject commit aeb292eeace9072e07071254b6ffc7a74007d4d2
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 746bd9ffe5..d56e79cd5d 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 746bd9ffe5c29bce030eaea1031054057f3c5d45
+Subproject commit d56e79cd5d6136c159b1d8d98acb7981d4b69364
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index c9e4ff5a38..66114c23bc 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit c9e4ff5a38907acd71107634055bf2609aba43a5
+Subproject commit 66114c23bc61205b0e3fb1e77ee76a4abc1eb4b8
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 2abfbffccb..d7eddf42b7 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.4",
-    "c9e4ff5a38907acd71107634055bf2609aba43a5"
+    "66114c23bc61205b0e3fb1e77ee76a4abc1eb4b8"
   ],
   "v16": [
     "16.8",
-    "746bd9ffe5c29bce030eaea1031054057f3c5d45"
+    "d56e79cd5d6136c159b1d8d98acb7981d4b69364"
   ],
   "v15": [
     "15.12",
-    "23708b3aca9adf163aa0973eb63d9afc0e4a04c3"
+    "aeb292eeace9072e07071254b6ffc7a74007d4d2"
   ],
   "v14": [
     "14.17",
-    "8cca70c22e2894dd4645f9a940086ac437b0a11b"
+    "a0391901a2af13aa029b905272a5b2024133c926"
   ]
 }

From d37e90f43043032a0705472f1eda88c4ebb01147 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Mon, 7 Apr 2025 12:01:21 -0400
Subject: [PATCH 064/140] fix(pageserver): allow shard ancestor compaction to
 be cancelled (#11452)

## Problem

https://github.com/neondatabase/neon/issues/11330
https://github.com/neondatabase/neon/issues/11358

## Summary of changes

Looking at the staging log, a few tenants right after shard split are
stuck on shutdown because they are running shard ancestor compaction.
The compaction does not respect the cancellation token.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/timeline/compaction.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 5aaef8db0c..73f6691f14 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1244,6 +1244,10 @@ impl Timeline {
         let mut replace_image_layers = Vec::new();
 
         for layer in layers_to_rewrite {
+            if self.cancel.is_cancelled() {
+                return Err(CompactionError::ShuttingDown);
+            }
+
             tracing::info!(layer=%layer, "Rewriting layer after shard split...");
             let mut image_layer_writer = ImageLayerWriter::new(
                 self.conf,

From 486872dd28d538817599f29b045be025d1e3f43a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 7 Apr 2025 18:12:04 +0200
Subject: [PATCH 065/140] Add support to specify auth token via
 --auth-token-path (#11443)

Before we specified the JWT via `SAFEKEEPER_AUTH_TOKEN`, but env vars
are quite public, both in procfs as well as the unit files. So add a way
to put the auth token into a file directly.

context: https://neondb.slack.com/archives/C033RQ5SPDH/p1743692566311099
---
 safekeeper/src/bin/safekeeper.rs | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 18aa710916..b8c122ea72 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -223,6 +223,9 @@ struct Args {
     /// Flag to use https for requests to peer's safekeeper API.
     #[arg(long)]
     pub use_https_safekeeper_api: bool,
+    /// Path to the JWT auth token used to authenticate with other safekeepers.
+    #[arg(long)]
+    auth_token_path: Option<Utf8PathBuf>,
 }
 
 // Like PathBufValueParser, but allows empty string.
@@ -341,14 +344,24 @@ async fn main() -> anyhow::Result<()> {
     };
 
     // Load JWT auth token to connect to other safekeepers for pull_timeline.
+    // First check if the env var is present, then check the arg with the path.
+    // We want to deprecate and remove the env var method in the future.
     let sk_auth_token = match var("SAFEKEEPER_AUTH_TOKEN") {
         Ok(v) => {
             info!("loaded JWT token for authentication with safekeepers");
             Some(SecretString::from(v))
         }
         Err(VarError::NotPresent) => {
-            info!("no JWT token for authentication with safekeepers detected");
-            None
+            if let Some(auth_token_path) = args.auth_token_path.as_ref() {
+                info!(
+                    "loading JWT token for authentication with safekeepers from {auth_token_path}"
+                );
+                let auth_token = tokio::fs::read_to_string(auth_token_path).await?;
+                Some(SecretString::from(auth_token.trim().to_owned()))
+            } else {
+                info!("no JWT token for authentication with safekeepers detected");
+                None
+            }
         }
         Err(_) => {
             warn!("JWT token for authentication with safekeepers is not unicode");

From 8a2b19f4679ddd0e42c5a9b5264d240047e3fb61 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 7 Apr 2025 18:52:54 +0200
Subject: [PATCH 066/140] Allow potential warning in
 test_storcon_create_delete_sk_down (#11466)

Since merging #11400 and addition of
`test_storcon_create_delete_sk_down`, we've seen an error occur multiple
times.

https://github.com/neondatabase/neon/pull/11400#issuecomment-2782528369
---
 test_runner/regress/test_storage_controller.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 097c187699..702f4eeccf 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -4109,6 +4109,7 @@ def test_storcon_create_delete_sk_down(neon_env_builder: NeonEnvBuilder, restart
     env.storage_controller.allowed_errors.extend(
         [
             ".*Call to safekeeper.* management API still failed after.*",
+            ".*Call to safekeeper.* management API failed, will retry.*",
             ".*reconcile_one.*tenant_id={tenant_id}.*Call to safekeeper.* management API still failed after.*",
         ]
     )

From 26c5c7e9422e01085a7f8670183ffc853b8fa8e9 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Mon, 7 Apr 2025 19:56:56 +0200
Subject: [PATCH 067/140] pageserver: set `Stopping` state on attach
 cancellation (#11462)

## Problem

If a tenant is cancelled (e.g. due to Pageserver shutdown) during
attach, it is set to `Broken`. This results both in error log spam and
500 responses for clients -- shutdown is supposed to return 503
responses which can be retried.

This becomes more likely to happen with #11328, where we perform tenant
manifest downloads/uploads during attach.

## Summary of changes

Set tenant state to `Stopping` when attach fails and the tenant is
cancelled, downgrading the log messages to INFO. This introduces two
variants of `Stopping` -- with and without a caller barrier -- where the
latter is used to signal attach cancellation.
---
 libs/pageserver_api/src/models.rs |  21 +++++-
 pageserver/src/tenant.rs          | 108 +++++++++++++-----------------
 2 files changed, 67 insertions(+), 62 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index bdee46f1b1..2ffff67688 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -80,10 +80,22 @@ pub enum TenantState {
     ///
     /// Transitions out of this state are possible through `set_broken()`.
     Stopping {
+        /// The barrier can be used to wait for shutdown to complete. The first caller to set
+        /// Some(Barrier) is responsible for driving shutdown to completion. Subsequent callers
+        /// will wait for the first caller's existing barrier.
+        ///
+        /// None is set when an attach is cancelled, to signal to shutdown that the attach has in
+        /// fact cancelled:
+        ///
+        /// 1. `shutdown` sees `TenantState::Attaching`, and cancels the tenant.
+        /// 2. `attach` sets `TenantState::Stopping(None)` and exits.
+        /// 3. `set_stopping` waits for `TenantState::Stopping(None)` and sets
+        ///    `TenantState::Stopping(Some)` to claim the barrier as the shutdown owner.
+        //
         // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
         // otherwise it will not be skipped during deserialization
         #[serde(skip)]
-        progress: completion::Barrier,
+        progress: Option<completion::Barrier>,
     },
     /// The tenant is recognized by the pageserver, but can no longer be used for
     /// any operations.
@@ -2719,10 +2731,15 @@ mod tests {
                 "Activating",
             ),
             (line!(), TenantState::Active, "Active"),
+            (
+                line!(),
+                TenantState::Stopping { progress: None },
+                "Stopping",
+            ),
             (
                 line!(),
                 TenantState::Stopping {
-                    progress: utils::completion::Barrier::default(),
+                    progress: Some(completion::Barrier::default()),
                 },
                 "Stopping",
             ),
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 441597d77f..def15e35c0 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1354,36 +1354,41 @@ impl Tenant {
                     }
                 }
 
-                // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
-                enum BrokenVerbosity {
-                    Error,
-                    Info
-                }
-                let make_broken =
-                    |t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
-                        match verbosity {
-                            BrokenVerbosity::Info => {
-                                info!("attach cancelled, setting tenant state to Broken: {err}");
-                            },
-                            BrokenVerbosity::Error => {
-                                error!("attach failed, setting tenant state to Broken: {err:?}");
-                            }
+                fn make_broken_or_stopping(t: &Tenant, err: anyhow::Error) {
+                    t.state.send_modify(|state| match state {
+                        // TODO: the old code alluded to DeleteTenantFlow sometimes setting
+                        // TenantState::Stopping before we get here, but this may be outdated.
+                        // Let's find out with a testing assertion. If this doesn't fire, and the
+                        // logs don't show this happening in production, remove the Stopping cases.
+                        TenantState::Stopping{..} if cfg!(any(test, feature = "testing")) => {
+                            panic!("unexpected TenantState::Stopping during attach")
                         }
-                        t.state.send_modify(|state| {
-                            // The Stopping case is for when we have passed control on to DeleteTenantFlow:
-                            // if it errors, we will call make_broken when tenant is already in Stopping.
-                            assert!(
-                                matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
-                                "the attach task owns the tenant state until activation is complete"
-                            );
-
-                            *state = TenantState::broken_from_reason(err.to_string());
-                        });
-                    };
+                        // If the tenant is cancelled, assume the error was caused by cancellation.
+                        TenantState::Attaching if t.cancel.is_cancelled() => {
+                            info!("attach cancelled, setting tenant state to Stopping: {err}");
+                            // NB: progress None tells `set_stopping` that attach has cancelled.
+                            *state = TenantState::Stopping { progress: None };
+                        }
+                        // According to the old code, DeleteTenantFlow may already have set this to
+                        // Stopping. Retain its progress.
+                        // TODO: there is no DeleteTenantFlow. Is this still needed? See above.
+                        TenantState::Stopping { progress } if t.cancel.is_cancelled() => {
+                            assert!(progress.is_some(), "concurrent attach cancellation");
+                            info!("attach cancelled, already Stopping: {err}");
+                        }
+                        // Mark the tenant as broken.
+                        TenantState::Attaching | TenantState::Stopping { .. } => {
+                            error!("attach failed, setting tenant state to Broken (was {state}): {err:?}");
+                            *state = TenantState::broken_from_reason(err.to_string())
+                        }
+                        // The attach task owns the tenant state until activated.
+                        state => panic!("invalid tenant state {state} during attach: {err:?}"),
+                    });
+                }
 
                 // TODO: should also be rejecting tenant conf changes that violate this check.
                 if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
-                    make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
+                    make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e));
                     return Ok(());
                 }
 
@@ -1435,10 +1440,8 @@ impl Tenant {
                             // stayed in Activating for such a long time that shutdown found it in
                             // that state.
                             tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
-                            // Make the tenant broken so that set_stopping will not hang waiting for it to leave
-                            // the Attaching state.  This is an over-reaction (nothing really broke, the tenant is
-                            // just shutting down), but ensures progress.
-                            make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
+                            // Set the tenant to Stopping to signal `set_stopping` that we're done.
+                            make_broken_or_stopping(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
                             return Ok(());
                         },
                     )
@@ -1457,7 +1460,7 @@ impl Tenant {
                         match res {
                             Ok(p) => Some(p),
                             Err(e) => {
-                                make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
+                                make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e));
                                 return Ok(());
                             }
                         }
@@ -1483,9 +1486,7 @@ impl Tenant {
                         info!("attach finished, activating");
                         tenant_clone.activate(broker_client, None, &ctx);
                     }
-                    Err(e) => {
-                        make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
-                    }
+                    Err(e) => make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e)),
                 }
 
                 // If we are doing an opportunistic warmup attachment at startup, initialize
@@ -3429,7 +3430,7 @@ impl Tenant {
             shutdown_mode
         };
 
-        match self.set_stopping(shutdown_progress, false, false).await {
+        match self.set_stopping(shutdown_progress).await {
             Ok(()) => {}
             Err(SetStoppingError::Broken) => {
                 // assume that this is acceptable
@@ -3509,25 +3510,13 @@ impl Tenant {
     /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
     ///
     /// This function is not cancel-safe!
-    ///
-    /// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant.
-    /// `allow_transition_from_attaching` is needed for the special case of attaching deleted tenant.
-    async fn set_stopping(
-        &self,
-        progress: completion::Barrier,
-        _allow_transition_from_loading: bool,
-        allow_transition_from_attaching: bool,
-    ) -> Result<(), SetStoppingError> {
+    async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
         let mut rx = self.state.subscribe();
 
         // cannot stop before we're done activating, so wait out until we're done activating
         rx.wait_for(|state| match state {
-            TenantState::Attaching if allow_transition_from_attaching => true,
             TenantState::Activating(_) | TenantState::Attaching => {
-                info!(
-                    "waiting for {} to turn Active|Broken|Stopping",
-                    <&'static str>::from(state)
-                );
+                info!("waiting for {state} to turn Active|Broken|Stopping");
                 false
             }
             TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
@@ -3538,25 +3527,24 @@ impl Tenant {
         // we now know we're done activating, let's see whether this task is the winner to transition into Stopping
         let mut err = None;
         let stopping = self.state.send_if_modified(|current_state| match current_state {
-            TenantState::Activating(_) => {
-                unreachable!("1we ensured above that we're done with activation, and, there is no re-activation")
-            }
-            TenantState::Attaching => {
-                if !allow_transition_from_attaching {
-                    unreachable!("2we ensured above that we're done with activation, and, there is no re-activation")
-                };
-                *current_state = TenantState::Stopping { progress };
-                true
+            TenantState::Activating(_) | TenantState::Attaching => {
+                unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
             }
             TenantState::Active => {
                 // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
                 // are created after the transition to Stopping. That's harmless, as the Timelines
                 // won't be accessible to anyone afterwards, because the Tenant is in Stopping state.
-                *current_state = TenantState::Stopping { progress };
+                *current_state = TenantState::Stopping { progress: Some(progress) };
                 // Continue stopping outside the closure. We need to grab timelines.lock()
                 // and we plan to turn it into a tokio::sync::Mutex in a future patch.
                 true
             }
+            TenantState::Stopping { progress: None } => {
+                // An attach was cancelled, and the attach transitioned the tenant from Attaching to
+                // Stopping(None) to let us know it exited. Register our progress and continue.
+                *current_state = TenantState::Stopping { progress: Some(progress) };
+                true
+            }
             TenantState::Broken { reason, .. } => {
                 info!(
                     "Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"
@@ -3564,7 +3552,7 @@ impl Tenant {
                 err = Some(SetStoppingError::Broken);
                 false
             }
-            TenantState::Stopping { progress } => {
+            TenantState::Stopping { progress: Some(progress) } => {
                 info!("Tenant is already in Stopping state");
                 err = Some(SetStoppingError::AlreadyStopping(progress.clone()));
                 false

From 99d8788756b55a9d2b2371a46e5cdd17c9266827 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Mon, 7 Apr 2025 21:10:36 +0200
Subject: [PATCH 068/140] pageserver: improve tenant manifest lifecycle
 (#11328)

## Problem

Currently, the tenant manifest is only uploaded if there are offloaded
timelines. The checks are also a bit loose (e.g. only checks number of
offloaded timelines). We want to start using the manifest for other
things too (e.g. stripe size).

Resolves #11271.

## Summary of changes

This patch ensures that a tenant manifest always exists. The lifecycle
is:

* During preload, fetch the existing manifest, if any.
* During attach, upload a tenant manifest if it differs from the
preloaded one (or does not exist).
* Upload a new manifest as needed, if it differs from the last-known
manifest (ignoring version number).
* On splits, pre-populate the manifest from the parent.
* During Pageserver physical GC, remove old manifests but keep the
latest 2 generations.

This will cause nearly all existing tenants to upload a new tenant
manifest on their first attach after this change. Attaches are
concurrency-limited in the storage controller, so we expect this will be
fine.

Also updates `make_broken` to automatically log at `INFO` level when the
tenant has been cancelled, to avoid spurious error logs during shutdown.
---
 pageserver/src/tenant.rs                      | 159 ++++++++++--------
 .../tenant/remote_timeline_client/manifest.rs | 125 ++++++++++++--
 .../tenant/remote_timeline_client/upload.rs   |  11 +-
 pageserver/src/tenant/timeline/delete.rs      |  11 +-
 pageserver/src/tenant/timeline/offload.rs     |   2 +-
 test_runner/regress/test_timeline_archive.py  |   6 +-
 6 files changed, 219 insertions(+), 95 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index def15e35c0..0c399d4c91 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -45,6 +45,7 @@ use remote_timeline_client::manifest::{
 };
 use remote_timeline_client::{
     FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD, UploadQueueNotReadyError,
+    download_tenant_manifest,
 };
 use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
 use storage_broker::BrokerClientChannel;
@@ -226,7 +227,8 @@ struct TimelinePreload {
 }
 
 pub(crate) struct TenantPreload {
-    tenant_manifest: TenantManifest,
+    /// The tenant manifest from remote storage, or None if no manifest was found.
+    tenant_manifest: Option<TenantManifest>,
     /// Map from timeline ID to a possible timeline preload. It is None iff the timeline is offloaded according to the manifest.
     timelines: HashMap<TimelineId, Option<TimelinePreload>>,
 }
@@ -282,12 +284,15 @@ pub struct Tenant {
     /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
     timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
 
-    /// Serialize writes of the tenant manifest to remote storage.  If there are concurrent operations
-    /// affecting the manifest, such as timeline deletion and timeline offload, they must wait for
-    /// each other (this could be optimized to coalesce writes if necessary).
+    /// The last tenant manifest known to be in remote storage. None if the manifest has not yet
+    /// been either downloaded or uploaded. Always Some after tenant attach.
     ///
-    /// The contents of the Mutex are the last manifest we successfully uploaded
-    tenant_manifest_upload: tokio::sync::Mutex<Option<TenantManifest>>,
+    /// Initially populated during tenant attach, updated via `maybe_upload_tenant_manifest`.
+    ///
+    /// Do not modify this directly. It is used to check whether a new manifest needs to be
+    /// uploaded. The manifest is constructed in `build_tenant_manifest`, and uploaded via
+    /// `maybe_upload_tenant_manifest`.
+    remote_tenant_manifest: tokio::sync::Mutex<Option<TenantManifest>>,
 
     // This mutex prevents creation of new timelines during GC.
     // Adding yet another mutex (in addition to `timelines`) is needed because holding
@@ -1526,28 +1531,27 @@ impl Tenant {
             cancel.clone(),
         )
         .await?;
-        let (offloaded_add, tenant_manifest) =
-            match remote_timeline_client::download_tenant_manifest(
-                remote_storage,
-                &self.tenant_shard_id,
-                self.generation,
-                &cancel,
-            )
-            .await
-            {
-                Ok((tenant_manifest, _generation, _manifest_mtime)) => (
-                    format!("{} offloaded", tenant_manifest.offloaded_timelines.len()),
-                    tenant_manifest,
-                ),
-                Err(DownloadError::NotFound) => {
-                    ("no manifest".to_string(), TenantManifest::empty())
-                }
-                Err(e) => Err(e)?,
-            };
+
+        let tenant_manifest = match download_tenant_manifest(
+            remote_storage,
+            &self.tenant_shard_id,
+            self.generation,
+            &cancel,
+        )
+        .await
+        {
+            Ok((tenant_manifest, _, _)) => Some(tenant_manifest),
+            Err(DownloadError::NotFound) => None,
+            Err(err) => return Err(err.into()),
+        };
 
         info!(
-            "found {} timelines, and {offloaded_add}",
-            remote_timeline_ids.len()
+            "found {} timelines ({} offloaded timelines)",
+            remote_timeline_ids.len(),
+            tenant_manifest
+                .as_ref()
+                .map(|m| m.offloaded_timelines.len())
+                .unwrap_or(0)
         );
 
         for k in other_keys {
@@ -1556,11 +1560,13 @@ impl Tenant {
 
         // Avoid downloading IndexPart of offloaded timelines.
         let mut offloaded_with_prefix = HashSet::new();
-        for offloaded in tenant_manifest.offloaded_timelines.iter() {
-            if remote_timeline_ids.remove(&offloaded.timeline_id) {
-                offloaded_with_prefix.insert(offloaded.timeline_id);
-            } else {
-                // We'll take care later of timelines in the manifest without a prefix
+        if let Some(tenant_manifest) = &tenant_manifest {
+            for offloaded in tenant_manifest.offloaded_timelines.iter() {
+                if remote_timeline_ids.remove(&offloaded.timeline_id) {
+                    offloaded_with_prefix.insert(offloaded.timeline_id);
+                } else {
+                    // We'll take care later of timelines in the manifest without a prefix
+                }
             }
         }
 
@@ -1634,12 +1640,14 @@ impl Tenant {
 
         let mut offloaded_timeline_ids = HashSet::new();
         let mut offloaded_timelines_list = Vec::new();
-        for timeline_manifest in preload.tenant_manifest.offloaded_timelines.iter() {
-            let timeline_id = timeline_manifest.timeline_id;
-            let offloaded_timeline =
-                OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest);
-            offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline)));
-            offloaded_timeline_ids.insert(timeline_id);
+        if let Some(tenant_manifest) = &preload.tenant_manifest {
+            for timeline_manifest in tenant_manifest.offloaded_timelines.iter() {
+                let timeline_id = timeline_manifest.timeline_id;
+                let offloaded_timeline =
+                    OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest);
+                offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline)));
+                offloaded_timeline_ids.insert(timeline_id);
+            }
         }
         // Complete deletions for offloaded timeline id's from manifest.
         // The manifest will be uploaded later in this function.
@@ -1797,15 +1805,21 @@ impl Tenant {
             .context("resume_deletion")
             .map_err(LoadLocalTimelineError::ResumeDeletion)?;
         }
-        let needs_manifest_upload =
-            offloaded_timelines_list.len() != preload.tenant_manifest.offloaded_timelines.len();
         {
             let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap();
             offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
         }
-        if needs_manifest_upload {
-            self.store_tenant_manifest().await?;
+
+        // Stash the preloaded tenant manifest, and upload a new manifest if changed.
+        //
+        // NB: this must happen after the tenant is fully populated above. In particular the
+        // offloaded timelines, which are included in the manifest.
+        {
+            let mut guard = self.remote_tenant_manifest.lock().await;
+            assert!(guard.is_none(), "tenant manifest set before preload"); // first populated here
+            *guard = preload.tenant_manifest;
         }
+        self.maybe_upload_tenant_manifest().await?;
 
         // The local filesystem contents are a cache of what's in the remote IndexPart;
         // IndexPart is the source of truth.
@@ -2219,7 +2233,7 @@ impl Tenant {
         };
 
         // Upload new list of offloaded timelines to S3
-        self.store_tenant_manifest().await?;
+        self.maybe_upload_tenant_manifest().await?;
 
         // Activate the timeline (if it makes sense)
         if !(timeline.is_broken() || timeline.is_stopping()) {
@@ -4053,18 +4067,19 @@ impl Tenant {
 
     /// Generate an up-to-date TenantManifest based on the state of this Tenant.
     fn build_tenant_manifest(&self) -> TenantManifest {
-        let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
-
-        let mut timeline_manifests = timelines_offloaded
-            .iter()
-            .map(|(_timeline_id, offloaded)| offloaded.manifest())
-            .collect::<Vec<_>>();
-        // Sort the manifests so that our output is deterministic
-        timeline_manifests.sort_by_key(|timeline_manifest| timeline_manifest.timeline_id);
+        // Collect the offloaded timelines, and sort them for deterministic output.
+        let offloaded_timelines = self
+            .timelines_offloaded
+            .lock()
+            .unwrap()
+            .values()
+            .map(|tli| tli.manifest())
+            .sorted_by_key(|m| m.timeline_id)
+            .collect_vec();
 
         TenantManifest {
             version: LATEST_TENANT_MANIFEST_VERSION,
-            offloaded_timelines: timeline_manifests,
+            offloaded_timelines,
         }
     }
 
@@ -4287,7 +4302,7 @@ impl Tenant {
             timelines: Mutex::new(HashMap::new()),
             timelines_creating: Mutex::new(HashSet::new()),
             timelines_offloaded: Mutex::new(HashMap::new()),
-            tenant_manifest_upload: Default::default(),
+            remote_tenant_manifest: Default::default(),
             gc_cs: tokio::sync::Mutex::new(()),
             walredo_mgr,
             remote_storage,
@@ -5520,27 +5535,35 @@ impl Tenant {
             .unwrap_or(0)
     }
 
-    /// Serialize and write the latest TenantManifest to remote storage.
-    pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> {
-        // Only one manifest write may be done at at time, and the contents of the manifest
-        // must be loaded while holding this lock. This makes it safe to call this function
-        // from anywhere without worrying about colliding updates.
+    /// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
+    /// manifest in `Self::remote_tenant_manifest`.
+    ///
+    /// TODO: instead of requiring callers to remember to call `maybe_upload_tenant_manifest` after
+    /// changing any `Tenant` state that's included in the manifest, consider making the manifest
+    /// the authoritative source of data with an API that automatically uploads on changes. Revisit
+    /// this when the manifest is more widely used and we have a better idea of the data model.
+    pub(crate) async fn maybe_upload_tenant_manifest(&self) -> Result<(), TenantManifestError> {
+        // Multiple tasks may call this function concurrently after mutating the Tenant runtime
+        // state, affecting the manifest generated by `build_tenant_manifest`. We use an async mutex
+        // to serialize these callers. `eq_ignoring_version` acts as a slightly inefficient but
+        // simple coalescing mechanism.
         let mut guard = tokio::select! {
-            g = self.tenant_manifest_upload.lock() => {
-                g
-            },
-            _ = self.cancel.cancelled() => {
-                return Err(TenantManifestError::Cancelled);
-            }
+            guard = self.remote_tenant_manifest.lock() => guard,
+            _ = self.cancel.cancelled() => return Err(TenantManifestError::Cancelled),
         };
 
+        // Build a new manifest.
         let manifest = self.build_tenant_manifest();
-        if Some(&manifest) == (*guard).as_ref() {
-            // Optimisation: skip uploads that don't change anything.
-            return Ok(());
+
+        // Check if the manifest has changed. We ignore the version number here, to avoid
+        // uploading every manifest on version number bumps.
+        if let Some(old) = guard.as_ref() {
+            if manifest.eq_ignoring_version(old) {
+                return Ok(());
+            }
         }
 
-        // Remote storage does no retries internally, so wrap it
+        // Upload the manifest. Remote storage does no retries internally, so retry here.
         match backoff::retry(
             || async {
                 upload_tenant_manifest(
@@ -5552,7 +5575,7 @@ impl Tenant {
                 )
                 .await
             },
-            |_e| self.cancel.is_cancelled(),
+            |_| self.cancel.is_cancelled(),
             FAILED_UPLOAD_WARN_THRESHOLD,
             FAILED_REMOTE_OP_RETRIES,
             "uploading tenant manifest",
diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs
index 543ccc219d..0e07acfbc8 100644
--- a/pageserver/src/tenant/remote_timeline_client/manifest.rs
+++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs
@@ -3,11 +3,15 @@ use serde::{Deserialize, Serialize};
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
 
-/// Tenant-shard scoped manifest
-#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)]
+/// Tenant shard manifest, stored in remote storage. Contains offloaded timelines and other tenant
+/// shard-wide information that must be persisted in remote storage.
+///
+/// The manifest is always updated on tenant attach, and as needed.
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
 pub struct TenantManifest {
-    /// Debugging aid describing the version of this manifest.
-    /// Can also be used for distinguishing breaking changes later on.
+    /// The manifest version. Incremented on manifest format changes, even non-breaking ones.
+    /// Manifests must generally always be backwards and forwards compatible for one release, to
+    /// allow release rollbacks.
     pub version: usize,
 
     /// The list of offloaded timelines together with enough information
@@ -16,6 +20,7 @@ pub struct TenantManifest {
     /// Note: the timelines mentioned in this list might be deleted, i.e.
     /// we don't hold an invariant that the references aren't dangling.
     /// Existence of index-part.json is the actual indicator of timeline existence.
+    #[serde(default)]
     pub offloaded_timelines: Vec<OffloadedTimelineManifest>,
 }
 
@@ -24,7 +29,7 @@ pub struct TenantManifest {
 /// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
 /// but the two datastructures serve different needs, this is for a persistent disk format
 /// that must be backwards compatible, while the other is only for informative purposes.
-#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)]
+#[derive(Clone, Debug, Serialize, Deserialize, Copy, PartialEq, Eq)]
 pub struct OffloadedTimelineManifest {
     pub timeline_id: TimelineId,
     /// Whether the timeline has a parent it has been branched off from or not
@@ -35,20 +40,114 @@ pub struct OffloadedTimelineManifest {
     pub archived_at: NaiveDateTime,
 }
 
+/// The newest manifest version. This should be incremented on changes, even non-breaking ones. We
+/// do not use deny_unknown_fields, so new fields are not breaking.
 pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;
 
 impl TenantManifest {
-    pub(crate) fn empty() -> Self {
-        Self {
-            version: LATEST_TENANT_MANIFEST_VERSION,
-            offloaded_timelines: vec![],
+    /// Returns true if the manifests are equal, ignoring the version number. This avoids
+    /// re-uploading all manifests just because the version number is bumped.
+    pub fn eq_ignoring_version(&self, other: &Self) -> bool {
+        // Fast path: if the version is equal, just compare directly.
+        if self.version == other.version {
+            return self == other;
         }
-    }
-    pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
-        serde_json::from_slice::<Self>(bytes)
+
+        // We could alternatively just clone and modify the version here.
+        let Self {
+            version: _, // ignore version
+            offloaded_timelines,
+        } = self;
+
+        offloaded_timelines == &other.offloaded_timelines
     }
 
-    pub(crate) fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
+    /// Decodes a manifest from JSON.
+    pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+        serde_json::from_slice(bytes)
+    }
+
+    /// Encodes a manifest as JSON.
+    pub fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
         serde_json::to_vec(self)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::str::FromStr;
+
+    use utils::id::TimelineId;
+
+    use super::*;
+
+    /// Empty manifests should be parsed. Version is required.
+    #[test]
+    fn parse_empty() -> anyhow::Result<()> {
+        let json = r#"{
+             "version": 0
+         }"#;
+        let expected = TenantManifest {
+            version: 0,
+            offloaded_timelines: Vec::new(),
+        };
+        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
+        Ok(())
+    }
+
+    /// Unknown fields should be ignored, for forwards compatibility.
+    #[test]
+    fn parse_unknown_fields() -> anyhow::Result<()> {
+        let json = r#"{
+             "version": 1,
+             "foo": "bar"
+         }"#;
+        let expected = TenantManifest {
+            version: 1,
+            offloaded_timelines: Vec::new(),
+        };
+        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
+        Ok(())
+    }
+
+    /// v1 manifests should be parsed, for backwards compatibility.
+    #[test]
+    fn parse_v1() -> anyhow::Result<()> {
+        let json = r#"{
+             "version": 1,
+             "offloaded_timelines": [
+                 {
+                     "timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
+                     "archived_at": "2025-03-07T11:07:11.373105434"
+                 },
+                 {
+                     "timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
+                     "ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
+                     "ancestor_retain_lsn": "0/1F79038",
+                     "archived_at": "2025-03-05T11:10:22.257901390"
+                 }
+             ]
+         }"#;
+        let expected = TenantManifest {
+            version: 1,
+            offloaded_timelines: vec![
+                OffloadedTimelineManifest {
+                    timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
+                    ancestor_timeline_id: None,
+                    ancestor_retain_lsn: None,
+                    archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
+                },
+                OffloadedTimelineManifest {
+                    timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
+                    ancestor_timeline_id: Some(TimelineId::from_str(
+                        "5c4df612fd159e63c1b7853fe94d97da",
+                    )?),
+                    ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
+                    archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
+                },
+            ],
+        };
+        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
+        Ok(())
+    }
+}
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index 7d9f47665a..89f6136530 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -61,6 +61,7 @@ pub(crate) async fn upload_index_part(
         .await
         .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }
+
 /// Serializes and uploads the given tenant manifest data to the remote storage.
 pub(crate) async fn upload_tenant_manifest(
     storage: &GenericRemoteStorage,
@@ -76,16 +77,14 @@ pub(crate) async fn upload_tenant_manifest(
     });
     pausable_failpoint!("before-upload-manifest-pausable");
 
-    let serialized = tenant_manifest.to_json_bytes()?;
-    let serialized = Bytes::from(serialized);
-
-    let tenant_manifest_site = serialized.len();
-
+    let serialized = Bytes::from(tenant_manifest.to_json_bytes()?);
+    let tenant_manifest_size = serialized.len();
     let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
+
     storage
         .upload_storage_object(
             futures::stream::once(futures::future::ready(Ok(serialized))),
-            tenant_manifest_site,
+            tenant_manifest_size,
             &remote_path,
             cancel,
         )
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 740f590735..64fcf1fe0d 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -410,10 +410,13 @@ impl DeleteTimelineFlow {
         // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
         // However, we handle this case in tenant loading code so the next time we attach, the issue is
         // resolved.
-        tenant.store_tenant_manifest().await.map_err(|e| match e {
-            TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
-            _ => DeleteTimelineError::Other(e.into()),
-        })?;
+        tenant
+            .maybe_upload_tenant_manifest()
+            .await
+            .map_err(|err| match err {
+                TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
+                err => DeleteTimelineError::Other(err.into()),
+            })?;
 
         *guard = Self::Finished;
 
diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs
index 43ffaa6aab..f46f1676c9 100644
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -111,7 +111,7 @@ pub(crate) async fn offload_timeline(
     // at the next restart attach it again.
     // For that to happen, we'd need to make the manifest reflect our *intended* state,
     // not our actual state of offloaded timelines.
-    tenant.store_tenant_manifest().await?;
+    tenant.maybe_upload_tenant_manifest().await?;
 
     tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})");
 
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
index 17abe1ea75..4360b42d68 100644
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -318,7 +318,7 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel
         neon_env_builder.pageserver_remote_storage,
         prefix=f"tenants/{str(tenant_id)}/",
     )
-    assert_prefix_empty(
+    assert_prefix_not_empty(
         neon_env_builder.pageserver_remote_storage,
         prefix=f"tenants/{str(tenant_id)}/tenant-manifest",
     )
@@ -387,7 +387,7 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel
             sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500")
             assert sum == sum_again
 
-        assert_prefix_empty(
+        assert_prefix_not_empty(
             neon_env_builder.pageserver_remote_storage,
             prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest",
         )
@@ -924,7 +924,7 @@ def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder):
         neon_env_builder.pageserver_remote_storage,
         prefix=f"tenants/{str(tenant_id)}/",
     )
-    assert_prefix_empty(
+    assert_prefix_not_empty(
         neon_env_builder.pageserver_remote_storage,
         prefix=f"tenants/{str(tenant_id)}/tenant-manifest",
     )

From 0875dacce05c6a66b9d824f5474d73e67d11f3e3 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Mon, 7 Apr 2025 17:19:06 -0400
Subject: [PATCH 069/140] fix(pageserver): more aggressively yield in
 gc-compaction, degrade errors to warnings (#11469)

## Problem

Fix various small issues discovered during gc-compaction rollout.

## Summary of changes

- Log level changes: if errors are from gc-compaction, fire a warning
instead of errors or critical errors.
- Yield to L0 compaction more aggressively. Instead of checking every 1k
keys, we check on every key. Sometimes a single key reconstruct takes a
long time.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/tasks.rs               |  9 +++++-
 pageserver/src/tenant/timeline.rs            | 29 ++++++++++++++++++--
 pageserver/src/tenant/timeline/compaction.rs | 10 +++----
 3 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 034e5f8c91..54588e788c 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -268,7 +268,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 error_run += 1;
                 let backoff =
                     exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS);
-                log_compaction_error(&err, Some((error_run, backoff)), cancel.is_cancelled());
+                log_compaction_error(
+                    &err,
+                    Some((error_run, backoff)),
+                    cancel.is_cancelled(),
+                    false,
+                );
                 continue;
             }
         }
@@ -285,6 +290,7 @@ pub(crate) fn log_compaction_error(
     err: &CompactionError,
     retry_info: Option<(u32, Duration)>,
     task_cancelled: bool,
+    degrade_to_warning: bool,
 ) {
     use CompactionError::*;
 
@@ -333,6 +339,7 @@ pub(crate) fn log_compaction_error(
         }
     } else {
         match level {
+            Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"),
             Level::ERROR => error!("Compaction failed: {err:#}"),
             Level::INFO => info!("Compaction failed: {err:#}"),
             level => unimplemented!("unexpected level {level:?}"),
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 6ca3704bc1..5174da0f43 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1940,7 +1940,7 @@ impl Timeline {
             )
             .await;
         if let Err(err) = &res {
-            log_compaction_error(err, None, cancel.is_cancelled());
+            log_compaction_error(err, None, cancel.is_cancelled(), false);
         }
         res
     }
@@ -6353,10 +6353,33 @@ impl Timeline {
 
     /// Reconstruct a value, using the given base image and WAL records in 'data'.
     async fn reconstruct_value(
+        &self,
+        key: Key,
+        request_lsn: Lsn,
+        data: ValueReconstructState,
+    ) -> Result<Bytes, PageReconstructError> {
+        self.reconstruct_value_inner(key, request_lsn, data, false)
+            .await
+    }
+
+    /// Reconstruct a value, using the given base image and WAL records in 'data'. It does not fire critical errors because
+    /// sometimes it is expected to fail due to unreplayable history described in <https://github.com/neondatabase/neon/issues/10395>.
+    async fn reconstruct_value_wo_critical_error(
+        &self,
+        key: Key,
+        request_lsn: Lsn,
+        data: ValueReconstructState,
+    ) -> Result<Bytes, PageReconstructError> {
+        self.reconstruct_value_inner(key, request_lsn, data, true)
+            .await
+    }
+
+    async fn reconstruct_value_inner(
         &self,
         key: Key,
         request_lsn: Lsn,
         mut data: ValueReconstructState,
+        no_critical_error: bool,
     ) -> Result<Bytes, PageReconstructError> {
         // Perform WAL redo if needed
         data.records.reverse();
@@ -6413,7 +6436,9 @@ impl Timeline {
                     Ok(img) => img,
                     Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
                     Err(walredo::Error::Other(err)) => {
-                        critical!("walredo failure during page reconstruction: {err:?}");
+                        if !no_critical_error {
+                            critical!("walredo failure during page reconstruction: {err:?}");
+                        }
                         return Err(PageReconstructError::WalRedo(
                             err.context("reconstruct a page image"),
                         ));
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 73f6691f14..8403c0a7d9 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -448,7 +448,7 @@ impl GcCompactionQueue {
     ) -> Result<CompactionOutcome, CompactionError> {
         let res = self.iteration_inner(cancel, ctx, gc_block, timeline).await;
         if let Err(err) = &res {
-            log_compaction_error(err, None, cancel.is_cancelled());
+            log_compaction_error(err, None, cancel.is_cancelled(), true);
         }
         match res {
             Ok(res) => Ok(res),
@@ -2410,7 +2410,9 @@ impl Timeline {
                 } else {
                     lsn_split_points[i]
                 };
-                let img = self.reconstruct_value(key, request_lsn, state).await?;
+                let img = self
+                    .reconstruct_value_wo_critical_error(key, request_lsn, state)
+                    .await?;
                 Some((request_lsn, img))
             } else {
                 None
@@ -3106,8 +3108,6 @@ impl Timeline {
         // the key and LSN range are determined. However, to keep things simple here, we still
         // create this writer, and discard the writer in the end.
 
-        let mut keys_processed = 0;
-
         while let Some(((key, lsn, val), desc)) = merge_iter
             .next_with_trace()
             .await
@@ -3118,9 +3118,7 @@ impl Timeline {
                 return Err(CompactionError::ShuttingDown);
             }
 
-            keys_processed += 1;
             let should_yield = yield_for_l0
-                && keys_processed % 1000 == 0
                 && self
                     .l0_compaction_trigger
                     .notified()

From b2a0b2e9ddb0db0621c6edaaf9efd1a10a00f82d Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 8 Apr 2025 06:52:50 +0300
Subject: [PATCH 070/140] Skip hole tags in local_cache view (#11454)

## Problem

If the local file cache is shrunk, so that we punch some holes in the
underlying file, the local_cache view displays the holes incorrectly.
See https://github.com/neondatabase/neon/issues/10770

## Summary of changes

Skip hole tags in the local_cache view.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/file_cache.c                 | 29 ++++++++++++++++----------
 test_runner/regress/test_lfc_resize.py | 22 +++++++++++++++++++
 2 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 97a4c39e49..2505fcb847 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -1563,8 +1563,12 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				hash_seq_init(&status, lfc_hash);
 				while ((entry = hash_seq_search(&status)) != NULL)
 				{
-					for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
-						n_pages += GET_STATE(entry, i) == AVAILABLE;
+					/* Skip hole tags */
+					if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
+					{
+						for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
+							n_pages += GET_STATE(entry, i) == AVAILABLE;
+					}
 				}
 			}
 		}
@@ -1592,16 +1596,19 @@ local_cache_pages(PG_FUNCTION_ARGS)
 			{
 				for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
 				{
-					if (GET_STATE(entry, i) == AVAILABLE)
+					if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
 					{
-						fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
-						fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
-						fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
-						fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
-						fctx->record[n].forknum = entry->key.forkNum;
-						fctx->record[n].blocknum = entry->key.blockNum + i;
-						fctx->record[n].accesscount = entry->access_count;
-						n += 1;
+						if (GET_STATE(entry, i) == AVAILABLE)
+						{
+							fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
+							fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
+							fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
+							fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
+							fctx->record[n].forknum = entry->key.forkNum;
+							fctx->record[n].blocknum = entry->key.blockNum + i;
+							fctx->record[n].accesscount = entry->access_count;
+							n += 1;
+						}
 					}
 				}
 			}
diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py
index 51074751e0..83fd3aa719 100644
--- a/test_runner/regress/test_lfc_resize.py
+++ b/test_runner/regress/test_lfc_resize.py
@@ -49,6 +49,8 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
     conn = endpoint.connect()
     cur = conn.cursor()
 
+    cur.execute("create extension neon")
+
     def get_lfc_size() -> tuple[int, int]:
         lfc_file_path = endpoint.lfc_path()
         lfc_file_size = lfc_file_path.stat().st_size
@@ -103,3 +105,23 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
         time.sleep(1)
 
     assert int(lfc_file_blocks) <= 128 * 1024
+
+    # Now test that number of rows returned by local_cache is the same as file_cache_used_pages.
+    # Perform several iterations to make cache cache content stabilized.
+    nretries = 10
+    while True:
+        cur.execute("select count(*) from local_cache")
+        local_cache_size = cur.fetchall()[0][0]
+
+        cur.execute(
+            "select lfc_value::bigint FROM neon_lfc_stats where lfc_key='file_cache_used_pages'"
+        )
+        used_pages = cur.fetchall()[0][0]
+
+        if local_cache_size == used_pages or nretries == 0:
+            break
+
+        nretries = nretries - 1
+        time.sleep(1)
+
+    assert local_cache_size == used_pages

From 7ffcbfde9ae274fceca9601d11b97393c8414e36 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 8 Apr 2025 12:03:56 +0300
Subject: [PATCH 071/140] refactor: Move LFC function prototypes to separate
 header file (#11458)

Also, move the call to the lfc_init() function. It was weird to have it
in libpagestore.c, when libpagestore.c otherwise had nothing to do with
the LFC. Move it directly into _PG_init()
---
 pgxn/neon/file_cache.c       |  2 +-
 pgxn/neon/file_cache.h       | 52 ++++++++++++++++++++++++++++++++++++
 pgxn/neon/libpagestore.c     |  2 --
 pgxn/neon/neon.c             |  2 ++
 pgxn/neon/neon.h             | 10 +++++++
 pgxn/neon/pagestore_client.h | 42 -----------------------------
 pgxn/neon/pagestore_smgr.c   |  1 +
 7 files changed, 66 insertions(+), 45 deletions(-)
 create mode 100644 pgxn/neon/file_cache.h

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 2505fcb847..8c2990e57a 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -21,7 +21,6 @@
 #include "access/xlog.h"
 #include "funcapi.h"
 #include "miscadmin.h"
-#include "pagestore_client.h"
 #include "common/hashfn.h"
 #include "pgstat.h"
 #include "port/pg_iovec.h"
@@ -43,6 +42,7 @@
 
 #include "hll.h"
 #include "bitmap.h"
+#include "file_cache.h"
 #include "neon.h"
 #include "neon_lwlsncache.h"
 #include "neon_perf_counters.h"
diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h
new file mode 100644
index 0000000000..849558b83d
--- /dev/null
+++ b/pgxn/neon/file_cache.h
@@ -0,0 +1,52 @@
+/*-------------------------------------------------------------------------
+ *
+ * file_cache.h
+ *	  Local File Cache definitions
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FILE_CACHE_h
+#define FILE_CACHE_h
+
+#include "neon_pgversioncompat.h"
+
+/* GUCs */
+extern bool lfc_store_prefetch_result;
+
+/* functions for local file cache */
+extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
+					   BlockNumber blkno, const void *const *buffers,
+					   BlockNumber nblocks);
+/* returns number of blocks read, with one bit set in *read for each  */
+extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum,
+							BlockNumber blkno, void **buffers,
+							BlockNumber nblocks, bits8 *mask);
+
+extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
+							   BlockNumber blkno);
+extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
+							   BlockNumber blkno, int nblocks, bits8 *bitmap);
+extern void lfc_init(void);
+extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
+						 const void* buffer, XLogRecPtr lsn);
+
+
+static inline bool
+lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+		 void *buffer)
+{
+	bits8		rv = 0;
+	return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
+}
+
+static inline void
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+		  const void *buffer)
+{
+	return lfc_writev(rinfo, forkNum, blkno, &buffer, 1);
+}
+
+#endif							/* FILE_CACHE_H */
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 60b2249461..9ea708f29a 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -1475,6 +1475,4 @@ pg_init_libpagestore(void)
 	}
 
 	memset(page_servers, 0, sizeof(page_servers));
-
-	lfc_init();
 }
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 081025e2d5..69da83f3fb 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -29,6 +29,7 @@
 #include "utils/guc_tables.h"
 
 #include "extension_server.h"
+#include "file_cache.h"
 #include "neon.h"
 #include "neon_lwlsncache.h"
 #include "control_plane_connector.h"
@@ -434,6 +435,7 @@ _PG_init(void)
 #endif
 
 	pg_init_libpagestore();
+	lfc_init();
 	pg_init_walproposer();
 	init_lwlsncache();
 
diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h
index e2fa136e37..a4339c9776 100644
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -47,6 +47,16 @@ extern uint32		WAIT_EVENT_NEON_WAL_DL;
 #define WAIT_EVENT_NEON_WAL_DL			WAIT_EVENT_WAL_READ
 #endif
 
+
+#define NEON_TAG "[NEON_SMGR] "
+#define neon_log(tag, fmt, ...) ereport(tag,                                  \
+										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
+										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
+#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag,	\
+														(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
+														 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
+
+
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);
 extern void pagestore_smgr_init(void);
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index a2e3d57e47..6ddad21362 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -58,14 +58,6 @@ typedef struct
 
 #define messageTag(m) (((const NeonMessage *)(m))->tag)
 
-#define NEON_TAG "[NEON_SMGR] "
-#define neon_log(tag, fmt, ...) ereport(tag,                                  \
-										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
-										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
-#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag,	\
-														(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
-														 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
-
 /* SLRUs downloadable from page server */
 typedef enum {
 	SLRU_CLOG,
@@ -234,7 +226,6 @@ extern char *neon_timeline;
 extern char *neon_tenant;
 extern int32 max_cluster_size;
 extern int  neon_protocol_version;
-extern bool lfc_store_prefetch_result;
 
 extern shardno_t get_shard_number(BufferTag* tag);
 
@@ -285,37 +276,4 @@ extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumb
 extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
 extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);
 
-/* functions for local file cache */
-extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
-					   BlockNumber blkno, const void *const *buffers,
-					   BlockNumber nblocks);
-/* returns number of blocks read, with one bit set in *read for each  */
-extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum,
-							BlockNumber blkno, void **buffers,
-							BlockNumber nblocks, bits8 *mask);
-
-extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
-							   BlockNumber blkno);
-extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
-							   BlockNumber blkno, int nblocks, bits8 *bitmap);
-extern void lfc_init(void);
-extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
-						 const void* buffer, XLogRecPtr lsn);
-
-
-static inline bool
-lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-		 void *buffer)
-{
-	bits8		rv = 0;
-	return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
-}
-
-static inline void
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-		  const void *buffer)
-{
-	return lfc_writev(rinfo, forkNum, blkno, &buffer, 1);
-}
-
 #endif							/* PAGESTORE_CLIENT_H */
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index eb8df11923..6d58f4f28f 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -65,6 +65,7 @@
 #include "utils/timeout.h"
 
 #include "bitmap.h"
+#include "file_cache.h"
 #include "neon.h"
 #include "neon_lwlsncache.h"
 #include "neon_perf_counters.h"

From 8a6d0dccaacc8a37c98c12fadb7bb3a0fb19ed4a Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 8 Apr 2025 12:01:15 +0200
Subject: [PATCH 072/140] build(deps): bump tokio from 1.38.0 to 1.38.2 in
 /test_runner/pg_clients/rust/tokio-postgres in the cargo group across 1
 directory (#11478)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 test_runner/pg_clients/rust/tokio-postgres/Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
index 027be03707..22c0e461b5 100644
--- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
+++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
@@ -808,9 +808,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
 [[package]]
 name = "tokio"
-version = "1.38.0"
+version = "1.38.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a"
+checksum = "68722da18b0fc4a05fdc1120b302b82051265792a1e1b399086e9b204b10ad3d"
 dependencies = [
  "backtrace",
  "bytes",

From 7791a49dd4e40cdaad004d998f956e0730666853 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Tue, 8 Apr 2025 14:03:38 +0400
Subject: [PATCH 073/140] fix(tests): improve test_scrubber_tenant_snapshot
 stability (#11471)

## Problem
`test_scrubber_tenant_snapshot` is flaky with `request was dropped`
errors. More details are in the issue.
- Closes: https://github.com/neondatabase/neon/issues/11278

## Summary of changes
- Disable shard scheduling during pageservers restart
- Add `reconcile_until_idle` in the end of the test
---
 storage_controller/src/service.rs            |  2 +-
 test_runner/regress/test_storage_scrubber.py | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 50f642deaf..5e53051727 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -7270,7 +7270,7 @@ impl Service {
             }
 
             // Eventual consistency: if an earlier reconcile job failed, and the shard is still
-            // dirty, spawn another rone
+            // dirty, spawn another one
             if self
                 .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
                 .is_some()
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 70af299de3..03cd133ccb 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -75,7 +75,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
         tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)]
 
     # Let shards finish rescheduling to other pageservers: this makes the rest of the test more stable
-    # is it won't overlap with migrations
+    # as it won't overlap with migrations
     env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120)
 
     output_path = neon_env_builder.test_output_dir / "snapshot"
@@ -87,6 +87,13 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
 
     workload.stop()
 
+    # Disable scheduling, so the storage controller doesn't migrate shards around
+    # while we are stopping pageservers
+    env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"})
+    env.storage_controller.allowed_errors.extend(
+        [".*Scheduling is disabled by policy Stop.*", ".*Skipping reconcile for policy Stop.*"]
+    )
+
     # Stop pageservers
     for pageserver in env.pageservers:
         pageserver.stop()
@@ -127,9 +134,16 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
     for pageserver in env.pageservers:
         pageserver.start()
 
+    # Turn scheduling back on.
+    # We don't care about optimizations, so enable only essential scheduling
+    env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"})
+
     # Check we can read everything
     workload.validate()
 
+    # Reconcile to avoid a race between test shutdown and background reconciliation (#11278)
+    env.storage_controller.reconcile_until_idle()
+
 
 def drop_local_state(env: NeonEnv, tenant_id: TenantId):
     env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})

From a7142f3bc6a5bb69ac640a9af23231e6264dd971 Mon Sep 17 00:00:00 2001
From: Roman Zaynetdinov <roman@neon.tech>
Date: Tue, 8 Apr 2025 17:03:09 +0300
Subject: [PATCH 074/140] Configure rsyslog for logs export using the spec
 (#11338)

- Work on https://github.com/neondatabase/cloud/issues/24896
- Cplane part https://github.com/neondatabase/cloud/pull/26808

Instead of reconfiguring rsyslog via an API endpoint [we have
agreed](https://neondb.slack.com/archives/C04DGM6SMTM/p1743513810964509?thread_ts=1743170369.865859&cid=C04DGM6SMTM)
to have a new `logs_export_host` field as part of the compute spec.

---------

Co-authored-by: Tristan Partin <tristan@neon.tech>
---
 compute_tools/src/compute.rs               | 15 +++++------
 compute_tools/src/config.rs                |  4 +--
 compute_tools/src/http/openapi_spec.yaml   | 30 ----------------------
 compute_tools/src/http/routes/configure.rs | 27 +------------------
 compute_tools/src/http/server.rs           |  1 -
 compute_tools/src/rsyslog.rs               | 20 +--------------
 control_plane/src/endpoint.rs              |  1 +
 libs/compute_api/src/requests.rs           |  6 -----
 libs/compute_api/src/spec.rs               |  7 ++---
 9 files changed, 15 insertions(+), 96 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 70b91c781a..9dfcde1dbc 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -661,15 +661,8 @@ impl ComputeNode {
         }
 
         // Configure and start rsyslog for Postgres logs export
-        if self.has_feature(ComputeFeature::PostgresLogsExport) {
-            if let Some(ref project_id) = pspec.spec.cluster.cluster_id {
-                let host = PostgresLogsRsyslogConfig::default_host(project_id);
-                let conf = PostgresLogsRsyslogConfig::new(Some(&host));
-                configure_postgres_logs_export(conf)?;
-            } else {
-                warn!("not configuring rsyslog for Postgres logs export: project ID is missing")
-            }
-        }
+        let conf = PostgresLogsRsyslogConfig::new(pspec.spec.logs_export_host.as_deref());
+        configure_postgres_logs_export(conf)?;
 
         // Launch remaining service threads
         let _monitor_handle = launch_monitor(self);
@@ -1573,6 +1566,10 @@ impl ComputeNode {
             });
         }
 
+        // Reconfigure rsyslog for Postgres logs export
+        let conf = PostgresLogsRsyslogConfig::new(spec.logs_export_host.as_deref());
+        configure_postgres_logs_export(conf)?;
+
         // Write new config
         let pgdata_path = Path::new(&self.params.pgdata);
         config::write_postgres_conf(
diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index 614ab076ff..92939f816c 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -7,7 +7,7 @@ use std::io::prelude::*;
 use std::path::Path;
 
 use compute_api::responses::TlsConfig;
-use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, GenericOption};
+use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
 
 use crate::pg_helpers::{
     GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
@@ -255,7 +255,7 @@ pub fn write_postgres_conf(
 
     // We need Postgres to send logs to rsyslog so that we can forward them
     // further to customers' log aggregation systems.
-    if spec.features.contains(&ComputeFeature::PostgresLogsExport) {
+    if spec.logs_export_host.is_some() {
         writeln!(file, "log_destination='stderr,syslog'")?;
     }
 
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index 7c8f72440f..bbdb7d0917 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -306,36 +306,6 @@ paths:
               schema:
                 $ref: "#/components/schemas/GenericError"
 
-  /configure_telemetry:
-    post:
-      tags:
-        - Configure
-      summary: Configure rsyslog
-      description: |
-        This API endpoint configures rsyslog to forward Postgres logs
-        to a specified otel collector.
-      operationId: configureTelemetry
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              type: object
-              properties:
-                logs_export_host:
-                  type: string
-                  description: |
-                    Hostname and the port of the otel collector. Leave empty to disable logs forwarding.
-                    Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:54526
-      responses:
-        204:
-          description: "Telemetry configured successfully"
-        500:
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/GenericError"
-
 components:
   securitySchemes:
     JWT:
diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs
index 5c9dd22c3d..3c5a6a6d41 100644
--- a/compute_tools/src/http/routes/configure.rs
+++ b/compute_tools/src/http/routes/configure.rs
@@ -1,11 +1,9 @@
 use std::sync::Arc;
 
-use axum::body::Body;
 use axum::extract::State;
 use axum::response::Response;
-use compute_api::requests::{ConfigurationRequest, ConfigureTelemetryRequest};
+use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
-use compute_api::spec::ComputeFeature;
 use http::StatusCode;
 use tokio::task;
 use tracing::info;
@@ -13,7 +11,6 @@ use tracing::info;
 use crate::compute::{ComputeNode, ParsedSpec};
 use crate::http::JsonResponse;
 use crate::http::extract::Json;
-use crate::rsyslog::{PostgresLogsRsyslogConfig, configure_postgres_logs_export};
 
 // Accept spec in JSON format and request compute configuration. If anything
 // goes wrong after we set the compute status to `ConfigurationPending` and
@@ -95,25 +92,3 @@ pub(in crate::http) async fn configure(
 
     JsonResponse::success(StatusCode::OK, body)
 }
-
-pub(in crate::http) async fn configure_telemetry(
-    State(compute): State<Arc<ComputeNode>>,
-    request: Json<ConfigureTelemetryRequest>,
-) -> Response {
-    if !compute.has_feature(ComputeFeature::PostgresLogsExport) {
-        return JsonResponse::error(
-            StatusCode::PRECONDITION_FAILED,
-            "Postgres logs export feature is not enabled".to_string(),
-        );
-    }
-
-    let conf = PostgresLogsRsyslogConfig::new(request.logs_export_host.as_deref());
-    if let Err(err) = configure_postgres_logs_export(conf) {
-        return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, err.to_string());
-    }
-
-    Response::builder()
-        .status(StatusCode::NO_CONTENT)
-        .body(Body::from(""))
-        .unwrap()
-}
diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs
index 179369e3ef..10f767e97c 100644
--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -87,7 +87,6 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
                 let authenticated_router = Router::<Arc<ComputeNode>>::new()
                     .route("/check_writability", post(check_writability::is_writable))
                     .route("/configure", post(configure::configure))
-                    .route("/configure_telemetry", post(configure::configure_telemetry))
                     .route("/database_schema", get(database_schema::get_schema_dump))
                     .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
                     .route("/insights", get(insights::get_insights))
diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs
index 80594db3f1..ba08302df2 100644
--- a/compute_tools/src/rsyslog.rs
+++ b/compute_tools/src/rsyslog.rs
@@ -119,16 +119,9 @@ impl<'a> PostgresLogsRsyslogConfig<'a> {
         };
         Ok(config_content)
     }
-
-    /// Returns the default host for otel collector that receives Postgres logs
-    pub fn default_host(project_id: &str) -> String {
-        format!(
-            "config-{}-collector.neon-telemetry.svc.cluster.local:10514",
-            project_id
-        )
-    }
 }
 
+/// Writes rsyslogd configuration for Postgres logs export and restarts rsyslog.
 pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> {
     let new_config = conf.build()?;
     let current_config = PostgresLogsRsyslogConfig::current_config()?;
@@ -261,16 +254,5 @@ mod tests {
             let res = conf.build();
             assert!(res.is_err());
         }
-
-        {
-            // Verify config with default host
-            let host = PostgresLogsRsyslogConfig::default_host("shy-breeze-123");
-            let conf = PostgresLogsRsyslogConfig::new(Some(&host));
-            let res = conf.build();
-            assert!(res.is_ok());
-            let conf_str = res.unwrap();
-            assert!(conf_str.contains(r#"shy-breeze-123"#));
-            assert!(conf_str.contains(r#"port="10514""#));
-        }
     }
 }
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index b46d616827..3137bde161 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -670,6 +670,7 @@ impl Endpoint {
             reconfigure_concurrency: self.reconfigure_concurrency,
             drop_subscriptions_before_start: self.drop_subscriptions_before_start,
             audit_log_level: ComputeAudit::Disabled,
+            logs_export_host: None::<String>,
         };
 
         // this strange code is needed to support respec() in tests
diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs
index d88451c549..3fbdfcf83f 100644
--- a/libs/compute_api/src/requests.rs
+++ b/libs/compute_api/src/requests.rs
@@ -30,9 +30,3 @@ pub struct SetRoleGrantsRequest {
     pub privileges: Vec<Privilege>,
     pub role: PgIdent,
 }
-
-/// Request of the /configure_telemetry API
-#[derive(Debug, Deserialize, Serialize)]
-pub struct ConfigureTelemetryRequest {
-    pub logs_export_host: Option<String>,
-}
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index cff1f4c89a..994a665a88 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -168,6 +168,10 @@ pub struct ComputeSpec {
     /// Extensions should be present in shared_preload_libraries
     #[serde(default)]
     pub audit_log_level: ComputeAudit,
+
+    /// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding.
+    /// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514
+    pub logs_export_host: Option<String>,
 }
 
 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -179,9 +183,6 @@ pub enum ComputeFeature {
     /// track short-lived connections as user activity.
     ActivityMonitorExperimental,
 
-    /// Allow to configure rsyslog for Postgres logs export
-    PostgresLogsExport,
-
     /// This is a special feature flag that is used to represent unknown feature flags.
     /// Basically all unknown to enum flags are represented as this one. See unit test
     /// `parse_unknown_features()` for more details.

From 6138d61592f0bfbb1d8f3f033d3fec2983ae7936 Mon Sep 17 00:00:00 2001
From: Mikhail Kot <mikhail@neon.tech>
Date: Tue, 8 Apr 2025 15:54:53 +0100
Subject: [PATCH 075/140] Object storage proxy (#11357)

Service targeted for storing and retrieving LFC prewarm data.
Can be used for proxying S3 access for Postgres extensions like
pg_mooncake as well.

Requests must include a Bearer JWT token.
Token is validated using a pemfile (should be passed in infra/).

Note: app is not tolerant to extra trailing slashes, see app.rs
`delete_prefix` test for comments.

Resolves: https://github.com/neondatabase/cloud/issues/26342
Unrelated changes: gate a `rename_noreplace` feature and disable it in
`remote_storage` so as `object_storage` can be built with musl
---
 .dockerignore                                 |   1 +
 Cargo.lock                                    |  55 +-
 Cargo.toml                                    |   2 +
 Dockerfile                                    |   2 +
 control_plane/src/bin/neon_local.rs           |  83 ++-
 control_plane/src/lib.rs                      |   1 +
 control_plane/src/local_env.rs                |  36 ++
 control_plane/src/object_storage.rs           | 107 ++++
 libs/remote_storage/Cargo.toml                |   2 +-
 libs/utils/Cargo.toml                         |   5 +-
 libs/utils/src/fs_ext.rs                      |   2 +
 libs/utils/src/fs_ext/rename_noreplace.rs     |   4 +-
 libs/utils/src/signals.rs                     |  29 +
 object_storage/Cargo.toml                     |  28 +
 object_storage/src/app.rs                     | 561 ++++++++++++++++++
 object_storage/src/lib.rs                     | 344 +++++++++++
 object_storage/src/main.rs                    |  65 ++
 pageserver/src/bin/pageserver.rs              |  28 +-
 test_runner/fixtures/neon_cli.py              |  13 +
 test_runner/fixtures/neon_fixtures.py         |  29 +
 test_runner/regress/test_neon_cli.py          |   7 +-
 test_runner/regress/test_object_storage.py    |  56 ++
 .../regress/test_storage_controller.py        |   2 +
 23 files changed, 1424 insertions(+), 38 deletions(-)
 create mode 100644 control_plane/src/object_storage.rs
 create mode 100644 object_storage/Cargo.toml
 create mode 100644 object_storage/src/app.rs
 create mode 100644 object_storage/src/lib.rs
 create mode 100644 object_storage/src/main.rs
 create mode 100644 test_runner/regress/test_object_storage.py

diff --git a/.dockerignore b/.dockerignore
index 9fafc2e4ba..ffa72eaf51 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -19,6 +19,7 @@
 !pageserver/
 !pgxn/
 !proxy/
+!object_storage/
 !storage_scrubber/
 !safekeeper/
 !storage_broker/
diff --git a/Cargo.lock b/Cargo.lock
index dbbf2c3357..aea8924f4f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3991,6 +3991,33 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "object_storage"
+version = "0.0.1"
+dependencies = [
+ "anyhow",
+ "axum",
+ "axum-extra",
+ "camino",
+ "camino-tempfile",
+ "futures",
+ "http-body-util",
+ "itertools 0.10.5",
+ "jsonwebtoken",
+ "prometheus",
+ "rand 0.8.5",
+ "remote_storage",
+ "serde",
+ "serde_json",
+ "test-log",
+ "tokio",
+ "tokio-util",
+ "tower 0.5.2",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "once_cell"
 version = "1.20.2"
@@ -4693,7 +4720,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
 dependencies = [
  "base64 0.22.1",
  "byteorder",
@@ -4727,7 +4754,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
 dependencies = [
  "bytes",
  "chrono",
@@ -6925,6 +6952,28 @@ dependencies = [
  "syn 2.0.100",
 ]
 
+[[package]]
+name = "test-log"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7f46083d221181166e5b6f6b1e5f1d499f3a76888826e6cb1d057554157cd0f"
+dependencies = [
+ "env_logger",
+ "test-log-macros",
+ "tracing-subscriber",
+]
+
+[[package]]
+name = "test-log-macros"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "888d0c3c6db53c0fdab160d2ed5e12ba745383d3e85813f2ea0f2b1475ab553f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.100",
+]
+
 [[package]]
 name = "thiserror"
 version = "1.0.69"
@@ -7172,7 +7221,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.10"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
 dependencies = [
  "async-trait",
  "byteorder",
diff --git a/Cargo.toml b/Cargo.toml
index 1f605681db..d957fa9070 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -40,6 +40,7 @@ members = [
     "libs/proxy/postgres-protocol2",
     "libs/proxy/postgres-types2",
     "libs/proxy/tokio-postgres2",
+    "object_storage",
 ]
 
 [workspace.package]
@@ -208,6 +209,7 @@ tracing-opentelemetry = "0.28"
 tracing-serde = "0.2.0"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 try-lock = "0.2.5"
+test-log = { version = "0.2.17", default-features = false, features = ["log"] }
 twox-hash = { version = "1.6.3", default-features = false }
 typed-json = "0.1"
 url = "2.2"
diff --git a/Dockerfile b/Dockerfile
index 01540e1925..848bfab921 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -89,6 +89,7 @@ RUN set -e \
       --bin storage_broker  \
       --bin storage_controller  \
       --bin proxy  \
+      --bin object_storage \
       --bin neon_local \
       --bin storage_scrubber \
       --locked --release
@@ -121,6 +122,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller  /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/object_storage      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber    /usr/local/bin
 
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 747268f80b..99f0d374c1 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -20,8 +20,10 @@ use compute_api::spec::ComputeMode;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::{
     InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
-    SafekeeperConf,
+    ObjectStorageConf, SafekeeperConf,
 };
+use control_plane::object_storage::OBJECT_STORAGE_DEFAULT_PORT;
+use control_plane::object_storage::ObjectStorage;
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::{
@@ -91,6 +93,8 @@ enum NeonLocalCmd {
     #[command(subcommand)]
     Safekeeper(SafekeeperCmd),
     #[command(subcommand)]
+    ObjectStorage(ObjectStorageCmd),
+    #[command(subcommand)]
     Endpoint(EndpointCmd),
     #[command(subcommand)]
     Mappings(MappingsCmd),
@@ -454,6 +458,32 @@ enum SafekeeperCmd {
     Restart(SafekeeperRestartCmdArgs),
 }
 
+#[derive(clap::Subcommand)]
+#[clap(about = "Manage object storage")]
+enum ObjectStorageCmd {
+    Start(ObjectStorageStartCmd),
+    Stop(ObjectStorageStopCmd),
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Start object storage")]
+struct ObjectStorageStartCmd {
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
+    #[arg(default_value = "10s")]
+    start_timeout: humantime::Duration,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Stop object storage")]
+struct ObjectStorageStopCmd {
+    #[arg(value_enum, default_value = "fast")]
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
+    stop_mode: StopMode,
+}
+
 #[derive(clap::Args)]
 #[clap(about = "Start local safekeeper")]
 struct SafekeeperStartCmdArgs {
@@ -759,6 +789,7 @@ fn main() -> Result<()> {
             }
             NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)),
             NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)),
+            NeonLocalCmd::ObjectStorage(subcmd) => rt.block_on(handle_object_storage(&subcmd, env)),
             NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)),
             NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env),
         };
@@ -975,6 +1006,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
                     }
                 })
                 .collect(),
+            object_storage: ObjectStorageConf {
+                port: OBJECT_STORAGE_DEFAULT_PORT,
+            },
             pg_distrib_dir: None,
             neon_distrib_dir: None,
             default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
@@ -1683,6 +1717,41 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) ->
     Ok(())
 }
 
+async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::LocalEnv) -> Result<()> {
+    use ObjectStorageCmd::*;
+    let storage = ObjectStorage::from_env(env);
+
+    // In tests like test_forward_compatibility or test_graceful_cluster_restart
+    // old neon binaries (without object_storage) are present
+    if !storage.bin.exists() {
+        eprintln!(
+            "{} binary not found. Ignore if this is a compatibility test",
+            storage.bin
+        );
+        return Ok(());
+    }
+
+    match subcmd {
+        Start(ObjectStorageStartCmd { start_timeout }) => {
+            if let Err(e) = storage.start(start_timeout).await {
+                eprintln!("object_storage start failed: {e}");
+                exit(1);
+            }
+        }
+        Stop(ObjectStorageStopCmd { stop_mode }) => {
+            let immediate = match stop_mode {
+                StopMode::Fast => false,
+                StopMode::Immediate => true,
+            };
+            if let Err(e) = storage.stop(immediate) {
+                eprintln!("proxy stop failed: {e}");
+                exit(1);
+            }
+        }
+    };
+    Ok(())
+}
+
 async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::LocalEnv) -> Result<()> {
     match subcmd {
         StorageBrokerCmd::Start(args) => {
@@ -1777,6 +1846,13 @@ async fn handle_start_all_impl(
                     .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
             });
         }
+
+        js.spawn(async move {
+            ObjectStorage::from_env(env)
+                .start(&retry_timeout)
+                .await
+                .map_err(|e| e.context("start object_storage"))
+        });
     })();
 
     let mut errors = Vec::new();
@@ -1874,6 +1950,11 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
         }
     }
 
+    let storage = ObjectStorage::from_env(env);
+    if let Err(e) = storage.stop(immediate) {
+        eprintln!("object_storage stop failed: {:#}", e);
+    }
+
     for ps_conf in &env.pageservers {
         let pageserver = PageServerNode::from_env(env, ps_conf);
         if let Err(e) = pageserver.stop(immediate) {
diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs
index 2af272f388..2d9fe2c807 100644
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -10,6 +10,7 @@ mod background_process;
 pub mod broker;
 pub mod endpoint;
 pub mod local_env;
+pub mod object_storage;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 3f3794c0ee..2616afbb16 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -18,6 +18,7 @@ use serde::{Deserialize, Serialize};
 use utils::auth::{Claims, encode_from_key_file};
 use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
 
+use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage};
 use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
 use crate::safekeeper::SafekeeperNode;
 
@@ -55,6 +56,7 @@ pub struct LocalEnv {
 
     // used to issue tokens during e.g pg start
     pub private_key_path: PathBuf,
+    pub public_key_path: PathBuf,
 
     pub broker: NeonBroker,
 
@@ -68,6 +70,8 @@ pub struct LocalEnv {
 
     pub safekeepers: Vec<SafekeeperConf>,
 
+    pub object_storage: ObjectStorageConf,
+
     // Control plane upcall API for pageserver: if None, we will not run storage_controller  If set, this will
     // be propagated into each pageserver's configuration.
     pub control_plane_api: Url,
@@ -95,6 +99,7 @@ pub struct OnDiskConfig {
     pub neon_distrib_dir: PathBuf,
     pub default_tenant_id: Option<TenantId>,
     pub private_key_path: PathBuf,
+    pub public_key_path: PathBuf,
     pub broker: NeonBroker,
     pub storage_controller: NeonStorageControllerConf,
     #[serde(
@@ -103,6 +108,7 @@ pub struct OnDiskConfig {
     )]
     pub pageservers: Vec<PageServerConf>,
     pub safekeepers: Vec<SafekeeperConf>,
+    pub object_storage: ObjectStorageConf,
     pub control_plane_api: Option<Url>,
     pub control_plane_hooks_api: Option<Url>,
     pub control_plane_compute_hook_api: Option<Url>,
@@ -136,11 +142,18 @@ pub struct NeonLocalInitConf {
     pub storage_controller: Option<NeonStorageControllerConf>,
     pub pageservers: Vec<NeonLocalInitPageserverConf>,
     pub safekeepers: Vec<SafekeeperConf>,
+    pub object_storage: ObjectStorageConf,
     pub control_plane_api: Option<Url>,
     pub control_plane_hooks_api: Option<Url>,
     pub generate_local_ssl_certs: bool,
 }
 
+#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)]
+#[serde(default)]
+pub struct ObjectStorageConf {
+    pub port: u16,
+}
+
 /// Broker config for cluster internal communication.
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
@@ -398,6 +411,10 @@ impl LocalEnv {
         self.pg_dir(pg_version, "lib")
     }
 
+    pub fn object_storage_bin(&self) -> PathBuf {
+        self.neon_distrib_dir.join("object_storage")
+    }
+
     pub fn pageserver_bin(&self) -> PathBuf {
         self.neon_distrib_dir.join("pageserver")
     }
@@ -431,6 +448,10 @@ impl LocalEnv {
         self.base_data_dir.join("safekeepers").join(data_dir_name)
     }
 
+    pub fn object_storage_data_dir(&self) -> PathBuf {
+        self.base_data_dir.join("object_storage")
+    }
+
     pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
         if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
             Ok(conf)
@@ -582,6 +603,7 @@ impl LocalEnv {
                 neon_distrib_dir,
                 default_tenant_id,
                 private_key_path,
+                public_key_path,
                 broker,
                 storage_controller,
                 pageservers,
@@ -591,6 +613,7 @@ impl LocalEnv {
                 control_plane_compute_hook_api: _,
                 branch_name_mappings,
                 generate_local_ssl_certs,
+                object_storage,
             } = on_disk_config;
             LocalEnv {
                 base_data_dir: repopath.to_owned(),
@@ -598,6 +621,7 @@ impl LocalEnv {
                 neon_distrib_dir,
                 default_tenant_id,
                 private_key_path,
+                public_key_path,
                 broker,
                 storage_controller,
                 pageservers,
@@ -606,6 +630,7 @@ impl LocalEnv {
                 control_plane_hooks_api,
                 branch_name_mappings,
                 generate_local_ssl_certs,
+                object_storage,
             }
         };
 
@@ -705,6 +730,7 @@ impl LocalEnv {
                 neon_distrib_dir: self.neon_distrib_dir.clone(),
                 default_tenant_id: self.default_tenant_id,
                 private_key_path: self.private_key_path.clone(),
+                public_key_path: self.public_key_path.clone(),
                 broker: self.broker.clone(),
                 storage_controller: self.storage_controller.clone(),
                 pageservers: vec![], // it's skip_serializing anyway
@@ -714,6 +740,7 @@ impl LocalEnv {
                 control_plane_compute_hook_api: None,
                 branch_name_mappings: self.branch_name_mappings.clone(),
                 generate_local_ssl_certs: self.generate_local_ssl_certs,
+                object_storage: self.object_storage.clone(),
             },
         )
     }
@@ -797,6 +824,7 @@ impl LocalEnv {
             control_plane_api,
             generate_local_ssl_certs,
             control_plane_hooks_api,
+            object_storage,
         } = conf;
 
         // Find postgres binaries.
@@ -828,6 +856,7 @@ impl LocalEnv {
         )
         .context("generate auth keys")?;
         let private_key_path = PathBuf::from("auth_private_key.pem");
+        let public_key_path = PathBuf::from("auth_public_key.pem");
 
         // create the runtime type because the remaining initialization code below needs
         // a LocalEnv instance op operation
@@ -838,6 +867,7 @@ impl LocalEnv {
             neon_distrib_dir,
             default_tenant_id: Some(default_tenant_id),
             private_key_path,
+            public_key_path,
             broker,
             storage_controller: storage_controller.unwrap_or_default(),
             pageservers: pageservers.iter().map(Into::into).collect(),
@@ -846,6 +876,7 @@ impl LocalEnv {
             control_plane_hooks_api,
             branch_name_mappings: Default::default(),
             generate_local_ssl_certs,
+            object_storage,
         };
 
         if generate_local_ssl_certs {
@@ -873,8 +904,13 @@ impl LocalEnv {
                 .context("pageserver init failed")?;
         }
 
+        ObjectStorage::from_env(&env)
+            .init()
+            .context("object storage init failed")?;
+
         // setup remote remote location for default LocalFs remote storage
         std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
+        std::fs::create_dir_all(env.base_data_dir.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR))?;
 
         env.persist_config()
     }
diff --git a/control_plane/src/object_storage.rs b/control_plane/src/object_storage.rs
new file mode 100644
index 0000000000..1a595b7809
--- /dev/null
+++ b/control_plane/src/object_storage.rs
@@ -0,0 +1,107 @@
+use crate::background_process::{self, start_process, stop_process};
+use crate::local_env::LocalEnv;
+use anyhow::anyhow;
+use anyhow::{Context, Result};
+use camino::Utf8PathBuf;
+use std::io::Write;
+use std::time::Duration;
+
+/// Directory within .neon which will be used by default for LocalFs remote storage.
+pub const OBJECT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/object_storage";
+pub const OBJECT_STORAGE_DEFAULT_PORT: u16 = 9993;
+
+pub struct ObjectStorage {
+    pub bin: Utf8PathBuf,
+    pub data_dir: Utf8PathBuf,
+    pub pemfile: Utf8PathBuf,
+    pub port: u16,
+}
+
+impl ObjectStorage {
+    pub fn from_env(env: &LocalEnv) -> ObjectStorage {
+        ObjectStorage {
+            bin: Utf8PathBuf::from_path_buf(env.object_storage_bin()).unwrap(),
+            data_dir: Utf8PathBuf::from_path_buf(env.object_storage_data_dir()).unwrap(),
+            pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(),
+            port: env.object_storage.port,
+        }
+    }
+
+    fn config_path(&self) -> Utf8PathBuf {
+        self.data_dir.join("object_storage.json")
+    }
+
+    fn listen_addr(&self) -> Utf8PathBuf {
+        format!("127.0.0.1:{}", self.port).into()
+    }
+
+    pub fn init(&self) -> Result<()> {
+        println!("Initializing object storage in {:?}", self.data_dir);
+        let parent = self.data_dir.parent().unwrap();
+
+        #[derive(serde::Serialize)]
+        struct Cfg {
+            listen: Utf8PathBuf,
+            pemfile: Utf8PathBuf,
+            local_path: Utf8PathBuf,
+            r#type: String,
+        }
+        let cfg = Cfg {
+            listen: self.listen_addr(),
+            pemfile: parent.join(self.pemfile.clone()),
+            local_path: parent.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR),
+            r#type: "LocalFs".to_string(),
+        };
+        std::fs::create_dir_all(self.config_path().parent().unwrap())?;
+        std::fs::write(self.config_path(), serde_json::to_string(&cfg)?)
+            .context("write object storage config")?;
+        Ok(())
+    }
+
+    pub async fn start(&self, retry_timeout: &Duration) -> Result<()> {
+        println!("Starting s3 proxy at {}", self.listen_addr());
+        std::io::stdout().flush().context("flush stdout")?;
+
+        let process_status_check = || async {
+            tokio::time::sleep(Duration::from_millis(500)).await;
+            let res = reqwest::Client::new()
+                .get(format!("http://{}/metrics", self.listen_addr()))
+                .send()
+                .await;
+            match res {
+                Ok(response) if response.status().is_success() => Ok(true),
+                Ok(_) => Err(anyhow!("Failed to query /metrics")),
+                Err(e) => Err(anyhow!("Failed to check node status: {e}")),
+            }
+        };
+
+        let res = start_process(
+            "object_storage",
+            &self.data_dir.clone().into_std_path_buf(),
+            &self.bin.clone().into_std_path_buf(),
+            vec![self.config_path().to_string()],
+            vec![("RUST_LOG".into(), "debug".into())],
+            background_process::InitialPidFile::Create(self.pid_file()),
+            retry_timeout,
+            process_status_check,
+        )
+        .await;
+        if res.is_err() {
+            eprintln!("Logs:\n{}", std::fs::read_to_string(self.log_file())?);
+        }
+
+        res
+    }
+
+    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        stop_process(immediate, "object_storage", &self.pid_file())
+    }
+
+    fn log_file(&self) -> Utf8PathBuf {
+        self.data_dir.join("object_storage.log")
+    }
+
+    fn pid_file(&self) -> Utf8PathBuf {
+        self.data_dir.join("object_storage.pid")
+    }
+}
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 7bdf340f74..bd18d80915 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -28,7 +28,7 @@ toml_edit.workspace = true
 tracing.workspace = true
 scopeguard.workspace = true
 metrics.workspace = true
-utils.workspace = true
+utils = { path = "../utils", default-features = false }
 pin-project-lite.workspace = true
 
 azure_core.workspace = true
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 4180602ac7..fd2fa63fd0 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,7 +5,8 @@ edition.workspace = true
 license.workspace = true
 
 [features]
-default = []
+default = ["rename_noreplace"]
+rename_noreplace = []
 # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
 # which adds some runtime cost to run tests on outage conditions
 testing = ["fail/failpoints"]
@@ -35,7 +36,7 @@ serde_with.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["signal"] }
 tokio-tar.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = ["serde"] }
diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs
index a406ab0378..e16edaaa9a 100644
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -3,7 +3,9 @@ use std::{fs, io, path::Path};
 
 use anyhow::Context;
 
+#[cfg(feature = "rename_noreplace")]
 mod rename_noreplace;
+#[cfg(feature = "rename_noreplace")]
 pub use rename_noreplace::rename_noreplace;
 
 pub trait PathExt {
diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs
index fc6f794b57..d0c07353d0 100644
--- a/libs/utils/src/fs_ext/rename_noreplace.rs
+++ b/libs/utils/src/fs_ext/rename_noreplace.rs
@@ -8,7 +8,7 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
     dst: &P2,
 ) -> nix::Result<()> {
     {
-        #[cfg(target_os = "linux")]
+        #[cfg(all(target_os = "linux", target_env = "gnu"))]
         {
             nix::fcntl::renameat2(
                 None,
@@ -29,7 +29,7 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
             })??;
             nix::errno::Errno::result(res).map(drop)
         }
-        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+        #[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "macos")))]
         {
             std::compile_error!("OS does not support no-replace renames");
         }
diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs
index f2be1957c4..426bb65916 100644
--- a/libs/utils/src/signals.rs
+++ b/libs/utils/src/signals.rs
@@ -1,6 +1,8 @@
 pub use signal_hook::consts::TERM_SIGNALS;
 pub use signal_hook::consts::signal::*;
 use signal_hook::iterator::Signals;
+use tokio::signal::unix::{SignalKind, signal};
+use tracing::info;
 
 pub enum Signal {
     Quit,
@@ -36,3 +38,30 @@ impl ShutdownSignals {
         Ok(())
     }
 }
+
+/// Runs in a loop since we want to be responsive to multiple signals
+/// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown)
+/// <https://github.com/neondatabase/neon/issues/9740>
+pub async fn signal_handler(token: tokio_util::sync::CancellationToken) {
+    let mut sigint = signal(SignalKind::interrupt()).unwrap();
+    let mut sigterm = signal(SignalKind::terminate()).unwrap();
+    let mut sigquit = signal(SignalKind::quit()).unwrap();
+
+    loop {
+        let signal = tokio::select! {
+            _ = sigquit.recv() => {
+                info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
+                std::process::exit(111);
+            }
+            _ = sigint.recv() => "SIGINT",
+            _ = sigterm.recv() => "SIGTERM",
+        };
+
+        if !token.is_cancelled() {
+            info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
+            token.cancel();
+        } else {
+            info!("Got signal {signal}. Already shutting down.");
+        }
+    }
+}
diff --git a/object_storage/Cargo.toml b/object_storage/Cargo.toml
new file mode 100644
index 0000000000..17fbaefe6f
--- /dev/null
+++ b/object_storage/Cargo.toml
@@ -0,0 +1,28 @@
+[package]
+name = "object_storage"
+version = "0.0.1"
+edition.workspace = true
+license.workspace = true
+[dependencies]
+anyhow.workspace = true
+axum-extra.workspace = true
+axum.workspace = true
+camino.workspace = true
+futures.workspace = true
+jsonwebtoken.workspace = true
+prometheus.workspace = true
+remote_storage.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+tokio-util.workspace = true
+tokio.workspace = true
+tracing.workspace = true
+utils = { path = "../libs/utils", default-features = false }
+workspace_hack.workspace = true
+[dev-dependencies]
+camino-tempfile.workspace = true
+http-body-util.workspace = true
+itertools.workspace = true
+rand.workspace = true
+test-log.workspace = true
+tower.workspace = true
diff --git a/object_storage/src/app.rs b/object_storage/src/app.rs
new file mode 100644
index 0000000000..7b5627f0db
--- /dev/null
+++ b/object_storage/src/app.rs
@@ -0,0 +1,561 @@
+use anyhow::anyhow;
+use axum::body::{Body, Bytes};
+use axum::response::{IntoResponse, Response};
+use axum::{Router, http::StatusCode};
+use object_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
+use remote_storage::TimeoutOrCancel;
+use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, RemotePath};
+use std::{sync::Arc, time::SystemTime, time::UNIX_EPOCH};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info};
+use utils::backoff::retry;
+
+pub fn app(state: Arc<Storage>) -> Router<()> {
+    use axum::routing::{delete as _delete, get as _get};
+    let delete_prefix = _delete(delete_prefix);
+    Router::new()
+        .route(
+            "/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}",
+            _get(get).put(set).delete(delete),
+        )
+        .route(
+            "/{tenant_id}/{timeline_id}/{endpoint_id}",
+            delete_prefix.clone(),
+        )
+        .route("/{tenant_id}/{timeline_id}", delete_prefix.clone())
+        .route("/{tenant_id}", delete_prefix)
+        .route("/metrics", _get(metrics))
+        .route("/status", _get(async || StatusCode::OK.into_response()))
+        .with_state(state)
+}
+
+type Result = anyhow::Result<Response, Response>;
+type State = axum::extract::State<Arc<Storage>>;
+
+const CONTENT_TYPE: &str = "content-type";
+const APPLICATION_OCTET_STREAM: &str = "application/octet-stream";
+const WARN_THRESHOLD: u32 = 3;
+const MAX_RETRIES: u32 = 10;
+
+async fn metrics() -> Result {
+    prometheus::TextEncoder::new()
+        .encode_to_string(&prometheus::gather())
+        .map(|s| s.into_response())
+        .map_err(|e| internal_error(e, "/metrics", "collecting metrics"))
+}
+
+async fn get(S3Path { path }: S3Path, state: State) -> Result {
+    info!(%path, "downloading");
+    let download_err = |e| {
+        if let DownloadError::NotFound = e {
+            info!(%path, %e, "downloading"); // 404 is not an issue of _this_ service
+            return not_found(&path);
+        }
+        internal_error(e, &path, "downloading")
+    };
+    let cancel = state.cancel.clone();
+    let opts = &DownloadOpts::default();
+
+    let stream = retry(
+        async || state.storage.download(&path, opts, &cancel).await,
+        DownloadError::is_permanent,
+        WARN_THRESHOLD,
+        MAX_RETRIES,
+        "downloading",
+        &cancel,
+    )
+    .await
+    .unwrap_or(Err(DownloadError::Cancelled))
+    .map_err(download_err)?
+    .download_stream;
+
+    Response::builder()
+        .status(StatusCode::OK)
+        .header(CONTENT_TYPE, APPLICATION_OCTET_STREAM)
+        .body(Body::from_stream(stream))
+        .map_err(|e| internal_error(e, path, "reading response"))
+}
+
+// Best solution for files is multipart upload, but remote_storage doesn't support it,
+// so we can either read Bytes in memory and push at once or forward BodyDataStream to
+// remote_storage. The latter may seem more peformant, but BodyDataStream doesn't have a
+// guaranteed size() which may produce issues while uploading to s3.
+// So, currently we're going with an in-memory copy plus a boundary to prevent uploading
+// very large files.
+async fn set(S3Path { path }: S3Path, state: State, bytes: Bytes) -> Result {
+    info!(%path, "uploading");
+    let request_len = bytes.len();
+    let max_len = state.max_upload_file_limit;
+    if request_len > max_len {
+        return Err(bad_request(
+            anyhow!("File size {request_len} exceeds max {max_len}"),
+            "uploading",
+        ));
+    }
+
+    let cancel = state.cancel.clone();
+    let fun = async || {
+        let stream = bytes_to_stream(bytes.clone());
+        state
+            .storage
+            .upload(stream, request_len, &path, None, &cancel)
+            .await
+    };
+    retry(
+        fun,
+        TimeoutOrCancel::caused_by_cancel,
+        WARN_THRESHOLD,
+        MAX_RETRIES,
+        "uploading",
+        &cancel,
+    )
+    .await
+    .unwrap_or(Err(anyhow!("uploading cancelled")))
+    .map_err(|e| internal_error(e, path, "reading response"))?;
+    Ok(ok())
+}
+
+async fn delete(S3Path { path }: S3Path, state: State) -> Result {
+    info!(%path, "deleting");
+    let cancel = state.cancel.clone();
+    retry(
+        async || state.storage.delete(&path, &cancel).await,
+        TimeoutOrCancel::caused_by_cancel,
+        WARN_THRESHOLD,
+        MAX_RETRIES,
+        "deleting",
+        &cancel,
+    )
+    .await
+    .unwrap_or(Err(anyhow!("deleting cancelled")))
+    .map_err(|e| internal_error(e, path, "deleting"))?;
+    Ok(ok())
+}
+
+async fn delete_prefix(PrefixS3Path { path }: PrefixS3Path, state: State) -> Result {
+    info!(%path, "deleting prefix");
+    let cancel = state.cancel.clone();
+    retry(
+        async || state.storage.delete_prefix(&path, &cancel).await,
+        TimeoutOrCancel::caused_by_cancel,
+        WARN_THRESHOLD,
+        MAX_RETRIES,
+        "deleting prefix",
+        &cancel,
+    )
+    .await
+    .unwrap_or(Err(anyhow!("deleting prefix cancelled")))
+    .map_err(|e| internal_error(e, path, "deleting prefix"))?;
+    Ok(ok())
+}
+
+pub async fn check_storage_permissions(
+    client: &GenericRemoteStorage,
+    cancel: CancellationToken,
+) -> anyhow::Result<()> {
+    info!("storage permissions check");
+
+    // as_nanos() as multiple instances proxying same bucket may be started at once
+    let now = SystemTime::now()
+        .duration_since(UNIX_EPOCH)?
+        .as_nanos()
+        .to_string();
+
+    let path = RemotePath::from_string(&format!("write_access_{now}"))?;
+    info!(%path, "uploading");
+
+    let body = now.to_string();
+    let stream = bytes_to_stream(Bytes::from(body.clone()));
+    client
+        .upload(stream, body.len(), &path, None, &cancel)
+        .await?;
+
+    use tokio::io::AsyncReadExt;
+    info!(%path, "downloading");
+    let download_opts = DownloadOpts {
+        kind: remote_storage::DownloadKind::Small,
+        ..Default::default()
+    };
+    let mut body_read_buf = Vec::new();
+    let stream = client
+        .download(&path, &download_opts, &cancel)
+        .await?
+        .download_stream;
+    tokio_util::io::StreamReader::new(stream)
+        .read_to_end(&mut body_read_buf)
+        .await?;
+    let body_read = String::from_utf8(body_read_buf)?;
+    if body != body_read {
+        error!(%body, %body_read, "File contents do not match");
+        anyhow::bail!("Read back file doesn't match original")
+    }
+
+    info!(%path, "removing");
+    client.delete(&path, &cancel).await
+}
+
+fn bytes_to_stream(bytes: Bytes) -> impl futures::Stream<Item = std::io::Result<Bytes>> {
+    futures::stream::once(futures::future::ready(Ok(bytes)))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use axum::{body::Body, extract::Request, response::Response};
+    use http_body_util::BodyExt;
+    use itertools::iproduct;
+    use std::env::var;
+    use std::sync::Arc;
+    use std::time::Duration;
+    use test_log::test as testlog;
+    use tower::{Service, util::ServiceExt};
+    use utils::id::{TenantId, TimelineId};
+
+    // see libs/remote_storage/tests/test_real_s3.rs
+    const REAL_S3_ENV: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
+    const REAL_S3_BUCKET: &str = "REMOTE_STORAGE_S3_BUCKET";
+    const REAL_S3_REGION: &str = "REMOTE_STORAGE_S3_REGION";
+
+    async fn proxy() -> (Storage, Option<camino_tempfile::Utf8TempDir>) {
+        let cancel = CancellationToken::new();
+        let (dir, storage) = if var(REAL_S3_ENV).is_err() {
+            // tests execute in parallel and we need a new directory for each of them
+            let dir = camino_tempfile::tempdir().unwrap();
+            let fs =
+                remote_storage::LocalFs::new(dir.path().into(), Duration::from_secs(5)).unwrap();
+            (Some(dir), GenericRemoteStorage::LocalFs(fs))
+        } else {
+            // test_real_s3::create_s3_client is hard to reference, reimplementing here
+            let millis = SystemTime::now()
+                .duration_since(UNIX_EPOCH)
+                .unwrap()
+                .as_millis();
+            use rand::Rng;
+            let random = rand::thread_rng().r#gen::<u32>();
+
+            let s3_config = remote_storage::S3Config {
+                bucket_name: var(REAL_S3_BUCKET).unwrap(),
+                bucket_region: var(REAL_S3_REGION).unwrap(),
+                prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
+                endpoint: None,
+                concurrency_limit: std::num::NonZeroUsize::new(100).unwrap(),
+                max_keys_per_list_response: None,
+                upload_storage_class: None,
+            };
+            let bucket = remote_storage::S3Bucket::new(&s3_config, Duration::from_secs(1))
+                .await
+                .unwrap();
+            (None, GenericRemoteStorage::AwsS3(Arc::new(bucket)))
+        };
+
+        let proxy = Storage {
+            auth: object_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
+            storage,
+            cancel: cancel.clone(),
+            max_upload_file_limit: usize::MAX,
+        };
+        check_storage_permissions(&proxy.storage, cancel)
+            .await
+            .unwrap();
+        (proxy, dir)
+    }
+
+    // see libs/utils/src/auth.rs
+    const TEST_PUB_KEY_ED25519: &[u8] = b"
+-----BEGIN PUBLIC KEY-----
+MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
+-----END PUBLIC KEY-----
+";
+
+    const TEST_PRIV_KEY_ED25519: &[u8] = br#"
+-----BEGIN PRIVATE KEY-----
+MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
+-----END PRIVATE KEY-----
+"#;
+
+    async fn request(req: Request<Body>) -> Response<Body> {
+        let (proxy, _) = proxy().await;
+        app(Arc::new(proxy))
+            .into_service()
+            .oneshot(req)
+            .await
+            .unwrap()
+    }
+
+    #[testlog(tokio::test)]
+    async fn status() {
+        let res = Request::builder()
+            .uri("/status")
+            .body(Body::empty())
+            .map(request)
+            .unwrap()
+            .await;
+        assert_eq!(res.status(), StatusCode::OK);
+    }
+
+    fn routes() -> impl Iterator<Item = (&'static str, &'static str)> {
+        iproduct!(
+            vec!["/1", "/1/2", "/1/2/3", "/1/2/3/4"],
+            vec!["GET", "PUT", "DELETE"]
+        )
+    }
+
+    #[testlog(tokio::test)]
+    async fn no_token() {
+        for (uri, method) in routes() {
+            info!(%uri, %method);
+            let res = Request::builder()
+                .uri(uri)
+                .method(method)
+                .body(Body::empty())
+                .map(request)
+                .unwrap()
+                .await;
+            assert!(matches!(
+                res.status(),
+                StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST
+            ));
+        }
+    }
+
+    #[testlog(tokio::test)]
+    async fn invalid_token() {
+        for (uri, method) in routes() {
+            info!(%uri, %method);
+            let status = Request::builder()
+                .uri(uri)
+                .header("Authorization", "Bearer 123")
+                .method(method)
+                .body(Body::empty())
+                .map(request)
+                .unwrap()
+                .await;
+            assert!(matches!(
+                status.status(),
+                StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST
+            ));
+        }
+    }
+
+    const TENANT_ID: TenantId =
+        TenantId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]);
+    const TIMELINE_ID: TimelineId =
+        TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
+    const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
+    fn token() -> String {
+        let claims = object_storage::Claims {
+            tenant_id: TENANT_ID,
+            timeline_id: TIMELINE_ID,
+            endpoint_id: ENDPOINT_ID.into(),
+            exp: u64::MAX,
+        };
+        let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
+        let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
+        jsonwebtoken::encode(&header, &claims, &key).unwrap()
+    }
+
+    #[testlog(tokio::test)]
+    async fn unauthorized() {
+        let (proxy, _) = proxy().await;
+        let mut app = app(Arc::new(proxy)).into_service();
+        let token = token();
+        let args = itertools::iproduct!(
+            vec![TENANT_ID.to_string(), TenantId::generate().to_string()],
+            vec![TIMELINE_ID.to_string(), TimelineId::generate().to_string()],
+            vec![ENDPOINT_ID, "ep-ololo"]
+        )
+        .skip(1);
+
+        for ((uri, method), (tenant, timeline, endpoint)) in iproduct!(routes(), args) {
+            info!(%uri, %method, %tenant, %timeline, %endpoint);
+            let request = Request::builder()
+                .uri(format!("/{tenant}/{timeline}/{endpoint}/sub/path/key"))
+                .method(method)
+                .header("Authorization", format!("Bearer {}", token))
+                .body(Body::empty())
+                .unwrap();
+            let status = ServiceExt::ready(&mut app)
+                .await
+                .unwrap()
+                .call(request)
+                .await
+                .unwrap()
+                .status();
+            assert_eq!(status, StatusCode::UNAUTHORIZED);
+        }
+    }
+
+    #[testlog(tokio::test)]
+    async fn method_not_allowed() {
+        let token = token();
+        let iter = iproduct!(vec!["", "/.."], vec!["GET", "PUT"]);
+        for (key, method) in iter {
+            let status = Request::builder()
+                .uri(format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}{key}"))
+                .method(method)
+                .header("Authorization", format!("Bearer {token}"))
+                .body(Body::empty())
+                .map(request)
+                .unwrap()
+                .await
+                .status();
+            assert!(matches!(
+                status,
+                StatusCode::BAD_REQUEST | StatusCode::METHOD_NOT_ALLOWED
+            ));
+        }
+    }
+
+    async fn requests_chain(
+        chain: impl Iterator<Item = (String, &str, &'static str, StatusCode, bool)>,
+        token: impl Fn(&str) -> String,
+    ) {
+        let (proxy, _) = proxy().await;
+        let mut app = app(Arc::new(proxy)).into_service();
+        for (uri, method, body, expected_status, compare_body) in chain {
+            info!(%uri, %method, %body, %expected_status);
+            let bearer = format!("Bearer {}", token(&uri));
+            let request = Request::builder()
+                .uri(uri)
+                .method(method)
+                .header("Authorization", &bearer)
+                .body(Body::from(body))
+                .unwrap();
+            let response = ServiceExt::ready(&mut app)
+                .await
+                .unwrap()
+                .call(request)
+                .await
+                .unwrap();
+            assert_eq!(response.status(), expected_status);
+            if !compare_body {
+                continue;
+            }
+            let read_body = response.into_body().collect().await.unwrap().to_bytes();
+            assert_eq!(body, read_body);
+        }
+    }
+
+    #[testlog(tokio::test)]
+    async fn metrics() {
+        let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key");
+        let req = vec![
+            (uri.clone(), "PUT", "body", StatusCode::OK, false),
+            (uri.clone(), "DELETE", "", StatusCode::OK, false),
+        ];
+        requests_chain(req.into_iter(), |_| token()).await;
+
+        let res = Request::builder()
+            .uri("/metrics")
+            .body(Body::empty())
+            .map(request)
+            .unwrap()
+            .await;
+        assert_eq!(res.status(), StatusCode::OK);
+        let body = res.into_body().collect().await.unwrap().to_bytes();
+        let body = String::from_utf8_lossy(&body);
+        tracing::debug!(%body);
+        // Storage metrics are not gathered for LocalFs
+        if var(REAL_S3_ENV).is_ok() {
+            assert!(body.contains("remote_storage_s3_deleted_objects_total"));
+        }
+        assert!(body.contains("process_threads"));
+    }
+
+    #[testlog(tokio::test)]
+    async fn insert_retrieve_remove() {
+        let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key");
+        let chain = vec![
+            (uri.clone(), "GET", "", StatusCode::NOT_FOUND, false),
+            (uri.clone(), "PUT", "пыщьпыщь", StatusCode::OK, false),
+            (uri.clone(), "GET", "пыщьпыщь", StatusCode::OK, true),
+            (uri.clone(), "DELETE", "", StatusCode::OK, false),
+            (uri, "GET", "", StatusCode::NOT_FOUND, false),
+        ];
+        requests_chain(chain.into_iter(), |_| token()).await;
+    }
+
+    fn delete_prefix_token(uri: &str) -> String {
+        use serde::Serialize;
+        let parts = uri.split("/").collect::<Vec<&str>>();
+        #[derive(Serialize)]
+        struct PrefixClaims {
+            tenant_id: TenantId,
+            timeline_id: Option<TimelineId>,
+            endpoint_id: Option<object_storage::EndpointId>,
+            exp: u64,
+        }
+        let claims = PrefixClaims {
+            tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(),
+            timeline_id: parts.get(2).map(|c| c.parse().unwrap()),
+            endpoint_id: parts.get(3).map(ToString::to_string),
+            exp: u64::MAX,
+        };
+        let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
+        let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
+        jsonwebtoken::encode(&header, &claims, &key).unwrap()
+    }
+
+    // Can't use single digit numbers as they won't be validated as TimelineId and EndpointId
+    #[testlog(tokio::test)]
+    async fn delete_prefix() {
+        let tenant_id =
+            TenantId::from_array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]).to_string();
+        let t2 = TimelineId::from_array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        let t3 = TimelineId::from_array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        let t4 = TimelineId::from_array([4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        let f = |timeline, path| format!("/{tenant_id}/{timeline}{path}");
+        // Why extra slash in string literals? Axum is weird with URIs:
+        // /1/2 and 1/2/ match different routes, thus first yields OK and second NOT_FOUND
+        //  as it matches /tenant/timeline/endpoint, see https://stackoverflow.com/a/75355932
+        // The cost of removing trailing slash is suprisingly hard:
+        // * Add tower dependency with NormalizePath layer
+        // * wrap Router<()> in this layer https://github.com/tokio-rs/axum/discussions/2377
+        // * Rewrite make_service() -> into_make_service()
+        // * Rewrite oneshot() (not available for NormalizePath)
+        // I didn't manage to get it working correctly
+        let chain = vec![
+            // create 1/2/3/4, 1/2/3/5, delete prefix 1/2/3 -> empty
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), // we can override file contents
+            (f(t2, "/3/5"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/3/5"), "GET", "", StatusCode::NOT_FOUND, false),
+            // create 1/2/3/4, 1/2/5/6, delete prefix 1/2/3 -> 1/2/5/6
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/5/6"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/5/6"), "GET", "", StatusCode::OK, false),
+            // create 1/2/3/4, 1/2/7/8, delete prefix 1/2 -> empty
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/7/8"), "PUT", "", StatusCode::OK, false),
+            (f(t2, ""), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/7/8"), "GET", "", StatusCode::NOT_FOUND, false),
+            // create 1/2/3/4, 1/2/5/6, 1/3/8/9, delete prefix 1/2/3 -> 1/2/5/6, 1/3/8/9
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/5/6"), "PUT", "", StatusCode::OK, false),
+            (f(t3, "/8/9"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/5/6"), "GET", "", StatusCode::OK, false),
+            (f(t3, "/8/9"), "GET", "", StatusCode::OK, false),
+            // create 1/4/5/6, delete prefix 1/2 -> 1/3/8/9, 1/4/5/6
+            (f(t4, "/5/6"), "PUT", "", StatusCode::OK, false),
+            (f(t2, ""), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t3, "/8/9"), "GET", "", StatusCode::OK, false),
+            (f(t4, "/5/6"), "GET", "", StatusCode::OK, false),
+            // delete prefix 1 -> empty
+            (format!("/{tenant_id}"), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t3, "/8/9"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t4, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
+        ];
+        requests_chain(chain.into_iter(), delete_prefix_token).await;
+    }
+}
diff --git a/object_storage/src/lib.rs b/object_storage/src/lib.rs
new file mode 100644
index 0000000000..989afd4c25
--- /dev/null
+++ b/object_storage/src/lib.rs
@@ -0,0 +1,344 @@
+use anyhow::Result;
+use axum::extract::{FromRequestParts, Path};
+use axum::response::{IntoResponse, Response};
+use axum::{RequestPartsExt, http::StatusCode, http::request::Parts};
+use axum_extra::TypedHeader;
+use axum_extra::headers::{Authorization, authorization::Bearer};
+use camino::Utf8PathBuf;
+use jsonwebtoken::{DecodingKey, Validation};
+use remote_storage::{GenericRemoteStorage, RemotePath};
+use serde::{Deserialize, Serialize};
+use std::fmt::Display;
+use std::result::Result as StdResult;
+use std::sync::Arc;
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, error};
+use utils::id::{TenantId, TimelineId};
+
+// simplified version of utils::auth::JwtAuth
+pub struct JwtAuth {
+    decoding_key: DecodingKey,
+    validation: Validation,
+}
+
+pub const VALIDATION_ALGO: jsonwebtoken::Algorithm = jsonwebtoken::Algorithm::EdDSA;
+impl JwtAuth {
+    pub fn new(key: &[u8]) -> Result<Self> {
+        Ok(Self {
+            decoding_key: DecodingKey::from_ed_pem(key)?,
+            validation: Validation::new(VALIDATION_ALGO),
+        })
+    }
+
+    pub fn decode<T: serde::de::DeserializeOwned>(&self, token: &str) -> Result<T> {
+        Ok(jsonwebtoken::decode(token, &self.decoding_key, &self.validation).map(|t| t.claims)?)
+    }
+}
+
+fn normalize_key(key: &str) -> StdResult<Utf8PathBuf, String> {
+    let key = clean_utf8(&Utf8PathBuf::from(key));
+    if key.starts_with("..") || key == "." || key == "/" {
+        return Err(format!("invalid key {key}"));
+    }
+    match key.strip_prefix("/").map(Utf8PathBuf::from) {
+        Ok(p) => Ok(p),
+        _ => Ok(key),
+    }
+}
+
+// Copied from path_clean crate with PathBuf->Utf8PathBuf
+fn clean_utf8(path: &camino::Utf8Path) -> Utf8PathBuf {
+    use camino::Utf8Component as Comp;
+    let mut out = Vec::new();
+    for comp in path.components() {
+        match comp {
+            Comp::CurDir => (),
+            Comp::ParentDir => match out.last() {
+                Some(Comp::RootDir) => (),
+                Some(Comp::Normal(_)) => {
+                    out.pop();
+                }
+                None | Some(Comp::CurDir) | Some(Comp::ParentDir) | Some(Comp::Prefix(_)) => {
+                    out.push(comp)
+                }
+            },
+            comp => out.push(comp),
+        }
+    }
+    if !out.is_empty() {
+        out.iter().collect()
+    } else {
+        Utf8PathBuf::from(".")
+    }
+}
+
+pub struct Storage {
+    pub auth: JwtAuth,
+    pub storage: GenericRemoteStorage,
+    pub cancel: CancellationToken,
+    pub max_upload_file_limit: usize,
+}
+
+pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rc
+
+#[derive(Deserialize, Serialize, PartialEq)]
+pub struct Claims {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub endpoint_id: EndpointId,
+    pub exp: u64,
+}
+
+impl Display for Claims {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})",
+            self.tenant_id, self.timeline_id, self.endpoint_id, self.exp
+        )
+    }
+}
+
+#[derive(Deserialize, Serialize)]
+struct KeyRequest {
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    endpoint_id: EndpointId,
+    path: String,
+}
+
+#[derive(Debug, PartialEq)]
+pub struct S3Path {
+    pub path: RemotePath,
+}
+
+impl TryFrom<&KeyRequest> for S3Path {
+    type Error = String;
+    fn try_from(req: &KeyRequest) -> StdResult<Self, Self::Error> {
+        let KeyRequest {
+            tenant_id,
+            timeline_id,
+            endpoint_id,
+            path,
+        } = &req;
+        let prefix = format!("{tenant_id}/{timeline_id}/{endpoint_id}",);
+        let path = Utf8PathBuf::from(prefix).join(normalize_key(path)?);
+        let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative
+        Ok(S3Path { path })
+    }
+}
+
+fn unauthorized(route: impl Display, claims: impl Display) -> Response {
+    debug!(%route, %claims, "route doesn't match claims");
+    StatusCode::UNAUTHORIZED.into_response()
+}
+
+pub fn bad_request(err: impl Display, desc: &'static str) -> Response {
+    debug!(%err, desc);
+    (StatusCode::BAD_REQUEST, err.to_string()).into_response()
+}
+
+pub fn ok() -> Response {
+    StatusCode::OK.into_response()
+}
+
+pub fn internal_error(err: impl Display, path: impl Display, desc: &'static str) -> Response {
+    error!(%err, %path, desc);
+    StatusCode::INTERNAL_SERVER_ERROR.into_response()
+}
+
+pub fn not_found(key: impl ToString) -> Response {
+    (StatusCode::NOT_FOUND, key.to_string()).into_response()
+}
+
+impl FromRequestParts<Arc<Storage>> for S3Path {
+    type Rejection = Response;
+    async fn from_request_parts(
+        parts: &mut Parts,
+        state: &Arc<Storage>,
+    ) -> Result<Self, Self::Rejection> {
+        let Path(path): Path<KeyRequest> = parts
+            .extract()
+            .await
+            .map_err(|e| bad_request(e, "invalid route"))?;
+        let TypedHeader(Authorization(bearer)) = parts
+            .extract::<TypedHeader<Authorization<Bearer>>>()
+            .await
+            .map_err(|e| bad_request(e, "invalid token"))?;
+        let claims: Claims = state
+            .auth
+            .decode(bearer.token())
+            .map_err(|e| bad_request(e, "decoding token"))?;
+        let route = Claims {
+            tenant_id: path.tenant_id,
+            timeline_id: path.timeline_id,
+            endpoint_id: path.endpoint_id.clone(),
+            exp: claims.exp,
+        };
+        if route != claims {
+            return Err(unauthorized(route, claims));
+        }
+        (&path)
+            .try_into()
+            .map_err(|e| bad_request(e, "invalid route"))
+    }
+}
+
+#[derive(Deserialize, Serialize, PartialEq)]
+pub struct PrefixKeyPath {
+    pub tenant_id: TenantId,
+    pub timeline_id: Option<TimelineId>,
+    pub endpoint_id: Option<EndpointId>,
+}
+
+impl Display for PrefixKeyPath {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})",
+            self.tenant_id,
+            self.timeline_id
+                .as_ref()
+                .map(ToString::to_string)
+                .unwrap_or("".to_string()),
+            self.endpoint_id
+                .as_ref()
+                .map(ToString::to_string)
+                .unwrap_or("".to_string())
+        )
+    }
+}
+
+#[derive(Debug, PartialEq)]
+pub struct PrefixS3Path {
+    pub path: RemotePath,
+}
+
+impl From<&PrefixKeyPath> for PrefixS3Path {
+    fn from(path: &PrefixKeyPath) -> Self {
+        let timeline_id = path
+            .timeline_id
+            .as_ref()
+            .map(ToString::to_string)
+            .unwrap_or("".to_string());
+        let endpoint_id = path
+            .endpoint_id
+            .as_ref()
+            .map(ToString::to_string)
+            .unwrap_or("".to_string());
+        let path = Utf8PathBuf::from(path.tenant_id.to_string())
+            .join(timeline_id)
+            .join(endpoint_id);
+        let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative
+        PrefixS3Path { path }
+    }
+}
+
+impl FromRequestParts<Arc<Storage>> for PrefixS3Path {
+    type Rejection = Response;
+    async fn from_request_parts(
+        parts: &mut Parts,
+        state: &Arc<Storage>,
+    ) -> Result<Self, Self::Rejection> {
+        let Path(path) = parts
+            .extract::<Path<PrefixKeyPath>>()
+            .await
+            .map_err(|e| bad_request(e, "invalid route"))?;
+        let TypedHeader(Authorization(bearer)) = parts
+            .extract::<TypedHeader<Authorization<Bearer>>>()
+            .await
+            .map_err(|e| bad_request(e, "invalid token"))?;
+        let claims: PrefixKeyPath = state
+            .auth
+            .decode(bearer.token())
+            .map_err(|e| bad_request(e, "invalid token"))?;
+        if path != claims {
+            return Err(unauthorized(path, claims));
+        }
+        Ok((&path).into())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn normalize_key() {
+        let f = super::normalize_key;
+        assert_eq!(f("hello/world/..").unwrap(), Utf8PathBuf::from("hello"));
+        assert_eq!(
+            f("ololo/1/../../not_ololo").unwrap(),
+            Utf8PathBuf::from("not_ololo")
+        );
+        assert!(f("ololo/1/../../../").is_err());
+        assert!(f(".").is_err());
+        assert!(f("../").is_err());
+        assert!(f("").is_err());
+        assert_eq!(f("/1/2/3").unwrap(), Utf8PathBuf::from("1/2/3"));
+        assert!(f("/1/2/3/../../../").is_err());
+        assert!(f("/1/2/3/../../../../").is_err());
+    }
+
+    const TENANT_ID: TenantId =
+        TenantId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]);
+    const TIMELINE_ID: TimelineId =
+        TimelineId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
+    const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
+
+    #[test]
+    fn s3_path() {
+        let auth = Claims {
+            tenant_id: TENANT_ID,
+            timeline_id: TIMELINE_ID,
+            endpoint_id: ENDPOINT_ID.into(),
+            exp: u64::MAX,
+        };
+        let s3_path = |key| {
+            let path = &format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/{key}");
+            let path = RemotePath::from_string(path).unwrap();
+            S3Path { path }
+        };
+
+        let path = "cache_key".to_string();
+        let mut key_path = KeyRequest {
+            path,
+            tenant_id: auth.tenant_id,
+            timeline_id: auth.timeline_id,
+            endpoint_id: auth.endpoint_id,
+        };
+        assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path));
+
+        key_path.path = "we/can/have/nested/paths".to_string();
+        assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path));
+
+        key_path.path = "../error/hello/../".to_string();
+        assert!(S3Path::try_from(&key_path).is_err());
+    }
+
+    #[test]
+    fn prefix_s3_path() {
+        let mut path = PrefixKeyPath {
+            tenant_id: TENANT_ID,
+            timeline_id: None,
+            endpoint_id: None,
+        };
+        let prefix_path = |s: String| RemotePath::from_string(&s).unwrap();
+        assert_eq!(
+            PrefixS3Path::from(&path).path,
+            prefix_path(format!("{TENANT_ID}"))
+        );
+
+        path.timeline_id = Some(TIMELINE_ID);
+        assert_eq!(
+            PrefixS3Path::from(&path).path,
+            prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}"))
+        );
+
+        path.endpoint_id = Some(ENDPOINT_ID.into());
+        assert_eq!(
+            PrefixS3Path::from(&path).path,
+            prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}"))
+        );
+    }
+}
diff --git a/object_storage/src/main.rs b/object_storage/src/main.rs
new file mode 100644
index 0000000000..40325db19d
--- /dev/null
+++ b/object_storage/src/main.rs
@@ -0,0 +1,65 @@
+//! `object_storage` is a service which provides API for uploading and downloading
+//! files. It is used by compute and control plane for accessing LFC prewarm data.
+//! This service is deployed either as a separate component or as part of compute image
+//! for large computes.
+mod app;
+use anyhow::Context;
+use tracing::info;
+use utils::logging;
+
+//see set()
+const fn max_upload_file_limit() -> usize {
+    100 * 1024 * 1024
+}
+
+#[derive(serde::Deserialize)]
+#[serde(tag = "type")]
+struct Config {
+    listen: std::net::SocketAddr,
+    pemfile: camino::Utf8PathBuf,
+    #[serde(flatten)]
+    storage_config: remote_storage::RemoteStorageConfig,
+    #[serde(default = "max_upload_file_limit")]
+    max_upload_file_limit: usize,
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    logging::init(
+        logging::LogFormat::Plain,
+        logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+        logging::Output::Stdout,
+    )?;
+
+    let config: String = std::env::args().skip(1).take(1).collect();
+    if config.is_empty() {
+        anyhow::bail!("Usage: object_storage config.json")
+    }
+    info!("Reading config from {config}");
+    let config = std::fs::read_to_string(config.clone())?;
+    let config: Config = serde_json::from_str(&config).context("parsing config")?;
+    info!("Reading pemfile from {}", config.pemfile.clone());
+    let pemfile = std::fs::read(config.pemfile.clone())?;
+    info!("Loading public key from {}", config.pemfile.clone());
+    let auth = object_storage::JwtAuth::new(&pemfile)?;
+
+    let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap();
+    info!("listening on {}", listener.local_addr().unwrap());
+
+    let storage = remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?;
+    let cancel = tokio_util::sync::CancellationToken::new();
+    app::check_storage_permissions(&storage, cancel.clone()).await?;
+
+    let proxy = std::sync::Arc::new(object_storage::Storage {
+        auth,
+        storage,
+        cancel: cancel.clone(),
+        max_upload_file_limit: config.max_upload_file_limit,
+    });
+
+    tokio::spawn(utils::signals::signal_handler(cancel.clone()));
+    axum::serve(listener, app::app(proxy))
+        .with_graceful_shutdown(async move { cancel.cancelled().await })
+        .await?;
+    Ok(())
+}
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 9a8494292d..54fecee588 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -31,7 +31,6 @@ use pageserver::{
 };
 use postgres_backend::AuthType;
 use remote_storage::GenericRemoteStorage;
-use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -744,32 +743,7 @@ fn start_pageserver(
         let signal_token = CancellationToken::new();
         let signal_cancel = signal_token.child_token();
 
-        // Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals
-        // even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See:
-        // https://github.com/neondatabase/neon/issues/9740.
-        tokio::spawn(async move {
-            let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
-            let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
-            let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
-
-            loop {
-                let signal = tokio::select! {
-                    _ = sigquit.recv() => {
-                        info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
-                        std::process::exit(111);
-                    }
-                    _ = sigint.recv() => "SIGINT",
-                    _ = sigterm.recv() => "SIGTERM",
-                };
-
-                if !signal_token.is_cancelled() {
-                    info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
-                    signal_token.cancel();
-                } else {
-                    info!("Got signal {signal}. Already shutting down.");
-                }
-            }
-        });
+        tokio::spawn(utils::signals::signal_handler(signal_token));
 
         // Wait for cancellation signal and shut down the pageserver.
         //
diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index d555ee2989..5f5626fb98 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -417,6 +417,19 @@ class NeonLocalCli(AbstractNeonCli):
             cmd.append(f"--instance-id={instance_id}")
         return self.raw_cli(cmd)
 
+    def object_storage_start(self, timeout_in_seconds: int | None = None):
+        cmd = ["object-storage", "start"]
+        if timeout_in_seconds is not None:
+            cmd.append(f"--start-timeout={timeout_in_seconds}s")
+        return self.raw_cli(cmd)
+
+    def object_storage_stop(self, immediate: bool):
+        cmd = ["object-storage", "stop"]
+        if immediate:
+            cmd.extend(["-m", "immediate"])
+        return self.raw_cli(cmd)
+        pass
+
     def pageserver_start(
         self,
         id: int,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5694bf170e..d000dcb69f 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1023,6 +1023,8 @@ class NeonEnvBuilder:
 
             self.env.broker.assert_no_errors()
 
+            self.env.object_storage.assert_no_errors()
+
         try:
             self.overlay_cleanup_teardown()
         except Exception as e:
@@ -1118,6 +1120,8 @@ class NeonEnv:
             pagectl_env_vars["RUST_LOG"] = self.rust_log_override
         self.pagectl = Pagectl(extra_env=pagectl_env_vars, binpath=self.neon_binpath)
 
+        self.object_storage = ObjectStorage(self)
+
         # The URL for the pageserver to use as its control_plane_api config
         if config.storage_controller_port_override is not None:
             log.info(
@@ -1173,6 +1177,7 @@ class NeonEnv:
             },
             "safekeepers": [],
             "pageservers": [],
+            "object_storage": {"port": self.port_distributor.get_port()},
             "generate_local_ssl_certs": self.generate_local_ssl_certs,
         }
 
@@ -1408,6 +1413,8 @@ class NeonEnv:
                 self.storage_controller.on_safekeeper_deploy(sk_id, body)
                 self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active")
 
+        self.object_storage.start(timeout_in_seconds=timeout_in_seconds)
+
     def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True):
         """
         After this method returns, there should be no child processes running.
@@ -1425,6 +1432,8 @@ class NeonEnv:
         except Exception as e:
             raise_later = e
 
+        self.object_storage.stop(immediate=immediate)
+
         # Stop storage controller before pageservers: we don't want it to spuriously
         # detect a pageserver "failure" during test teardown
         self.storage_controller.stop(immediate=immediate)
@@ -2635,6 +2644,26 @@ class NeonStorageController(MetricsGetter, LogUtils):
         self.stop(immediate=True)
 
 
+class ObjectStorage(LogUtils):
+    def __init__(self, env: NeonEnv):
+        service_dir = env.repo_dir / "object_storage"
+        super().__init__(logfile=service_dir / "object_storage.log")
+        self.conf_path = service_dir / "object_storage.json"
+        self.env = env
+
+    def base_url(self):
+        return json.loads(self.conf_path.read_text())["listen"]
+
+    def start(self, timeout_in_seconds: int | None = None):
+        self.env.neon_cli.object_storage_start(timeout_in_seconds)
+
+    def stop(self, immediate: bool = False):
+        self.env.neon_cli.object_storage_stop(immediate)
+
+    def assert_no_errors(self):
+        assert_no_errors(self.logfile, "object_storage", [])
+
+
 class NeonProxiedStorageController(NeonStorageController):
     def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool, use_https: bool):
         super().__init__(env, proxy_port, auth_enabled, use_https)
diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py
index 8bd0662ef8..e6bcdf8e67 100644
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -134,10 +134,11 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
     """
     env = neon_env_builder.init_start()
 
-    # Stop default ps/sk
+    # Stop default services
     env.neon_cli.pageserver_stop(env.pageserver.id)
     env.neon_cli.safekeeper_stop()
     env.neon_cli.storage_controller_stop(False)
+    env.neon_cli.object_storage_stop(False)
     env.neon_cli.storage_broker_stop()
 
     # Keep NeonEnv state up to date, it usually owns starting/stopping services
@@ -179,11 +180,13 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
 
     # Using the single-pageserver shortcut property throws when there are multiple pageservers
     with pytest.raises(AssertionError):
-        _drop = env.pageserver
+        _ = env.pageserver
 
     env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 1)
     env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2)
 
+    env.neon_cli.object_storage_stop(False)
+
     # Stop this to get out of the way of the following `start`
     env.neon_cli.storage_controller_stop(False)
     env.neon_cli.storage_broker_stop()
diff --git a/test_runner/regress/test_object_storage.py b/test_runner/regress/test_object_storage.py
new file mode 100644
index 0000000000..0b1cfa344f
--- /dev/null
+++ b/test_runner/regress/test_object_storage.py
@@ -0,0 +1,56 @@
+from time import time
+
+import pytest
+from aiohttp import ClientSession
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+from jwcrypto import jwk, jwt
+
+
+@pytest.mark.asyncio
+async def test_object_storage_insert_retrieve_delete(neon_simple_env: NeonEnv):
+    """
+    Inserts, retrieves, and deletes test file using a JWT token
+    """
+    env = neon_simple_env
+    ep = env.endpoints.create_start(branch_name="main")
+    tenant_id = str(ep.tenant_id)
+    timeline_id = str(ep.show_timeline_id())
+    endpoint_id = ep.endpoint_id
+
+    key_path = env.repo_dir / "auth_private_key.pem"
+    key = jwk.JWK.from_pem(key_path.read_bytes())
+    claims = {
+        "tenant_id": tenant_id,
+        "timeline_id": timeline_id,
+        "endpoint_id": endpoint_id,
+        "exp": round(time()) + 99,
+    }
+    log.info(f"key path {key_path}\nclaims {claims}")
+    token = jwt.JWT(header={"alg": "EdDSA"}, claims=claims)
+    token.make_signed_token(key)
+    token = token.serialize()
+
+    base_url = env.object_storage.base_url()
+    key = f"http://{base_url}/{tenant_id}/{timeline_id}/{endpoint_id}/key"
+    headers = {"Authorization": f"Bearer {token}"}
+    log.info(f"cache key url {key}")
+    log.info(f"token {token}")
+
+    async with ClientSession(headers=headers) as session:
+        async with session.get(key) as res:
+            assert res.status == 404, f"Non-existing file is present: {res}"
+
+        data = b"cheburash"
+        async with session.put(key, data=data) as res:
+            assert res.status == 200, f"Error writing file: {res}"
+
+        async with session.get(key) as res:
+            read_data = await res.read()
+            assert data == read_data
+
+        async with session.delete(key) as res:
+            assert res.status == 200, f"Error removing file {res}"
+
+        async with session.get(key) as res:
+            assert res.status == 404, f"File was not deleted: {res}"
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 702f4eeccf..0175794a57 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -95,6 +95,7 @@ def test_storage_controller_smoke(
     env.pageservers[1].start()
     for sk in env.safekeepers:
         sk.start()
+    env.object_storage.start()
 
     # The pageservers we started should have registered with the sharding service on startup
     nodes = env.storage_controller.node_list()
@@ -346,6 +347,7 @@ def prepare_onboarding_env(
     env = neon_env_builder.init_configs()
     env.broker.start()
     env.storage_controller.start()
+    env.object_storage.start()
 
     # This is the pageserver where we'll initially create the tenant.  Run it in emergency
     # mode so that it doesn't talk to storage controller, and do not register it.

From a09c933de3de7a57b23f814e013dfa0fbcc1356a Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 8 Apr 2025 12:08:44 -0400
Subject: [PATCH 076/140] test(pageserver): add conditional append test record
 (#11476)

## Problem

For future gc-compaction tests when we support
https://github.com/neondatabase/neon/issues/10395

## Summary of changes

Add a new type of neon test WAL record that is conditionally applied
(i.e., only when image == the specified value). We can use this to mock
the situation where we lose some records in the middle, firing an error,
and see how gc-compaction reacts to it.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/record.rs    | 15 ++++++++++++++
 pageserver/src/tenant.rs             | 29 ++++++++++++++++++++++++++--
 pageserver/src/walredo/apply_neon.rs |  8 ++++++++
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs
index fda504a26e..73516c5220 100644
--- a/libs/pageserver_api/src/record.rs
+++ b/libs/pageserver_api/src/record.rs
@@ -58,6 +58,8 @@ pub enum NeonWalRecord {
         /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
         /// its references in `timeline.rs`.
         will_init: bool,
+        /// Only append the record if the current image is the same as the one specified in this field.
+        only_if: Option<String>,
     },
 }
 
@@ -81,6 +83,17 @@ impl NeonWalRecord {
             append: s.as_ref().to_string(),
             clear: false,
             will_init: false,
+            only_if: None,
+        }
+    }
+
+    #[cfg(feature = "testing")]
+    pub fn wal_append_conditional(s: impl AsRef<str>, only_if: impl AsRef<str>) -> Self {
+        Self::Test {
+            append: s.as_ref().to_string(),
+            clear: false,
+            will_init: false,
+            only_if: Some(only_if.as_ref().to_string()),
         }
     }
 
@@ -90,6 +103,7 @@ impl NeonWalRecord {
             append: s.as_ref().to_string(),
             clear: true,
             will_init: false,
+            only_if: None,
         }
     }
 
@@ -99,6 +113,7 @@ impl NeonWalRecord {
             append: s.as_ref().to_string(),
             clear: true,
             will_init: true,
+            only_if: None,
         }
     }
 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 0c399d4c91..1bfc51d5c8 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -8733,6 +8733,21 @@ mod tests {
                 Lsn(0x20),
                 Value::WalRecord(NeonWalRecord::wal_init("i")),
             ),
+            (
+                get_key(4),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "i")),
+            ),
+            (
+                get_key(5),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_init("1")),
+            ),
+            (
+                get_key(5),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "2")),
+            ),
         ];
         let image1 = vec![(get_key(1), "0x10".into())];
 
@@ -8763,8 +8778,18 @@ mod tests {
 
         // Need to remove the limit of "Neon WAL redo requires base image".
 
-        // assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new());
-        // assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new());
+        assert_eq!(
+            tline.get(get_key(3), Lsn(0x50), &ctx).await?,
+            Bytes::from_static(b"c")
+        );
+        assert_eq!(
+            tline.get(get_key(4), Lsn(0x50), &ctx).await?,
+            Bytes::from_static(b"ij")
+        );
+
+        // Manual testing required: currently, read errors will panic the process in debug mode. So we
+        // cannot enable this assertion in the unit test.
+        // assert!(tline.get(get_key(5), Lsn(0x50), &ctx).await.is_err());
 
         Ok(())
     }
diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs
index 61ae1eb970..a3840f1f6f 100644
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -276,6 +276,7 @@ pub(crate) fn apply_in_neon(
             append,
             clear,
             will_init,
+            only_if,
         } => {
             use bytes::BufMut;
             if *will_init {
@@ -288,6 +289,13 @@ pub(crate) fn apply_in_neon(
             if *clear {
                 page.clear();
             }
+            if let Some(only_if) = only_if {
+                if page != only_if.as_bytes() {
+                    return Err(anyhow::anyhow!(
+                        "the current image does not match the expected image, cannot append"
+                    ));
+                }
+            }
             page.put_slice(append.as_bytes());
         }
     }

From d177654e5f310da5d7068464663c4dbac5a2f276 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 8 Apr 2025 18:57:10 +0200
Subject: [PATCH 077/140] gitignore: add `/artifact_cache` (#11493)

## Problem

This is generated e.g. by `test_historic_storage_formats`, and causes
VSCode to list all the contained files as new.

## Summary of changes

Add `/artifact_cache` to `.gitignore`.
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index a07a65ccef..45eb4dbf0e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+/artifact_cache
 /pg_install
 /target
 /tmp_check

From 7679b63a2c0d8672988aa4f29e741d9ec12de600 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 8 Apr 2025 22:43:27 +0200
Subject: [PATCH 078/140] pageserver: persist stripe size in tenant manifest
 for tenant_import (#11181)

## Problem

`tenant_import`, used to import an existing tenant from remote storage
into a storage controller for support and debugging, assumed
`DEFAULT_STRIPE_SIZE` since this can't be recovered from remote storage.
In #11168, we are changing the stripe size, which will break
`tenant_import`.

Resolves #11175.

## Summary of changes

* Add `stripe_size` to the tenant manifest.
* Add `TenantScanRemoteStorageShard::stripe_size` and return from
`tenant_scan_remote` if present.
* Recover the stripe size during`tenant_import`, or fall back to 32768
(the original default stripe size).
* Add tenant manifest compatibility snapshot:
`2025-04-08-pgv17-tenant-manifest-v1.tar.zst`

There are no cross-version concerns here, since unknown fields are
ignored during deserialization where relevant.
---
 libs/pageserver_api/src/models.rs             |  1 +
 libs/pageserver_api/src/shard.rs              |  6 ++
 pageserver/src/http/routes.rs                 | 15 ++++-
 pageserver/src/tenant.rs                      |  1 +
 .../tenant/remote_timeline_client/manifest.rs | 63 ++++++++++++++++++-
 storage_controller/src/service.rs             | 18 +++++-
 test_runner/regress/test_compatibility.py     |  7 +++
 7 files changed, 105 insertions(+), 6 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 2ffff67688..8186889e10 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1680,6 +1680,7 @@ pub struct SecondaryProgress {
 pub struct TenantScanRemoteStorageShard {
     pub tenant_shard_id: TenantShardId,
     pub generation: Option<u32>,
+    pub stripe_size: Option<ShardStripeSize>,
 }
 
 #[derive(Serialize, Deserialize, Debug, Default)]
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 8386d6e586..abbf4e6432 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -78,6 +78,12 @@ impl Default for ShardStripeSize {
     }
 }
 
+impl std::fmt::Display for ShardStripeSize {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 /// Layout version: for future upgrades where we might change how the key->shard mapping works
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
 pub struct ShardLayout(u8);
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index cf67dc596a..bce590016e 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -67,7 +67,7 @@ use crate::tenant::mgr::{
 };
 use crate::tenant::remote_timeline_client::index::GcCompactionState;
 use crate::tenant::remote_timeline_client::{
-    download_index_part, list_remote_tenant_shards, list_remote_timelines,
+    download_index_part, download_tenant_manifest, list_remote_tenant_shards, list_remote_timelines,
 };
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
@@ -2911,9 +2911,22 @@ async fn tenant_scan_remote_handler(
             };
         }
 
+        let result =
+            download_tenant_manifest(&state.remote_storage, &tenant_shard_id, generation, &cancel)
+                .instrument(info_span!("download_tenant_manifest",
+                            tenant_id=%tenant_shard_id.tenant_id,
+                            shard_id=%tenant_shard_id.shard_slug()))
+                .await;
+        let stripe_size = match result {
+            Ok((manifest, _, _)) => manifest.stripe_size,
+            Err(DownloadError::NotFound) => None,
+            Err(err) => return Err(ApiError::InternalServerError(anyhow!(err))),
+        };
+
         response.shards.push(TenantScanRemoteStorageShard {
             tenant_shard_id,
             generation: generation.into(),
+            stripe_size,
         });
     }
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1bfc51d5c8..900e98d7e9 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4079,6 +4079,7 @@ impl Tenant {
 
         TenantManifest {
             version: LATEST_TENANT_MANIFEST_VERSION,
+            stripe_size: Some(self.get_shard_stripe_size()),
             offloaded_timelines,
         }
     }
diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs
index 0e07acfbc8..7dba4508e2 100644
--- a/pageserver/src/tenant/remote_timeline_client/manifest.rs
+++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs
@@ -1,4 +1,5 @@
 use chrono::NaiveDateTime;
+use pageserver_api::shard::ShardStripeSize;
 use serde::{Deserialize, Serialize};
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -14,6 +15,12 @@ pub struct TenantManifest {
     /// allow release rollbacks.
     pub version: usize,
 
+    /// This tenant's stripe size. This is only advisory, and used to recover tenant data from
+    /// remote storage. The autoritative source is the storage controller. If None, assume the
+    /// original default value of 32768 blocks (256 MB).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stripe_size: Option<ShardStripeSize>,
+
     /// The list of offloaded timelines together with enough information
     /// to not have to actually load them.
     ///
@@ -42,7 +49,12 @@ pub struct OffloadedTimelineManifest {
 
 /// The newest manifest version. This should be incremented on changes, even non-breaking ones. We
 /// do not use deny_unknown_fields, so new fields are not breaking.
-pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;
+///
+/// 1: initial version
+/// 2: +stripe_size
+///
+/// When adding new versions, also add a parse_vX test case below.
+pub const LATEST_TENANT_MANIFEST_VERSION: usize = 2;
 
 impl TenantManifest {
     /// Returns true if the manifests are equal, ignoring the version number. This avoids
@@ -56,10 +68,11 @@ impl TenantManifest {
         // We could alternatively just clone and modify the version here.
         let Self {
             version: _, // ignore version
+            stripe_size,
             offloaded_timelines,
         } = self;
 
-        offloaded_timelines == &other.offloaded_timelines
+        stripe_size == &other.stripe_size && offloaded_timelines == &other.offloaded_timelines
     }
 
     /// Decodes a manifest from JSON.
@@ -89,6 +102,7 @@ mod tests {
          }"#;
         let expected = TenantManifest {
             version: 0,
+            stripe_size: None,
             offloaded_timelines: Vec::new(),
         };
         assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
@@ -104,6 +118,7 @@ mod tests {
          }"#;
         let expected = TenantManifest {
             version: 1,
+            stripe_size: None,
             offloaded_timelines: Vec::new(),
         };
         assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
@@ -130,6 +145,50 @@ mod tests {
          }"#;
         let expected = TenantManifest {
             version: 1,
+            stripe_size: None,
+            offloaded_timelines: vec![
+                OffloadedTimelineManifest {
+                    timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
+                    ancestor_timeline_id: None,
+                    ancestor_retain_lsn: None,
+                    archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
+                },
+                OffloadedTimelineManifest {
+                    timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
+                    ancestor_timeline_id: Some(TimelineId::from_str(
+                        "5c4df612fd159e63c1b7853fe94d97da",
+                    )?),
+                    ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
+                    archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
+                },
+            ],
+        };
+        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
+        Ok(())
+    }
+
+    /// v2 manifests should be parsed, for backwards compatibility.
+    #[test]
+    fn parse_v2() -> anyhow::Result<()> {
+        let json = r#"{
+             "version": 2,
+             "stripe_size": 32768,
+             "offloaded_timelines": [
+                 {
+                     "timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
+                     "archived_at": "2025-03-07T11:07:11.373105434"
+                 },
+                 {
+                     "timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
+                     "ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
+                     "ancestor_retain_lsn": "0/1F79038",
+                     "archived_at": "2025-03-05T11:10:22.257901390"
+                 }
+             ]
+         }"#;
+        let expected = TenantManifest {
+            version: 2,
+            stripe_size: Some(ShardStripeSize(32768)),
             offloaded_timelines: vec![
                 OffloadedTimelineManifest {
                     timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 5e53051727..e4db58cc84 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -6014,9 +6014,21 @@ impl Service {
             .max()
             .expect("We already validated >0 shards");
 
-        // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will
-        // only work if they were using the default stripe size.
-        let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE;
+        // Find the tenant's stripe size. This wasn't always persisted in the tenant manifest, so
+        // fall back to the original default stripe size of 32768 (256 MB) if it's not specified.
+        const ORIGINAL_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(32768);
+        let stripe_size = scan_result
+            .shards
+            .iter()
+            .find(|s| s.tenant_shard_id.shard_count == shard_count && s.generation == generation)
+            .expect("we validated >0 shards above")
+            .stripe_size
+            .unwrap_or_else(|| {
+                if shard_count.count() > 1 {
+                    warn!("unknown stripe size, assuming {ORIGINAL_STRIPE_SIZE}");
+                }
+                ORIGINAL_STRIPE_SIZE
+            });
 
         let (response, waiters) = self
             .do_tenant_create(TenantCreateRequest {
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index ee96daca33..2230bdc666 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -492,6 +492,13 @@ HISTORIC_DATA_SETS = [
         PgVersion.V17,
         "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-02-07-pgv17-nogenerations.tar.zst",
     ),
+    # Tenant manifest v1.
+    HistoricDataSet(
+        "2025-04-08-tenant-manifest-v1",
+        TenantId("c547c28588abf1d7b7139ff1f1158345"),
+        PgVersion.V17,
+        "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-04-08-pgv17-tenant-manifest-v1.tar.zst",
+    ),
 ]
 
 
From c9ca8b7c4a91692b78acd18a8bda422f920a174f Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Wed, 9 Apr 2025 08:14:29 +0300
Subject: [PATCH 079/140] One more fix for unlogged build support in
 DEBUG_COMPARE_LOCAL (#11474)

## Problem

Support of unlogged build in DEBUG_COMPARE_LOCAL.
Neon SMGR treats present of local file as indicator of unlogged
relations.
But it doesn't work in  DEBUG_COMPARE_LOCAL mode.

## Summary of changes

Use INIT_FORKNUM as indicator of unlogged file and create this file
while unlogged index build.

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/pagestore_smgr.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 6d58f4f28f..0a43f3a6a3 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -3565,7 +3565,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		for (int i = 0; i < nblocks; i++)
 		{
 			BlockNumber blkno = blocknum + i;
-			if (!BITMAP_ISSET(read, i))
+			if (!BITMAP_ISSET(read_pages, i))
 				continue;
 
 #if PG_MAJORVERSION_NUM >= 17
@@ -3688,6 +3688,9 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 #ifndef DEBUG_COMPARE_LOCAL
 			/* This is a bit tricky. Check if the relation exists locally */
 			if (mdexists(reln, forknum))
+#else
+			if (mdexists(reln, INIT_FORKNUM))
+#endif
 			{
 				/* It exists locally. Guess it's unlogged then. */
 #if PG_MAJORVERSION_NUM >= 17
@@ -3704,7 +3707,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 				 */
 				return;
 			}
-#endif
 			break;
 
 		case RELPERSISTENCE_PERMANENT:
@@ -3761,6 +3763,9 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 #ifndef DEBUG_COMPARE_LOCAL
 			/* This is a bit tricky. Check if the relation exists locally */
 			if (mdexists(reln, forknum))
+#else
+			if (mdexists(reln, INIT_FORKNUM))
+#endif
 			{
 				/* It exists locally. Guess it's unlogged then. */
 				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
@@ -3774,7 +3779,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 				 */
 				return;
 			}
-#endif
 			break;
 
 		case RELPERSISTENCE_PERMANENT:
@@ -4188,6 +4192,8 @@ neon_start_unlogged_build(SMgrRelation reln)
 #ifndef DEBUG_COMPARE_LOCAL
  	if (!IsParallelWorker())
 		mdcreate(reln, MAIN_FORKNUM, false);
+#else
+	mdcreate(reln, INIT_FORKNUM, false);
 #endif
 }
 
@@ -4266,6 +4272,8 @@ neon_end_unlogged_build(SMgrRelation reln)
 #ifndef DEBUG_COMPARE_LOCAL
 			/* use isRedo == true, so that we drop it immediately */
 			mdunlink(rinfob, forknum, true);
+#else
+			mdunlink(rinfob, INIT_FORKNUM, true);
 #endif
 		}
 	}

From c610f3584df1fa7abe1a18d33bba7e647e33d29c Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 9 Apr 2025 08:52:49 +0200
Subject: [PATCH 080/140] test_runner: tweak `test_create_snapshot` compaction
 (#11495)

## Problem

With the recent improvements to L0 compaction responsiveness,
`test_create_snapshot` now ends up generating 10,000 layer files
(compared to 1,000 in previous snapshots). This increases the snapshot
size by 4x, and significantly slows down tests.

## Summary of changes

Increase the target layer size from 128 KB to 256 KB, and the L0
compaction threshold from 1 to 5. This reduces the layer count from
about 10,000 to 1,000.
---
 test_runner/regress/test_compatibility.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 2230bdc666..e23b1e0bca 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -148,9 +148,9 @@ def test_create_snapshot(
     env = neon_env_builder.init_start(
         initial_tenant_conf={
             # Miniature layers to enable generating non-trivial layer map without writing lots of data.
-            "checkpoint_distance": f"{128 * 1024}",
-            "compaction_threshold": "1",
-            "compaction_target_size": f"{128 * 1024}",
+            "checkpoint_distance": f"{256 * 1024}",
+            "compaction_threshold": "5",
+            "compaction_target_size": f"{256 * 1024}",
         }
     )
     endpoint = env.endpoints.create_start("main")

From cf62017a5b38a417394a9120c8db202127135795 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Wed, 9 Apr 2025 12:33:49 +0400
Subject: [PATCH 081/140] storcon: add https metrics for
 pageservers/safekeepers (#11460)

## Problem
Storcon will not start up if `use_https` is on and there are some
pageservers or safekeepers without https port in the database. Metrics
"how many nodes with https we have in DB" will help us to make sure that
`use_https` may be turned on safely.
- Part of https://github.com/neondatabase/cloud/issues/25526

## Summary of changes
- Add `storage_controller_https_pageserver_nodes`,
`storage_controller_safekeeper_nodes` and
`storage_controller_https_safekeeper_nodes` Prometheus metrics.
---
 storage_controller/src/metrics.rs             |  9 +++++++
 storage_controller/src/node.rs                |  4 ++++
 storage_controller/src/safekeeper.rs          |  3 +++
 storage_controller/src/service.rs             | 24 +++++++++++++++++++
 .../src/service/safekeeper_service.rs         | 15 ++++++++++++
 5 files changed, 55 insertions(+)

diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index ea390df726..5ce2fb65e4 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -44,6 +44,15 @@ pub(crate) struct StorageControllerMetricGroup {
     /// Size of the in-memory map of pageserver_nodes
     pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
 
+    /// Count of how many pageserver nodes from in-memory map have https configured
+    pub(crate) storage_controller_https_pageserver_nodes: measured::Gauge,
+
+    /// Size of the in-memory map of safekeeper_nodes
+    pub(crate) storage_controller_safekeeper_nodes: measured::Gauge,
+
+    /// Count of how many safekeeper nodes from in-memory map have https configured
+    pub(crate) storage_controller_https_safekeeper_nodes: measured::Gauge,
+
     /// Reconciler tasks completed, broken down by success/failure/cancelled
     pub(crate) storage_controller_reconcile_complete:
         measured::CounterVec<ReconcileCompleteLabelGroupSet>,
diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs
index f667514517..e180c49b43 100644
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -89,6 +89,10 @@ impl Node {
         self.scheduling = scheduling
     }
 
+    pub(crate) fn has_https_port(&self) -> bool {
+        self.listen_https_port.is_some()
+    }
+
     /// Does this registration request match `self`?  This is used when deciding whether a registration
     /// request should be allowed to update an existing record with the same node ID.
     pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs
index 3b731acf7e..5a13ef750e 100644
--- a/storage_controller/src/safekeeper.rs
+++ b/storage_controller/src/safekeeper.rs
@@ -89,6 +89,9 @@ impl Safekeeper {
     pub(crate) fn availability(&self) -> SafekeeperState {
         self.availability.clone()
     }
+    pub(crate) fn has_https_port(&self) -> bool {
+        self.listen_https_port.is_some()
+    }
     /// Perform an operation (which is given a [`SafekeeperClient`]) with retries
     #[allow(clippy::too_many_arguments)]
     pub(crate) async fn with_client_retries<T, O, F>(
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index e4db58cc84..c1c2e2c189 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1509,6 +1509,10 @@ impl Service {
             .metrics_group
             .storage_controller_pageserver_nodes
             .set(nodes.len() as i64);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_https_pageserver_nodes
+            .set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
 
         tracing::info!("Loading safekeepers from database...");
         let safekeepers = persistence
@@ -1526,6 +1530,14 @@ impl Service {
         let safekeepers: HashMap<NodeId, Safekeeper> =
             safekeepers.into_iter().map(|n| (n.get_id(), n)).collect();
         tracing::info!("Loaded {} safekeepers from database.", safekeepers.len());
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_safekeeper_nodes
+            .set(safekeepers.len() as i64);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_https_safekeeper_nodes
+            .set(safekeepers.values().filter(|s| s.has_https_port()).count() as i64);
 
         tracing::info!("Loading shards from database...");
         let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?;
@@ -6254,6 +6266,10 @@ impl Service {
             .metrics_group
             .storage_controller_pageserver_nodes
             .set(locked.nodes.len() as i64);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_https_pageserver_nodes
+            .set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64);
 
         locked.scheduler.node_remove(node_id);
 
@@ -6345,6 +6361,10 @@ impl Service {
                     .metrics_group
                     .storage_controller_pageserver_nodes
                     .set(nodes.len() as i64);
+                metrics::METRICS_REGISTRY
+                    .metrics_group
+                    .storage_controller_https_pageserver_nodes
+                    .set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
             }
         }
 
@@ -6569,6 +6589,10 @@ impl Service {
             .metrics_group
             .storage_controller_pageserver_nodes
             .set(locked.nodes.len() as i64);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_https_pageserver_nodes
+            .set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64);
 
         match registration_status {
             RegistrationStatus::New => {
diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index 7f2c63b9af..099d0305ba 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -5,6 +5,7 @@ use std::time::Duration;
 
 use super::safekeeper_reconciler::ScheduleRequest;
 use crate::heartbeater::SafekeeperState;
+use crate::metrics;
 use crate::persistence::{
     DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence,
 };
@@ -590,6 +591,20 @@ impl Service {
                 }
             }
             locked.safekeepers = Arc::new(safekeepers);
+            metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_safekeeper_nodes
+                .set(locked.safekeepers.len() as i64);
+            metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_https_safekeeper_nodes
+                .set(
+                    locked
+                        .safekeepers
+                        .values()
+                        .filter(|s| s.has_https_port())
+                        .count() as i64,
+                );
         }
         Ok(())
     }

From a6ff8ec3d47963616d9cef07421d9319db958e8a Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 9 Apr 2025 10:41:38 +0200
Subject: [PATCH 082/140] storcon: change default stripe size to 16 MB (#11168)

## Problem

The current stripe size of 256 MB is a bit large, and can cause load
imbalances across shards. A stripe size of 16 MB appears more reasonable
to avoid hotspots, although we don't see evidence of this in benchmarks.

Resolves https://github.com/neondatabase/cloud/issues/25634.
Touches https://github.com/neondatabase/cloud/issues/21870.

## Summary of changes

* Change the default stripe size to 16 MB.
* Remove `ShardParameters::DEFAULT_STRIPE_SIZE`, and only use
`pageserver_api::shard::DEFAULT_STRIPE_SIZE`.
* Update a bunch of tests that assumed a certain stripe size.
---
 control_plane/src/bin/neon_local.rs           |   6 +-
 docs/storage_controller.md                    |   2 +-
 libs/pageserver_api/src/keyspace.rs           | 133 ++++++++++--------
 libs/pageserver_api/src/models.rs             |   6 +-
 libs/pageserver_api/src/shard.rs              |   9 +-
 storage_controller/src/compute_hook.rs        |   9 +-
 storage_controller/src/service.rs             |   6 +-
 storage_controller/src/tenant_shard.rs        |  18 +--
 .../regress/test_storage_controller.py        |   2 +-
 9 files changed, 98 insertions(+), 93 deletions(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 99f0d374c1..db9715dc62 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -41,7 +41,7 @@ use pageserver_api::controller_api::{
 use pageserver_api::models::{
     ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
 };
-use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
+use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
 use postgres_backend::AuthType;
 use postgres_connection::parse_host_port;
 use safekeeper_api::membership::SafekeeperGeneration;
@@ -1117,7 +1117,7 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
                         stripe_size: args
                             .shard_stripe_size
                             .map(ShardStripeSize)
-                            .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
+                            .unwrap_or(DEFAULT_STRIPE_SIZE),
                     },
                     placement_policy: args.placement_policy.clone(),
                     config: tenant_conf,
@@ -1430,7 +1430,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                     vec![(parsed.0, parsed.1.unwrap_or(5432))],
                     // If caller is telling us what pageserver to use, this is not a tenant which is
                     // full managed by storage controller, therefore not sharded.
-                    ShardParameters::DEFAULT_STRIPE_SIZE,
+                    DEFAULT_STRIPE_SIZE,
                 )
             } else {
                 // Look up the currently attached location of the tenant, and its striping metadata,
diff --git a/docs/storage_controller.md b/docs/storage_controller.md
index ac4aca4219..d761210033 100644
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -151,7 +151,7 @@ Example body:
 ```
 {
   "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
-  "stripe_size": 32768,
+  "stripe_size": 2048,
   "shards": [
       {"node_id": 344, "shard_number": 0},
       {"node_id": 722, "shard_number": 1},
diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
index e505f23e49..79e3ef553b 100644
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -613,8 +613,7 @@ mod tests {
     use rand::{RngCore, SeedableRng};
 
     use super::*;
-    use crate::models::ShardParameters;
-    use crate::shard::{ShardCount, ShardNumber};
+    use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber, ShardStripeSize};
 
     // Helper function to create a key range.
     //
@@ -964,12 +963,8 @@ mod tests {
     }
     #[test]
     fn sharded_range_relation_gap() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
+        let shard_identity =
+            ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
 
         let range = ShardedRange::new(
             Range {
@@ -985,12 +980,8 @@ mod tests {
 
     #[test]
     fn shard_identity_keyspaces_single_key() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
+        let shard_identity =
+            ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
 
         let range = ShardedRange::new(
             Range {
@@ -1034,12 +1025,8 @@ mod tests {
 
     #[test]
     fn shard_identity_keyspaces_forkno_gap() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
+        let shard_identity =
+            ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
 
         let range = ShardedRange::new(
             Range {
@@ -1061,7 +1048,7 @@ mod tests {
             let shard_identity = ShardIdentity::new(
                 ShardNumber(shard_number),
                 ShardCount::new(4),
-                ShardParameters::DEFAULT_STRIPE_SIZE,
+                DEFAULT_STRIPE_SIZE,
             )
             .unwrap();
 
@@ -1144,37 +1131,44 @@ mod tests {
     /// for a single tenant.
     #[test]
     fn sharded_range_fragment_simple() {
+        const SHARD_COUNT: u8 = 4;
+        const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
+
         let shard_identity = ShardIdentity::new(
             ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
+            ShardCount::new(SHARD_COUNT),
+            ShardStripeSize(STRIPE_SIZE),
         )
         .unwrap();
 
         // A range which we happen to know covers exactly one stripe which belongs to this shard
         let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
+        let mut input_end = input_start;
+        input_end.field6 += STRIPE_SIZE; // field6 is block number
 
         // Ask for stripe_size blocks, we get the whole stripe
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 32768),
-            (32768, vec![(32768, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE),
+            (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
         );
 
         // Ask for more, we still get the whole stripe
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 10000000),
-            (32768, vec![(32768, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, 10 * STRIPE_SIZE),
+            (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
         );
 
         // Ask for target_nblocks of half the stripe size, we get two halves
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 16384),
+            do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE / 2),
             (
-                32768,
+                STRIPE_SIZE,
                 vec![
-                    (16384, input_start..input_start.add(16384)),
-                    (16384, input_start.add(16384)..input_end)
+                    (
+                        STRIPE_SIZE / 2,
+                        input_start..input_start.add(STRIPE_SIZE / 2)
+                    ),
+                    (STRIPE_SIZE / 2, input_start.add(STRIPE_SIZE / 2)..input_end)
                 ]
             )
         );
@@ -1182,40 +1176,53 @@ mod tests {
 
     #[test]
     fn sharded_range_fragment_multi_stripe() {
+        const SHARD_COUNT: u8 = 4;
+        const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
+        const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;
+
         let shard_identity = ShardIdentity::new(
             ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
+            ShardCount::new(SHARD_COUNT),
+            ShardStripeSize(STRIPE_SIZE),
         )
         .unwrap();
 
         // A range which covers multiple stripes, exactly one of which belongs to the current shard.
         let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
+        let mut input_end = input_start;
+        input_end.field6 += RANGE_SIZE; // field6 is block number
+
         // Ask for all the blocks, get a fragment that covers the whole range but reports
         // its size to be just the blocks belonging to our shard.
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 131072),
-            (32768, vec![(32768, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, RANGE_SIZE),
+            (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
         );
 
-        // Ask for a sub-stripe quantity
+        // Ask for a sub-stripe quantity that results in 3 fragments.
+        let limit = STRIPE_SIZE / 3 + 1;
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 16000),
+            do_fragment(input_start, input_end, &shard_identity, limit),
             (
-                32768,
+                STRIPE_SIZE,
                 vec![
-                    (16000, input_start..input_start.add(16000)),
-                    (16000, input_start.add(16000)..input_start.add(32000)),
-                    (768, input_start.add(32000)..input_end),
+                    (limit, input_start..input_start.add(limit)),
+                    (limit, input_start.add(limit)..input_start.add(2 * limit)),
+                    (
+                        STRIPE_SIZE - 2 * limit,
+                        input_start.add(2 * limit)..input_end
+                    ),
                 ]
             )
         );
 
         // Try on a range that starts slightly after our owned stripe
         assert_eq!(
-            do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
-            (32767, vec![(32767, input_start.add(1)..input_end)])
+            do_fragment(input_start.add(1), input_end, &shard_identity, RANGE_SIZE),
+            (
+                STRIPE_SIZE - 1,
+                vec![(STRIPE_SIZE - 1, input_start.add(1)..input_end)]
+            )
         );
     }
 
@@ -1223,32 +1230,40 @@ mod tests {
     /// a previous relation.
     #[test]
     fn sharded_range_fragment_starting_from_logical_size() {
+        const SHARD_COUNT: u8 = 4;
+        const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
+        const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;
+
         let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
+        let mut input_end = Key::from_hex("000000067f00000001000000ae0100000000").unwrap();
+        input_end.field6 += RANGE_SIZE; // field6 is block number
 
         // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
         let shard_identity = ShardIdentity::new(
             ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
+            ShardCount::new(SHARD_COUNT),
+            ShardStripeSize(STRIPE_SIZE),
         )
         .unwrap();
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x10000),
-            (0x8001, vec![(0x8001, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
+            (
+                STRIPE_SIZE + 1,
+                vec![(STRIPE_SIZE + 1, input_start..input_end)]
+            )
         );
 
         // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
         // store all logical sizes)
         let shard_identity = ShardIdentity::new(
             ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
+            ShardCount::new(SHARD_COUNT),
+            ShardStripeSize(STRIPE_SIZE),
         )
         .unwrap();
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x10000),
-            (0x1, vec![(0x1, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
+            (1, vec![(1, input_start..input_end)])
         );
     }
 
@@ -1284,12 +1299,8 @@ mod tests {
         );
 
         // Same, but using a sharded identity
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
+        let shard_identity =
+            ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
         assert_eq!(
             do_fragment(input_start, input_end, &shard_identity, 0x8000),
             (u32::MAX, vec![(u32::MAX, input_start..input_end),])
@@ -1331,7 +1342,7 @@ mod tests {
                 ShardIdentity::new(
                     ShardNumber((prng.next_u32() % shard_count) as u8),
                     ShardCount::new(shard_count as u8),
-                    ShardParameters::DEFAULT_STRIPE_SIZE,
+                    DEFAULT_STRIPE_SIZE,
                 )
                 .unwrap()
             };
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 8186889e10..34a419f2cf 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -26,7 +26,7 @@ use utils::{completion, serde_system_time};
 use crate::config::Ratio;
 use crate::key::{CompactKey, Key};
 use crate::reltag::RelTag;
-use crate::shard::{ShardCount, ShardStripeSize, TenantShardId};
+use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
 
 /// The state of a tenant in this pageserver.
 ///
@@ -438,8 +438,6 @@ pub struct ShardParameters {
 }
 
 impl ShardParameters {
-    pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
-
     pub fn is_unsharded(&self) -> bool {
         self.count.is_unsharded()
     }
@@ -449,7 +447,7 @@ impl Default for ShardParameters {
     fn default() -> Self {
         Self {
             count: ShardCount::new(0),
-            stripe_size: Self::DEFAULT_STRIPE_SIZE,
+            stripe_size: DEFAULT_STRIPE_SIZE,
         }
     }
 }
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index abbf4e6432..feb59f5070 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -92,8 +92,11 @@ const LAYOUT_V1: ShardLayout = ShardLayout(1);
 /// ShardIdentity uses a magic layout value to indicate if it is unusable
 const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
 
-/// Default stripe size in pages: 256MiB divided by 8kiB page size.
-const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
+/// The default stripe size in pages. 16 MiB divided by 8 kiB page size.
+///
+/// A lower stripe size distributes ingest load better across shards, but reduces IO amortization.
+/// 16 MiB appears to be a reasonable balance: <https://github.com/neondatabase/neon/pull/10510>.
+pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(16 * 1024 / 8);
 
 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
 pub enum ShardConfigError {
@@ -543,7 +546,7 @@ mod tests {
             field6: 0x7d06,
         };
 
-        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
+        let shard = key_to_shard_number(ShardCount(10), ShardStripeSize(32768), &key);
         assert_eq!(shard, ShardNumber(8));
     }
 
diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs
index 31ab443ccd..2311cadb36 100644
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -800,7 +800,7 @@ impl ComputeHook {
 
 #[cfg(test)]
 pub(crate) mod tests {
-    use pageserver_api::shard::{ShardCount, ShardNumber};
+    use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber};
     use utils::id::TenantId;
 
     use super::*;
@@ -808,6 +808,7 @@ pub(crate) mod tests {
     #[test]
     fn tenant_updates() -> anyhow::Result<()> {
         let tenant_id = TenantId::generate();
+        let stripe_size = DEFAULT_STRIPE_SIZE;
         let mut tenant_state = ComputeHookTenant::new(
             TenantShardId {
                 tenant_id,
@@ -848,7 +849,7 @@ pub(crate) mod tests {
                 shard_count: ShardCount::new(2),
                 shard_number: ShardNumber(1),
             },
-            stripe_size: ShardStripeSize(32768),
+            stripe_size,
             preferred_az: None,
             node_id: NodeId(1),
         });
@@ -864,7 +865,7 @@ pub(crate) mod tests {
                 shard_count: ShardCount::new(2),
                 shard_number: ShardNumber(0),
             },
-            stripe_size: ShardStripeSize(32768),
+            stripe_size,
             preferred_az: None,
             node_id: NodeId(1),
         });
@@ -874,7 +875,7 @@ pub(crate) mod tests {
             anyhow::bail!("Wrong send result");
         };
         assert_eq!(request.shards.len(), 2);
-        assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
+        assert_eq!(request.stripe_size, Some(stripe_size));
 
         // Simulate successful send
         *guard = Some(ComputeRemoteState {
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index c1c2e2c189..2ef09cd2e3 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -43,7 +43,7 @@ use pageserver_api::models::{
     TimelineInfo, TopTenantShardItem, TopTenantShardsRequest,
 };
 use pageserver_api::shard::{
-    ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
+    DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
 };
 use pageserver_api::upcall_api::{
     ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
@@ -2754,7 +2754,7 @@ impl Service {
                         count: tenant_shard_id.shard_count,
                         // We only import un-sharded or single-sharded tenants, so stripe
                         // size can be made up arbitrarily here.
-                        stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
+                        stripe_size: DEFAULT_STRIPE_SIZE,
                     },
                     placement_policy: Some(placement_policy),
                     config: req.config.tenant_conf,
@@ -7865,7 +7865,7 @@ impl Service {
         // old, persisted stripe size.
         let new_stripe_size = match candidate.id.shard_count.count() {
             0 => panic!("invalid shard count 0"),
-            1 => Some(ShardParameters::DEFAULT_STRIPE_SIZE),
+            1 => Some(DEFAULT_STRIPE_SIZE),
             2.. => None,
         };
 
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 8424c65aba..3a75e96cb2 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -2000,7 +2000,7 @@ pub(crate) mod tests {
     use std::rc::Rc;
 
     use pageserver_api::controller_api::NodeAvailability;
-    use pageserver_api::shard::{ShardCount, ShardNumber};
+    use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber};
     use rand::SeedableRng;
     use rand::rngs::StdRng;
     use utils::id::TenantId;
@@ -2012,6 +2012,7 @@ pub(crate) mod tests {
         let tenant_id = TenantId::generate();
         let shard_number = ShardNumber(0);
         let shard_count = ShardCount::new(1);
+        let stripe_size = DEFAULT_STRIPE_SIZE;
 
         let tenant_shard_id = TenantShardId {
             tenant_id,
@@ -2020,12 +2021,7 @@ pub(crate) mod tests {
         };
         TenantShard::new(
             tenant_shard_id,
-            ShardIdentity::new(
-                shard_number,
-                shard_count,
-                pageserver_api::shard::ShardStripeSize(32768),
-            )
-            .unwrap(),
+            ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(),
             policy,
             None,
         )
@@ -2045,6 +2041,7 @@ pub(crate) mod tests {
         shard_count: ShardCount,
         preferred_az: Option<AvailabilityZone>,
     ) -> Vec<TenantShard> {
+        let stripe_size = DEFAULT_STRIPE_SIZE;
         (0..shard_count.count())
             .map(|i| {
                 let shard_number = ShardNumber(i);
@@ -2056,12 +2053,7 @@ pub(crate) mod tests {
                 };
                 TenantShard::new(
                     tenant_shard_id,
-                    ShardIdentity::new(
-                        shard_number,
-                        shard_count,
-                        pageserver_api::shard::ShardStripeSize(32768),
-                    )
-                    .unwrap(),
+                    ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(),
                     policy.clone(),
                     preferred_az.clone(),
                 )
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 0175794a57..ce73c9a738 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -677,7 +677,7 @@ def test_storage_controller_compute_hook(
     env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2)
     expect = {
         "tenant_id": str(env.initial_tenant),
-        "stripe_size": 32768,
+        "stripe_size": 2048,
         "shards": [
             {"node_id": int(env.pageservers[1].id), "shard_number": 0},
             {"node_id": int(env.pageservers[1].id), "shard_number": 1},

From d2825e72ad0c3b925dda69f8988b4da14ed12922 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 9 Apr 2025 14:17:45 +0200
Subject: [PATCH 083/140] Add is_stopping check around critical macro in
 walreceiver (#11496)

The timeline stopping state is set much earlier than the cancellation
token is fired, so by checking for the stopping state, we can prevent
races with timeline shutdown where we issue a cancellation error but the
cancellation token hasn't been fired yet.

Fix #11427.
---
 .../src/tenant/timeline/walreceiver/walreceiver_connection.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index f41a9cfe82..6bf05a0f86 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -445,7 +445,7 @@ pub(super) async fn handle_walreceiver_connection(
                         .inspect_err(|err| {
                             // TODO: we can't differentiate cancellation errors with
                             // anyhow::Error, so just ignore it if we're cancelled.
-                            if !cancellation.is_cancelled() {
+                            if !cancellation.is_cancelled() && !timeline.is_stopping() {
                                 critical!("{err:?}")
                             }
                         })?;
@@ -577,7 +577,7 @@ pub(super) async fn handle_walreceiver_connection(
                             .inspect_err(|err| {
                                 // TODO: we can't differentiate cancellation errors with
                                 // anyhow::Error, so just ignore it if we're cancelled.
-                                if !cancellation.is_cancelled() {
+                                if !cancellation.is_cancelled() && !timeline.is_stopping() {
                                     critical!("{err:?}")
                                 }
                             })?;

From ef8101a9be3ce80d104943238a7d608561432189 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 9 Apr 2025 15:28:59 +0300
Subject: [PATCH 084/140] refactor: Split "communicator" routines to a separate
 source file (#11459)

pagestore_smgr.c had grown pretty large. Split into two parts, such
that the smgr routines that PostgreSQL code calls stays in
pagestore_smgr.c, and all the prefetching logic and other lower-level
routines related to communicating with the pageserver are moved to a
new source file, "communicator.c".

There are plans to replace communicator parts with a new
implementation. See https://github.com/neondatabase/neon/pull/10799.
This commit doesn't implement any of the new things yet, but it is
good preparation for it. I'm imagining that the new implementation
will approximately replace the current "communicator.c" code, exposing
roughly the same functions to pagestore_smgr.c.

This commit doesn't change any functionality or behavior, or make any
other changes to the existing code: It just moves existing code
around.
---
 pgxn/neon/Makefile           |    1 +
 pgxn/neon/communicator.c     | 2504 ++++++++++++++++++++++++++++++++++
 pgxn/neon/communicator.h     |   48 +
 pgxn/neon/neon.c             |    3 +-
 pgxn/neon/neon.h             |    1 -
 pgxn/neon/pagestore_client.h |    5 +
 pgxn/neon/pagestore_smgr.c   | 2459 +--------------------------------
 7 files changed, 2610 insertions(+), 2411 deletions(-)
 create mode 100644 pgxn/neon/communicator.c
 create mode 100644 pgxn/neon/communicator.h

diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index 8259d24359..426b176af9 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -4,6 +4,7 @@
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
+	communicator.o \
 	extension_server.o \
 	file_cache.o \
 	hll.o \
diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c
new file mode 100644
index 0000000000..932034e22e
--- /dev/null
+++ b/pgxn/neon/communicator.c
@@ -0,0 +1,2504 @@
+/*-------------------------------------------------------------------------
+ *
+ * communicator.c
+ *	  Functions for communicating with remote pageservers.
+ *
+ * This is the so-called "legacy" communicator. It consists of functions that
+ * are called from the smgr implementation, in pagestore_smgr.c. There are
+ * plans to replace this with a different implementation, see RFC.
+ *
+ * The communicator is a collection of functions that are called in each
+ * backend, when the backend needs to read a page or other information. It
+ * does not spawn background threads or anything like that. To process
+ * responses to prefetch requests in a timely fashion, however, it registers
+ * a ProcessInterrupts hook that gets called periodically from any
+ * CHECK_FOR_INTERRUPTS() point in the backend.
+ *
+ * By the time the functions in this file are called, the caller has already
+ * established that a request to the pageserver is necessary. The functions
+ * are only called for permanent relations (i.e. not temp or unlogged tables).
+ * Before making a call to the communicator, the caller has already checked
+ * the relation size or local file cache.
+ *
+ * However, when processing responses to getpage requests, the communicator
+ * writes pages directly to the LFC.
+ *
+ * The communicator functions take request LSNs as arguments; the caller is
+ * responsible for determining the correct LSNs to use. There's one exception
+ * to that, in prefetch_do_request(); it sometimes calls back to
+ * neon_get_request_lsns().  That's because sometimes a suitable response is
+ * found in the prefetch buffer and the request LSns are not needed, and the
+ * caller doesn't know whether it's needed or not.
+ *
+ * The main interface consists of the following "synchronous" calls:
+ *
+ * communicator_exists			- Returns true if a relation file exists
+ * communicator_nblocks			- Returns a relation's size
+ * communicator_dbsize			- Returns a databases's total size
+ * communicator_read_at_lsnv	- Read contents of one relation block
+ * communicator_read_slru_segment - Read contents of one SLRU segment
+ *
+ * In addition, there functions related to prefetching:
+ * communicator_prefetch_register_bufferv - Start prefetching a page
+ * communicator_prefetch_lookupv - Check if a page is already in prefetch queue
+ *
+ * Misc other functions:
+ * - communicator_init			- Initialize the module at startup
+ * - communicator_prefetch_pump_state - Called periodically to advance the state
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xlog.h"
+#include "access/xlogdefs.h"
+#include "access/xlog_internal.h"
+#include "access/xlogutils.h"
+#include "common/hashfn.h"
+#include "executor/instrument.h"
+#include "libpq/pqformat.h"
+#include "miscadmin.h"
+#include "port/pg_iovec.h"
+#include "postmaster/interrupt.h"
+#include "replication/walsender.h"
+#include "utils/timeout.h"
+
+#include "bitmap.h"
+#include "communicator.h"
+#include "file_cache.h"
+#include "neon.h"
+#include "neon_perf_counters.h"
+#include "pagestore_client.h"
+
+#if PG_VERSION_NUM >= 150000
+#include "access/xlogrecovery.h"
+#endif
+
+#if PG_VERSION_NUM < 160000
+typedef PGAlignedBlock PGIOAlignedBlock;
+#endif
+
+#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \
+	neon_shard_log(shard_no, elvl, "Broken connection state: " message, \
+				   ##__VA_ARGS__)
+
+page_server_api *page_server;
+
+static uint32 local_request_counter;
+#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter)
+
+/*
+ * Various settings related to prompt (fast) handling of PageStream responses
+ * at any CHECK_FOR_INTERRUPTS point.
+ */
+int				readahead_getpage_pull_timeout_ms = 0;
+static int		PS_TIMEOUT_ID = 0;
+static bool		timeout_set = false;
+static bool		timeout_signaled = false;
+
+/*
+ * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want
+ * that to handle any getpage responses if we're already working on the
+ * backlog of those, as we'd hit issues with determining which prefetch slot
+ * we just got a response for.
+ *
+ * To protect against that, we have this variable that's set whenever we start
+ * receiving data for prefetch slots, so that we don't get confused.
+ *
+ * Note that in certain error cases during readpage we may leak r_r_g=true,
+ * which results in a failure to pick up further responses until we first
+ * actively try to receive new getpage responses.
+ */
+static bool		readpage_reentrant_guard = false;
+
+static void pagestore_timeout_handler(void);
+
+#define START_PREFETCH_RECEIVE_WORK() \
+	do { \
+		readpage_reentrant_guard = true; \
+	} while (false)
+
+#define END_PREFETCH_RECEIVE_WORK() \
+	do { \
+		readpage_reentrant_guard = false; \
+		if (unlikely(timeout_signaled && !InterruptPending)) \
+			InterruptPending = true; \
+	} while (false)
+
+/*
+ * Prefetch implementation:
+ *
+ * Prefetch is performed locally by each backend.
+ *
+ * There can be up to readahead_buffer_size active IO requests registered at
+ * any time. Requests using smgr_prefetch are sent to the pageserver, but we
+ * don't wait on the response. Requests using smgr_read are either read from
+ * the buffer, or (if that's not possible) we wait on the response to arrive -
+ * this also will allow us to receive other prefetched pages.
+ * Each request is immediately written to the output buffer of the pageserver
+ * connection, but may not be flushed if smgr_prefetch is used: pageserver
+ * flushes sent requests on manual flush, or every neon.flush_output_after
+ * unflushed requests; which is not necessarily always and all the time.
+ *
+ * Once we have received a response, this value will be stored in the response
+ * buffer, indexed in a hash table. This allows us to retain our buffered
+ * prefetch responses even when we have cache misses.
+ *
+ * Reading of prefetch responses is delayed until them are actually needed
+ * (smgr_read). In case of prefetch miss or any other SMGR request other than
+ * smgr_read, all prefetch responses in the pipeline will need to be read from
+ * the connection; the responses are stored for later use.
+ *
+ * NOTE: The current implementation of the prefetch system implements a ring
+ * buffer of up to readahead_buffer_size requests. If there are more _read and
+ * _prefetch requests between the initial _prefetch and the _read of a buffer,
+ * the prefetch request will have been dropped from this prefetch buffer, and
+ * your prefetch was wasted.
+ */
+
+/*
+ * State machine:
+ *
+ * not in hash : in hash
+ *             :
+ * UNUSED ------> REQUESTED --> RECEIVED
+ *   ^         :      |            |
+ *   |         :      v            |
+ *   |         : TAG_REMAINS       |
+ *   |         :      |            |
+ *   +----------------+------------+
+ *             :
+ */
+typedef enum PrefetchStatus
+{
+	PRFS_UNUSED = 0,			/* unused slot */
+	PRFS_REQUESTED,				/* request was written to the sendbuffer to
+								 * PS, but not necessarily flushed. all fields
+								 * except response valid */
+	PRFS_RECEIVED,				/* all fields valid */
+	PRFS_TAG_REMAINS,			/* only buftag and my_ring_index are still
+								 * valid */
+} PrefetchStatus;
+
+/* must fit in uint8; bits 0x1 are used */
+typedef enum {
+	PRFSF_NONE	= 0x0,
+	PRFSF_LFC	= 0x1  /* received prefetch result is stored in LFC */
+} PrefetchRequestFlags;
+
+typedef struct PrefetchRequest
+{
+	BufferTag	buftag;			/* must be first entry in the struct */
+	shardno_t	shard_no;
+	uint8		status;		/* see PrefetchStatus for valid values */
+	uint8		flags;		/* see PrefetchRequestFlags */
+	neon_request_lsns request_lsns;
+	NeonRequestId reqid;
+	NeonResponse *response;		/* may be null */
+	uint64		my_ring_index;
+} PrefetchRequest;
+
+/* prefetch buffer lookup hash table */
+
+typedef struct PrfHashEntry
+{
+	PrefetchRequest *slot;
+	uint32		status;
+	uint32		hash;
+} PrfHashEntry;
+
+#define SH_PREFIX			prfh
+#define SH_ELEMENT_TYPE		PrfHashEntry
+#define SH_KEY_TYPE			PrefetchRequest *
+#define SH_KEY				slot
+#define SH_STORE_HASH
+#define SH_GET_HASH(tb, a)	((a)->hash)
+#define SH_HASH_KEY(tb, key) hash_bytes( \
+	((const unsigned char *) &(key)->buftag), \
+	sizeof(BufferTag) \
+)
+
+#define SH_EQUAL(tb, a, b)	(BufferTagsEqual(&(a)->buftag, &(b)->buftag))
+#define SH_SCOPE			static inline
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+/*
+ * PrefetchState maintains the state of (prefetch) getPage@LSN requests.
+ * It maintains a (ring) buffer of in-flight requests and responses.
+ *
+ * We maintain several indexes into the ring buffer:
+ * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
+ *
+ * ring_unused points to the first unused slot of the buffer
+ * ring_receive is the next request that is to be received
+ * ring_last is the oldest received entry in the buffer
+ *
+ * Apart from being an entry in the ring buffer of prefetch requests, each
+ * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
+ */
+typedef struct PrefetchState
+{
+	MemoryContext bufctx;		/* context for prf_buffer[].response
+								 * allocations */
+	MemoryContext errctx;		/* context for prf_buffer[].response
+								 * allocations */
+	MemoryContext hashctx;		/* context for prf_buffer */
+
+	/* buffer indexes */
+	uint64		ring_unused;	/* first unused slot */
+	uint64		ring_flush;		/* next request to flush */
+	uint64		ring_receive;	/* next slot that is to receive a response */
+	uint64		ring_last;		/* min slot with a response value */
+
+	/* metrics / statistics  */
+	int			n_responses_buffered;	/* count of PS responses not yet in
+										 * buffers */
+	int			n_requests_inflight;	/* count of PS requests considered in
+										 * flight */
+	int			n_unused;		/* count of buffers < unused, > last, that are
+								 * also unused */
+
+	/* the buffers */
+	prfh_hash	*prf_hash;
+	int			max_shard_no;
+	/* Mark shards involved in prefetch */
+	uint8		shard_bitmap[(MAX_SHARDS + 7)/8];
+	PrefetchRequest prf_buffer[];	/* prefetch buffers */
+} PrefetchState;
+
+static PrefetchState *MyPState;
+
+#define GetPrfSlotNoCheck(ring_index) ( \
+	&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
+)
+
+#define GetPrfSlot(ring_index) ( \
+	( \
+		AssertMacro((ring_index) < MyPState->ring_unused && \
+					(ring_index) >= MyPState->ring_last), \
+		GetPrfSlotNoCheck(ring_index) \
+	) \
+)
+
+#define ReceiveBufferNeedsCompaction() (\
+	(MyPState->n_responses_buffered / 8) < ( \
+		MyPState->ring_receive - \
+			MyPState->ring_last - \
+			MyPState->n_responses_buffered \
+	) \
+)
+
+static process_interrupts_callback_t prev_interrupt_cb;
+
+static bool compact_prefetch_buffers(void);
+static void consume_prefetch_responses(void);
+static uint64 prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
+										BlockNumber nblocks, const bits8 *mask,
+										bool is_prefetch);
+static bool prefetch_read(PrefetchRequest *slot);
+static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
+static bool prefetch_wait_for(uint64 ring_index);
+static void prefetch_cleanup_trailing_unused(void);
+static inline void prefetch_set_unused(uint64 ring_index);
+
+static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns,
+										  PrefetchRequest *slot);
+static bool communicator_processinterrupts(void);
+
+void
+pg_init_communicator(void)
+{
+	prev_interrupt_cb = ProcessInterruptsCallback;
+	ProcessInterruptsCallback = communicator_processinterrupts;
+}
+
+static bool
+compact_prefetch_buffers(void)
+{
+	uint64		empty_ring_index = MyPState->ring_last;
+	uint64		search_ring_index = MyPState->ring_receive;
+	int			n_moved = 0;
+
+	if (MyPState->ring_receive == MyPState->ring_last)
+		return false;
+
+	while (search_ring_index > MyPState->ring_last)
+	{
+		search_ring_index--;
+		if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED)
+		{
+			empty_ring_index = search_ring_index;
+			break;
+		}
+	}
+
+	/*
+	 * Here we have established: slots < search_ring_index have an unknown
+	 * state (not scanned) slots >= search_ring_index and <= empty_ring_index
+	 * are unused slots > empty_ring_index are in use, or outside our buffer's
+	 * range. ... unless search_ring_index <= ring_last
+	 *
+	 * Therefore, there is a gap of at least one unused items between
+	 * search_ring_index and empty_ring_index (both inclusive), which grows as
+	 * we hit more unused items while moving backwards through the array.
+	 */
+
+	while (search_ring_index > MyPState->ring_last)
+	{
+		PrefetchRequest *source_slot;
+		PrefetchRequest *target_slot;
+		bool		found;
+
+		/* update search index to an unprocessed entry */
+		search_ring_index--;
+
+		source_slot = GetPrfSlot(search_ring_index);
+
+		if (source_slot->status == PRFS_UNUSED)
+			continue;
+
+		/* slot is used -- start moving slot */
+		target_slot = GetPrfSlot(empty_ring_index);
+
+		Assert(source_slot->status == PRFS_RECEIVED);
+		Assert(target_slot->status == PRFS_UNUSED);
+
+		target_slot->buftag = source_slot->buftag;
+		target_slot->shard_no = source_slot->shard_no;
+		target_slot->status = source_slot->status;
+		target_slot->flags = source_slot->flags;
+		target_slot->response = source_slot->response;
+		target_slot->reqid = source_slot->reqid;
+		target_slot->request_lsns = source_slot->request_lsns;
+		target_slot->my_ring_index = empty_ring_index;
+
+		prfh_delete(MyPState->prf_hash, source_slot);
+		prfh_insert(MyPState->prf_hash, target_slot, &found);
+
+		Assert(!found);
+
+		/* Adjust the location of our known-empty slot */
+		empty_ring_index--;
+
+		/* empty the moved slot */
+		source_slot->status = PRFS_UNUSED;
+		source_slot->buftag = (BufferTag)
+		{
+			0
+		};
+		source_slot->response = NULL;
+		source_slot->my_ring_index = 0;
+		source_slot->request_lsns = (neon_request_lsns) {
+			InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr
+		};
+
+		/* update bookkeeping */
+		n_moved++;
+	}
+
+	/*
+	 * Only when we've moved slots we can expect trailing unused slots, so
+	 * only then we clean up trailing unused slots.
+	 */
+	if (n_moved > 0)
+	{
+		prefetch_cleanup_trailing_unused();
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * If there might be responses still in the TCP buffer, then we should try to
+ * use those, to reduce any TCP backpressure on the OS/PS side.
+ *
+ * This procedure handles that.
+ *
+ * Note that this works because we don't pipeline non-getPage requests.
+ *
+ * NOTE: This procedure is not allowed to throw errors that should be handled
+ * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS
+ * point inside and outside PostgreSQL.
+ *
+ * This still does throw errors when it receives malformed responses from PS.
+ *
+ * When we're not called from CHECK_FOR_INTERRUPTS (indicated by
+ * IsHandlingInterrupts) we also report we've ended prefetch receive work,
+ * just in case state tracking was lost due to an error in the sync getPage
+ * response code.
+ */
+void
+communicator_prefetch_pump_state(bool IsHandlingInterrupts)
+{
+	while (MyPState->ring_receive != MyPState->ring_flush)
+	{
+		NeonResponse   *response;
+		PrefetchRequest *slot;
+		MemoryContext	old;
+
+		slot = GetPrfSlot(MyPState->ring_receive);
+
+		old = MemoryContextSwitchTo(MyPState->errctx);
+		response = page_server->try_receive(slot->shard_no);
+		MemoryContextSwitchTo(old);
+
+		if (response == NULL)
+			break;
+
+		/* The slot should still be valid */
+		if (slot->status != PRFS_REQUESTED ||
+			slot->response != NULL ||
+			slot->my_ring_index != MyPState->ring_receive)
+			neon_shard_log(slot->shard_no, ERROR,
+						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
+						   slot->status, slot->response,
+						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
+
+		/* update prefetch state */
+		MyPState->n_responses_buffered += 1;
+		MyPState->n_requests_inflight -= 1;
+		MyPState->ring_receive += 1;
+		MyNeonCounters->getpage_prefetches_buffered =
+			MyPState->n_responses_buffered;
+
+		/* update slot state */
+		slot->status = PRFS_RECEIVED;
+		slot->response = response;
+
+		if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result)
+		{
+			/*
+			 * Store prefetched result in LFC (please read comments to lfc_prefetch
+			 * explaining why it can be done without holding shared buffer lock
+			 */
+			if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since))
+			{
+				slot->flags |= PRFSF_LFC;
+			}
+		}
+	}
+
+	/* We never pump the prefetch state while handling other pages */
+	if (!IsHandlingInterrupts)
+		END_PREFETCH_RECEIVE_WORK();
+
+	communicator_reconfigure_timeout_if_needed();
+}
+
+void
+readahead_buffer_resize(int newsize, void *extra)
+{
+	uint64		end,
+				nfree = newsize;
+	PrefetchState *newPState;
+	Size		newprfs_size = offsetof(PrefetchState, prf_buffer) +
+		(sizeof(PrefetchRequest) * newsize);
+
+	/* don't try to re-initialize if we haven't initialized yet */
+	if (MyPState == NULL)
+		return;
+
+	/*
+	 * Make sure that we don't lose track of active prefetch requests by
+	 * ensuring we have received all but the last n requests (n = newsize).
+	 */
+	if (MyPState->n_requests_inflight > newsize)
+	{
+		prefetch_wait_for(MyPState->ring_unused - newsize - 1);
+		Assert(MyPState->n_requests_inflight <= newsize);
+	}
+
+	/* construct the new PrefetchState, and copy over the memory contexts */
+	newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
+
+	newPState->bufctx = MyPState->bufctx;
+	newPState->errctx = MyPState->errctx;
+	newPState->hashctx = MyPState->hashctx;
+	newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL);
+	newPState->n_unused = newsize;
+	newPState->n_requests_inflight = 0;
+	newPState->n_responses_buffered = 0;
+	newPState->ring_last = newsize;
+	newPState->ring_unused = newsize;
+	newPState->ring_receive = newsize;
+	newPState->max_shard_no = MyPState->max_shard_no;
+	memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));
+
+	/*
+	 * Copy over the prefetches.
+	 *
+	 * We populate the prefetch array from the end; to retain the most recent
+	 * prefetches, but this has the benefit of only needing to do one
+	 * iteration on the dataset, and trivial compaction.
+	 */
+	for (end = MyPState->ring_unused - 1;
+		 end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
+		 end -= 1)
+	{
+		PrefetchRequest *slot = GetPrfSlot(end);
+		PrefetchRequest *newslot;
+		bool		found;
+
+		if (slot->status == PRFS_UNUSED)
+			continue;
+
+		nfree -= 1;
+
+		newslot = &newPState->prf_buffer[nfree];
+		*newslot = *slot;
+		newslot->my_ring_index = nfree;
+
+		prfh_insert(newPState->prf_hash, newslot, &found);
+
+		Assert(!found);
+
+		switch (newslot->status)
+		{
+			case PRFS_UNUSED:
+				pg_unreachable();
+			case PRFS_REQUESTED:
+				newPState->n_requests_inflight += 1;
+				newPState->ring_receive -= 1;
+				newPState->ring_last -= 1;
+				break;
+			case PRFS_RECEIVED:
+				newPState->n_responses_buffered += 1;
+				newPState->ring_last -= 1;
+				break;
+			case PRFS_TAG_REMAINS:
+				newPState->ring_last -= 1;
+				break;
+		}
+		newPState->n_unused -= 1;
+	}
+	newPState->ring_flush = newPState->ring_receive;
+
+	MyNeonCounters->getpage_prefetches_buffered =
+		MyPState->n_responses_buffered;
+	MyNeonCounters->pageserver_open_requests =
+		MyPState->n_requests_inflight;
+
+	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
+	{
+		PrefetchRequest *slot = GetPrfSlot(end);
+		Assert(slot->status != PRFS_REQUESTED);
+		if (slot->status == PRFS_RECEIVED)
+		{
+			pfree(slot->response);
+		}
+	}
+
+	prfh_destroy(MyPState->prf_hash);
+	pfree(MyPState);
+	MyPState = newPState;
+}
+
+
+
+/*
+ * Make sure that there are no responses still in the buffer.
+ *
+ * This function may indirectly update MyPState->pfs_hash; which invalidates
+ * any active pointers into the hash table.
+ */
+static void
+consume_prefetch_responses(void)
+{
+	if (MyPState->ring_receive < MyPState->ring_unused)
+		prefetch_wait_for(MyPState->ring_unused - 1);
+}
+
+static void
+prefetch_cleanup_trailing_unused(void)
+{
+	uint64		ring_index;
+	PrefetchRequest *slot;
+
+	while (MyPState->ring_last < MyPState->ring_receive)
+	{
+		ring_index = MyPState->ring_last;
+		slot = GetPrfSlot(ring_index);
+
+		if (slot->status == PRFS_UNUSED)
+			MyPState->ring_last += 1;
+		else
+			break;
+	}
+}
+
+
+static bool
+prefetch_flush_requests(void)
+{
+	for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++)
+	{
+		if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no))
+		{
+			if (!page_server->flush(shard_no))
+				return false;
+			BITMAP_CLR(MyPState->shard_bitmap, shard_no);
+		}
+	}
+	MyPState->max_shard_no = 0;
+	return true;
+}
+
+/*
+ * Wait for slot of ring_index to have received its response.
+ * The caller is responsible for making sure the request buffer is flushed.
+ *
+ * NOTE: this function may indirectly update MyPState->pfs_hash; which
+ * invalidates any active pointers into the hash table.
+ * NOTE: callers should make sure they can handle query cancellations in this
+ * function's call path.
+ */
+static bool
+prefetch_wait_for(uint64 ring_index)
+{
+	PrefetchRequest *entry;
+	bool		result = true;
+
+	if (MyPState->ring_flush <= ring_index &&
+		MyPState->ring_unused > MyPState->ring_flush)
+	{
+		if (!prefetch_flush_requests())
+			return false;
+		MyPState->ring_flush = MyPState->ring_unused;
+	}
+
+	Assert(MyPState->ring_unused > ring_index);
+
+	while (MyPState->ring_receive <= ring_index)
+	{
+		START_PREFETCH_RECEIVE_WORK();
+		entry = GetPrfSlot(MyPState->ring_receive);
+
+		Assert(entry->status == PRFS_REQUESTED);
+		if (!prefetch_read(entry))
+		{
+			result = false;
+			break;
+		}
+
+		END_PREFETCH_RECEIVE_WORK();
+		CHECK_FOR_INTERRUPTS();
+	}
+
+	return result;
+}
+
+/*
+ * Read the response of a prefetch request into its slot.
+ *
+ * The caller is responsible for making sure that the request for this buffer
+ * was flushed to the PageServer.
+ *
+ * NOTE: this function may indirectly update MyPState->pfs_hash; which
+ * invalidates any active pointers into the hash table.
+ *
+ * NOTE: this does IO, and can get canceled out-of-line.
+ */
+static bool
+prefetch_read(PrefetchRequest *slot)
+{
+	NeonResponse *response;
+	MemoryContext old;
+	BufferTag	buftag;
+	shardno_t	shard_no;
+	uint64		my_ring_index;
+
+	Assert(slot->status == PRFS_REQUESTED);
+	Assert(slot->response == NULL);
+	Assert(slot->my_ring_index == MyPState->ring_receive);
+
+	if (slot->status != PRFS_REQUESTED ||
+		slot->response != NULL ||
+		slot->my_ring_index != MyPState->ring_receive)
+		neon_shard_log(slot->shard_no, ERROR,
+					   "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
+					   slot->status, slot->response,
+					   (long)slot->my_ring_index, (long)MyPState->ring_receive);
+
+	/*
+	 * Copy the request info so that if an error happens and the prefetch
+	 * queue is flushed during the receive call, we can print the original
+	 * values in the error message
+	 */
+	buftag = slot->buftag;
+	shard_no = slot->shard_no;
+	my_ring_index = slot->my_ring_index;
+
+	old = MemoryContextSwitchTo(MyPState->errctx);
+	response = (NeonResponse *) page_server->receive(shard_no);
+	MemoryContextSwitchTo(old);
+	if (response)
+	{
+		/* The slot should still be valid */
+		if (slot->status != PRFS_REQUESTED ||
+			slot->response != NULL ||
+			slot->my_ring_index != MyPState->ring_receive)
+			neon_shard_log(shard_no, ERROR,
+						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
+						   slot->status, slot->response,
+						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
+
+		/* update prefetch state */
+		MyPState->n_responses_buffered += 1;
+		MyPState->n_requests_inflight -= 1;
+		MyPState->ring_receive += 1;
+		MyNeonCounters->getpage_prefetches_buffered =
+			MyPState->n_responses_buffered;
+
+		/* update slot state */
+		slot->status = PRFS_RECEIVED;
+		slot->response = response;
+
+		if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result)
+		{
+			/*
+			 * Store prefetched result in LFC (please read comments to lfc_prefetch
+			 * explaining why it can be done without holding shared buffer lock
+			 */
+			if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since))
+			{
+				slot->flags |= PRFSF_LFC;
+			}
+		}
+		return true;
+	}
+	else
+	{
+		/*
+		 * Note: The slot might no longer be valid, if the connection was lost
+		 * and the prefetch queue was flushed during the receive call
+		 */
+		neon_shard_log(shard_no, LOG,
+					   "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
+					   (long) my_ring_index,
+					   RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
+					   buftag.forkNum, buftag.blockNum);
+		return false;
+	}
+}
+
+/*
+ * Disconnect hook - drop prefetches when the connection drops
+ *
+ * If we don't remove the failed prefetches, we'd be serving incorrect
+ * data to the smgr.
+ */
+void
+prefetch_on_ps_disconnect(void)
+{
+	MyPState->ring_flush = MyPState->ring_unused;
+
+	while (MyPState->ring_receive < MyPState->ring_unused)
+	{
+		PrefetchRequest *slot;
+		uint64		ring_index = MyPState->ring_receive;
+
+		slot = GetPrfSlot(ring_index);
+
+		Assert(slot->status == PRFS_REQUESTED);
+		Assert(slot->my_ring_index == ring_index);
+
+		/*
+		 * Drop connection to all shards which have prefetch requests.
+		 * It is not a problem to call disconnect multiple times on the same connection
+		 * because disconnect implementation in libpagestore.c will check if connection
+		 * is alive and do nothing of connection was already dropped.
+		 */
+		page_server->disconnect(slot->shard_no);
+
+		/* clean up the request */
+		slot->status = PRFS_TAG_REMAINS;
+		MyPState->n_requests_inflight -= 1;
+		MyPState->ring_receive += 1;
+
+		prefetch_set_unused(ring_index);
+		pgBufferUsage.prefetch.expired += 1;
+		MyNeonCounters->getpage_prefetch_discards_total += 1;
+	}
+
+	/*
+	 * We can have gone into retry due to network error, so update stats with
+	 * the latest available
+	 */
+	MyNeonCounters->pageserver_open_requests =
+		MyPState->n_requests_inflight;
+	MyNeonCounters->getpage_prefetches_buffered =
+		MyPState->n_responses_buffered;
+}
+
+/*
+ * prefetch_set_unused() - clear a received prefetch slot
+ *
+ * The slot at ring_index must be a current member of the ring buffer,
+ * and may not be in the PRFS_REQUESTED state.
+ *
+ * NOTE: this function will update MyPState->pfs_hash; which invalidates any
+ * active pointers into the hash table.
+ */
+static inline void
+prefetch_set_unused(uint64 ring_index)
+{
+	PrefetchRequest *slot;
+
+	if (ring_index < MyPState->ring_last)
+		return;					/* Should already be unused */
+
+	slot = GetPrfSlot(ring_index);
+	if (slot->status == PRFS_UNUSED)
+		return;
+
+	Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS);
+
+	if (slot->status == PRFS_RECEIVED)
+	{
+		pfree(slot->response);
+		slot->response = NULL;
+
+		MyPState->n_responses_buffered -= 1;
+		MyPState->n_unused += 1;
+
+		MyNeonCounters->getpage_prefetches_buffered =
+			MyPState->n_responses_buffered;
+	}
+	else
+	{
+		Assert(slot->response == NULL);
+	}
+
+	prfh_delete(MyPState->prf_hash, slot);
+
+	/* clear all fields */
+	MemSet(slot, 0, sizeof(PrefetchRequest));
+	slot->status = PRFS_UNUSED;
+
+	/* run cleanup if we're holding back ring_last */
+	if (MyPState->ring_last == ring_index)
+		prefetch_cleanup_trailing_unused();
+
+	/*
+	 * ... and try to store the buffered responses more compactly if > 12.5%
+	 * of the buffer is gaps
+	 */
+	else if (ReceiveBufferNeedsCompaction())
+		compact_prefetch_buffers();
+}
+
+/*
+ * Send one prefetch request to the pageserver. To wait for the response, call
+ * prefetch_wait_for().
+ */
+static void
+prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
+{
+	bool		found;
+	uint64		mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index;
+
+	NeonGetPageRequest request = {
+		.hdr.tag = T_NeonGetPageRequest,
+		.hdr.reqid = GENERATE_REQUEST_ID(),
+		/* lsn and not_modified_since are filled in below */
+		.rinfo = BufTagGetNRelFileInfo(slot->buftag),
+		.forknum = slot->buftag.forkNum,
+		.blkno = slot->buftag.blockNum,
+	};
+
+	Assert(mySlotNo == MyPState->ring_unused);
+
+	slot->reqid = request.hdr.reqid;
+
+	if (force_request_lsns)
+		slot->request_lsns = *force_request_lsns;
+	else
+		neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
+							  slot->buftag.forkNum, slot->buftag.blockNum,
+							  &slot->request_lsns, 1);
+	request.hdr.lsn = slot->request_lsns.request_lsn;
+	request.hdr.not_modified_since = slot->request_lsns.not_modified_since;
+
+	Assert(slot->response == NULL);
+	Assert(slot->my_ring_index == MyPState->ring_unused);
+
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
+	{
+		Assert(mySlotNo == MyPState->ring_unused);
+		/* loop */
+	}
+
+	/* update prefetch state */
+	MyPState->n_requests_inflight += 1;
+	MyPState->n_unused -= 1;
+	MyPState->ring_unused += 1;
+	BITMAP_SET(MyPState->shard_bitmap, slot->shard_no);
+	MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);
+
+	/* update slot state */
+	slot->status = PRFS_REQUESTED;
+	prfh_insert(MyPState->prf_hash, slot, &found);
+	Assert(!found);
+}
+
+/*
+ * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted.
+ * Present pages are marked in "mask" bitmap and total number of such pages is returned.
+ */
+int
+communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum,
+							  neon_request_lsns *lsns, BlockNumber nblocks,
+							  void **buffers, bits8 *mask)
+{
+	int hits = 0;
+	PrefetchRequest hashkey;
+
+	/*
+	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
+	 * correct alignment and that the padding bytes are cleared.
+	 */
+	memset(&hashkey.buftag, 0, sizeof(BufferTag));
+	CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo);
+	hashkey.buftag.forkNum = forknum;
+
+	for (int i = 0; i < nblocks; i++)
+	{
+		PrfHashEntry *entry;
+
+		hashkey.buftag.blockNum = blocknum + i;
+		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
+
+		if (entry != NULL)
+		{
+			PrefetchRequest *slot = entry->slot;
+			uint64 ring_index = slot->my_ring_index;
+			Assert(slot == GetPrfSlot(ring_index));
+
+			Assert(slot->status != PRFS_UNUSED);
+			Assert(MyPState->ring_last <= ring_index &&
+				   ring_index < MyPState->ring_unused);
+			Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag));
+
+			if (slot->status != PRFS_RECEIVED)
+				continue;
+
+			/*
+			 * If the caller specified a request LSN to use, only accept
+			 * prefetch responses that satisfy that request.
+			 */
+			if (!neon_prefetch_response_usable(&lsns[i], slot))
+				continue;
+
+			/*
+			 * Ignore errors
+			 */
+			if (slot->response->tag != T_NeonGetPageResponse)
+			{
+				if (slot->response->tag != T_NeonErrorResponse)
+				{
+					NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
+											"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
+											T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag);
+				}
+				continue;
+			}
+			memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ);
+
+
+			/*
+			 * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received
+			 * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here
+			 * under buffer lock.
+			 */
+			if (!lfc_store_prefetch_result)
+				lfc_write(rinfo, forknum, blocknum + i, buffers[i]);
+
+			prefetch_set_unused(ring_index);
+			BITMAP_SET(mask, i);
+
+			hits += 1;
+			inc_getpage_wait(0);
+		}
+	}
+	pgBufferUsage.prefetch.hits += hits;
+	return hits;
+}
+
+/*
+ * prefetch_register_bufferv() - register and prefetch buffers
+ *
+ * Register that we may want the contents of BufferTag in the near future.
+ * This is used when issuing a speculative prefetch request, but also when
+ * performing a synchronous request and need the buffer right now.
+ *
+ * If force_request_lsns is not NULL, those values are sent to the
+ * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
+ * to calculate the LSNs to send.
+ *
+ * Bits set in *mask (if present) indicate pages already read; i.e. pages we
+ * can skip in this process.
+ *
+ * When performing a prefetch rather than a synchronous request,
+ * is_prefetch==true. Currently, it only affects how the request is accounted
+ * in the perf counters.
+ *
+ * NOTE: this function may indirectly update MyPState->pfs_hash; which
+ * invalidates any active pointers into the hash table.
+ */
+void
+communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
+									   BlockNumber nblocks, const bits8 *mask)
+{
+	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;
+
+	ring_index = prefetch_register_bufferv(tag, frlsns, nblocks, mask, true);
+
+	Assert(ring_index < MyPState->ring_unused &&
+		   MyPState->ring_last <= ring_index);
+}
+
+/* internal version. Returns the ring index */
+static uint64
+prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
+						  BlockNumber nblocks, const bits8 *mask,
+						  bool is_prefetch)
+{
+	uint64		min_ring_index;
+	PrefetchRequest hashkey;
+#ifdef USE_ASSERT_CHECKING
+	bool		any_hits = false;
+#endif
+	/* We will never read further ahead than our buffer can store. */
+	nblocks = Max(1, Min(nblocks, readahead_buffer_size));
+
+	/*
+	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
+	 * correct alignment and that the padding bytes are cleared.
+	 */
+	memset(&hashkey.buftag, 0, sizeof(BufferTag));
+	hashkey.buftag = tag;
+
+Retry:
+	/*
+	 * We can have gone into retry due to network error, so update stats with
+	 * the latest available
+	 */
+	MyNeonCounters->pageserver_open_requests =
+		MyPState->ring_unused - MyPState->ring_receive;
+	MyNeonCounters->getpage_prefetches_buffered =
+		MyPState->n_responses_buffered;
+
+	min_ring_index = UINT64_MAX;
+	for (int i = 0; i < nblocks; i++)
+	{
+		PrefetchRequest *slot = NULL;
+		PrfHashEntry *entry = NULL;
+		uint64		ring_index;
+		neon_request_lsns *lsns;
+
+		if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
+			continue;
+
+		if (frlsns)
+			lsns = &frlsns[i];
+		else
+			lsns = NULL;
+
+#ifdef USE_ASSERT_CHECKING
+		any_hits = true;
+#endif
+
+		slot = NULL;
+		entry = NULL;
+
+		hashkey.buftag.blockNum = tag.blockNum + i;
+		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
+
+		if (entry != NULL)
+		{
+			slot = entry->slot;
+			ring_index = slot->my_ring_index;
+			Assert(slot == GetPrfSlot(ring_index));
+
+			Assert(slot->status != PRFS_UNUSED);
+			Assert(MyPState->ring_last <= ring_index &&
+				   ring_index < MyPState->ring_unused);
+			Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag));
+
+			/*
+			 * If the caller specified a request LSN to use, only accept
+			 * prefetch responses that satisfy that request.
+			 */
+			if (!is_prefetch)
+			{
+				if (!neon_prefetch_response_usable(lsns, slot))
+				{
+					/* Wait for the old request to finish and discard it */
+					if (!prefetch_wait_for(ring_index))
+						goto Retry;
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+					slot = NULL;
+					pgBufferUsage.prefetch.expired += 1;
+					MyNeonCounters->getpage_prefetch_discards_total += 1;
+				}
+			}
+
+			if (entry != NULL)
+			{
+				/*
+				 * We received a prefetch for a page that was recently read
+				 * and removed from the buffers. Remove that request from the
+				 * buffers.
+				 */
+				if (slot->status == PRFS_TAG_REMAINS)
+				{
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+					slot = NULL;
+				}
+				else
+				{
+					min_ring_index = Min(min_ring_index, ring_index);
+					/* The buffered request is good enough, return that index */
+					if (is_prefetch)
+						pgBufferUsage.prefetch.duplicates++;
+					continue;
+				}
+			}
+		}
+		else if (!is_prefetch)
+		{
+			pgBufferUsage.prefetch.misses += 1;
+			MyNeonCounters->getpage_prefetch_misses_total++;
+		}
+		/*
+		 * We can only leave the block above by finding that there's
+		 * no entry that can satisfy this request, either because there
+		 * was no entry, or because the entry was invalid or didn't satisfy
+		 * the LSNs provided.
+		 *
+		 * The code should've made sure to clear up the data.
+		 */
+		Assert(entry == NULL);
+		Assert(slot == NULL);
+
+		/* There should be no buffer overflow */
+		Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused);
+
+		/*
+		 * If the prefetch queue is full, we need to make room by clearing the
+		 * oldest slot. If the oldest slot holds a buffer that was already
+		 * received, we can just throw it away; we fetched the page
+		 * unnecessarily in that case. If the oldest slot holds a request that
+		 * we haven't received a response for yet, we have to wait for the
+		 * response to that before we can continue. We might not have even
+		 * flushed the request to the pageserver yet, it might be just sitting
+		 * in the output buffer. In that case, we flush it and wait for the
+		 * response. (We could decide not to send it, but it's hard to abort
+		 * when the request is already in the output buffer, and 'not sending'
+		 * a prefetch request kind of goes against the principles of
+		 * prefetching)
+		 */
+		if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused)
+		{
+			uint64		cleanup_index = MyPState->ring_last;
+
+			slot = GetPrfSlot(cleanup_index);
+
+			Assert(slot->status != PRFS_UNUSED);
+
+			/*
+			 * If there is good reason to run compaction on the prefetch buffers,
+			 * try to do that.
+			 */
+			if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers())
+			{
+				Assert(slot->status == PRFS_UNUSED);
+			}
+			else
+			{
+				/*
+				 * We have the slot for ring_last, so that must still be in
+				 * progress
+				 */
+				switch (slot->status)
+				{
+					case PRFS_REQUESTED:
+						Assert(MyPState->ring_receive == cleanup_index);
+						if (!prefetch_wait_for(cleanup_index))
+							goto Retry;
+						prefetch_set_unused(cleanup_index);
+						pgBufferUsage.prefetch.expired += 1;
+						MyNeonCounters->getpage_prefetch_discards_total += 1;
+						break;
+					case PRFS_RECEIVED:
+					case PRFS_TAG_REMAINS:
+						prefetch_set_unused(cleanup_index);
+						pgBufferUsage.prefetch.expired += 1;
+						MyNeonCounters->getpage_prefetch_discards_total += 1;
+						break;
+					default:
+						pg_unreachable();
+				}
+			}
+		}
+
+		/*
+		 * The next buffer pointed to by `ring_unused` is now definitely empty, so
+		 * we can insert the new request to it.
+		 */
+		ring_index = MyPState->ring_unused;
+
+		Assert(MyPState->ring_last <= ring_index &&
+			   ring_index <= MyPState->ring_unused);
+
+		slot = GetPrfSlotNoCheck(ring_index);
+
+		Assert(slot->status == PRFS_UNUSED);
+
+		/*
+		 * We must update the slot data before insertion, because the hash
+		 * function reads the buffer tag from the slot.
+		 */
+		slot->buftag = hashkey.buftag;
+		slot->shard_no = get_shard_number(&tag);
+		slot->my_ring_index = ring_index;
+		slot->flags = 0;
+
+		min_ring_index = Min(min_ring_index, ring_index);
+
+		if (is_prefetch)
+			MyNeonCounters->getpage_prefetch_requests_total++;
+		else
+			MyNeonCounters->getpage_sync_requests_total++;
+
+		prefetch_do_request(slot, lsns);
+	}
+
+	MyNeonCounters->pageserver_open_requests =
+		MyPState->ring_unused - MyPState->ring_receive;
+
+	Assert(any_hits);
+
+	Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED ||
+		   GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED);
+	Assert(MyPState->ring_last <= min_ring_index &&
+		   min_ring_index < MyPState->ring_unused);
+
+	if (flush_every_n_requests > 0 &&
+		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
+	{
+		if (!prefetch_flush_requests())
+		{
+			/*
+			 * Prefetch set is reset in case of error, so we should try to
+			 * register our request once again
+			 */
+			goto Retry;
+		}
+		MyPState->ring_flush = MyPState->ring_unused;
+	}
+
+	return min_ring_index;
+}
+
+static bool
+equal_requests(NeonRequest* a, NeonRequest* b)
+{
+	return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since;
+}
+
+
+/*
+ * Note: this function can get canceled and use a long jump to the next catch
+ * context. Take care.
+ */
+static NeonResponse *
+page_server_request(void const *req)
+{
+	NeonResponse *resp;
+	BufferTag tag = {0};
+	shardno_t shard_no;
+
+	switch (messageTag(req))
+	{
+		case T_NeonExistsRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
+			break;
+		case T_NeonNblocksRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
+			break;
+		case T_NeonDbSizeRequest:
+			NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
+			break;
+		case T_NeonGetPageRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
+			tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
+			break;
+		default:
+			neon_log(ERROR, "Unexpected request tag: %d", messageTag(req));
+	}
+	shard_no = get_shard_number(&tag);
+
+	/*
+	 * Current sharding model assumes that all metadata is present only at shard 0.
+	 * We still need to call get_shard_no() to check if shard map is up-to-date.
+	 */
+	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest)
+	{
+		shard_no = 0;
+	}
+
+	do
+	{
+		PG_TRY();
+		{
+			while (!page_server->send(shard_no, (NeonRequest *) req)
+				   || !page_server->flush(shard_no))
+			{
+				/* do nothing */
+			}
+			MyNeonCounters->pageserver_open_requests++;
+			consume_prefetch_responses();
+			resp = page_server->receive(shard_no);
+			MyNeonCounters->pageserver_open_requests--;
+		}
+		PG_CATCH();
+		{
+			/*
+			 * Cancellation in this code needs to be handled better at some
+			 * point, but this currently seems fine for now.
+			 */
+			page_server->disconnect(shard_no);
+			MyNeonCounters->pageserver_open_requests = 0;
+
+			/*
+			 * We know for sure we're not working on any prefetch pages after
+			 * this.
+			 */
+			END_PREFETCH_RECEIVE_WORK();
+
+			PG_RE_THROW();
+		}
+		PG_END_TRY();
+
+	} while (resp == NULL);
+
+	return resp;
+}
+
+
+StringInfoData
+nm_pack_request(NeonRequest *msg)
+{
+	StringInfoData s;
+
+	initStringInfo(&s);
+
+	pq_sendbyte(&s, msg->tag);
+	if (neon_protocol_version >= 3)
+	{
+		pq_sendint64(&s, msg->reqid);
+	}
+	pq_sendint64(&s, msg->lsn);
+	pq_sendint64(&s, msg->not_modified_since);
+
+	switch (messageTag(msg))
+	{
+			/* pagestore_client -> pagestore */
+		case T_NeonExistsRequest:
+			{
+				NeonExistsRequest *msg_req = (NeonExistsRequest *) msg;
+
+				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
+				pq_sendbyte(&s, msg_req->forknum);
+
+				break;
+			}
+		case T_NeonNblocksRequest:
+			{
+				NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg;
+
+				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
+				pq_sendbyte(&s, msg_req->forknum);
+
+				break;
+			}
+		case T_NeonDbSizeRequest:
+			{
+				NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;
+
+				pq_sendint32(&s, msg_req->dbNode);
+
+				break;
+			}
+		case T_NeonGetPageRequest:
+			{
+				NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg;
+
+				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
+				pq_sendbyte(&s, msg_req->forknum);
+				pq_sendint32(&s, msg_req->blkno);
+
+				break;
+			}
+
+		case T_NeonGetSlruSegmentRequest:
+			{
+				NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
+
+				pq_sendbyte(&s, msg_req->kind);
+				pq_sendint32(&s, msg_req->segno);
+
+				break;
+			}
+
+			/* pagestore -> pagestore_client. We never need to create these. */
+		case T_NeonExistsResponse:
+		case T_NeonNblocksResponse:
+		case T_NeonGetPageResponse:
+		case T_NeonErrorResponse:
+		case T_NeonDbSizeResponse:
+		case T_NeonGetSlruSegmentResponse:
+		default:
+			neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
+			break;
+	}
+	return s;
+}
+
+NeonResponse *
+nm_unpack_response(StringInfo s)
+{
+	NeonMessageTag tag = pq_getmsgbyte(s);
+	NeonResponse resp_hdr = {0}; /* make valgrind happy */
+	NeonResponse *resp = NULL;
+
+	resp_hdr.tag = tag;
+	if (neon_protocol_version >= 3)
+	{
+		resp_hdr.reqid = pq_getmsgint64(s);
+		resp_hdr.lsn = pq_getmsgint64(s);
+		resp_hdr.not_modified_since = pq_getmsgint64(s);
+	}
+	switch (tag)
+	{
+			/* pagestore -> pagestore_client */
+		case T_NeonExistsResponse:
+			{
+				NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse));
+
+				if (neon_protocol_version >= 3)
+				{
+					NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					msg_resp->req.forknum = pq_getmsgbyte(s);
+				}
+				msg_resp->req.hdr = resp_hdr;
+				msg_resp->exists = pq_getmsgbyte(s);
+				pq_getmsgend(s);
+
+				resp = (NeonResponse *) msg_resp;
+				break;
+			}
+
+		case T_NeonNblocksResponse:
+			{
+				NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse));
+
+				if (neon_protocol_version >= 3)
+				{
+					NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					msg_resp->req.forknum = pq_getmsgbyte(s);
+				}
+				msg_resp->req.hdr = resp_hdr;
+				msg_resp->n_blocks = pq_getmsgint(s, 4);
+				pq_getmsgend(s);
+
+				resp = (NeonResponse *) msg_resp;
+				break;
+			}
+
+		case T_NeonGetPageResponse:
+			{
+				NeonGetPageResponse *msg_resp;
+
+				msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE);
+				if (neon_protocol_version >= 3)
+				{
+					NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					msg_resp->req.forknum = pq_getmsgbyte(s);
+					msg_resp->req.blkno = pq_getmsgint(s, 4);
+				}
+				msg_resp->req.hdr = resp_hdr;
+				/* XXX:	should be varlena */
+				memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
+				pq_getmsgend(s);
+
+				Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse);
+
+				resp = (NeonResponse *) msg_resp;
+				break;
+			}
+
+		case T_NeonDbSizeResponse:
+			{
+				NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse));
+
+				if (neon_protocol_version >= 3)
+				{
+					msg_resp->req.dbNode = pq_getmsgint(s, 4);
+				}
+				msg_resp->req.hdr = resp_hdr;
+				msg_resp->db_size = pq_getmsgint64(s);
+				pq_getmsgend(s);
+
+				resp = (NeonResponse *) msg_resp;
+				break;
+			}
+
+		case T_NeonErrorResponse:
+			{
+				NeonErrorResponse *msg_resp;
+				size_t		msglen;
+				const char *msgtext;
+
+				msgtext = pq_getmsgrawstring(s);
+				msglen = strlen(msgtext);
+
+				msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1);
+				msg_resp->req = resp_hdr;
+				memcpy(msg_resp->message, msgtext, msglen + 1);
+				pq_getmsgend(s);
+
+				resp = (NeonResponse *) msg_resp;
+				break;
+			}
+
+		case T_NeonGetSlruSegmentResponse:
+		    {
+				NeonGetSlruSegmentResponse *msg_resp;
+				int n_blocks;
+				msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse));
+
+				if (neon_protocol_version >= 3)
+				{
+					msg_resp->req.kind = pq_getmsgbyte(s);
+					msg_resp->req.segno = pq_getmsgint(s, 4);
+				}
+				msg_resp->req.hdr = resp_hdr;
+
+				n_blocks = pq_getmsgint(s, 4);
+				msg_resp->n_blocks = n_blocks;
+				memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ);
+				pq_getmsgend(s);
+
+				resp = (NeonResponse *) msg_resp;
+				break;
+			}
+
+			/*
+			 * pagestore_client -> pagestore
+			 *
+			 * We create these ourselves, and don't need to decode them.
+			 */
+		case T_NeonExistsRequest:
+		case T_NeonNblocksRequest:
+		case T_NeonGetPageRequest:
+		case T_NeonDbSizeRequest:
+		case T_NeonGetSlruSegmentRequest:
+		default:
+			neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
+			break;
+	}
+
+	return resp;
+}
+
+/* dump to json for debugging / error reporting purposes */
+char *
+nm_to_string(NeonMessage *msg)
+{
+	StringInfoData s;
+
+	initStringInfo(&s);
+
+	switch (messageTag(msg))
+	{
+			/* pagestore_client -> pagestore */
+		case T_NeonExistsRequest:
+			{
+				NeonExistsRequest *msg_req = (NeonExistsRequest *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\"");
+				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
+				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
+				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
+				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+
+		case T_NeonNblocksRequest:
+			{
+				NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\"");
+				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
+				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
+				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
+				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+
+		case T_NeonGetPageRequest:
+			{
+				NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\"");
+				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
+				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
+				appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
+				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
+				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+		case T_NeonDbSizeRequest:
+			{
+				NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
+				appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode);
+				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
+				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+		case T_NeonGetSlruSegmentRequest:
+			{
+				NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\"");
+				appendStringInfo(&s, ", \"kind\": %u", msg_req->kind);
+				appendStringInfo(&s, ", \"segno\": %u", msg_req->segno);
+				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
+				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+			/* pagestore -> pagestore_client */
+		case T_NeonExistsResponse:
+			{
+				NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\"");
+				appendStringInfo(&s, ", \"exists\": %d}",
+								 msg_resp->exists);
+				appendStringInfoChar(&s, '}');
+
+				break;
+			}
+		case T_NeonNblocksResponse:
+			{
+				NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\"");
+				appendStringInfo(&s, ", \"n_blocks\": %u}",
+								 msg_resp->n_blocks);
+				appendStringInfoChar(&s, '}');
+
+				break;
+			}
+		case T_NeonGetPageResponse:
+			{
+#if 0
+				NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg;
+#endif
+
+				appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\"");
+				appendStringInfo(&s, ", \"page\": \"XXX\"}");
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+		case T_NeonErrorResponse:
+			{
+				NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg;
+
+				/* FIXME: escape double-quotes in the message */
+				appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\"");
+				appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message);
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+		case T_NeonDbSizeResponse:
+			{
+				NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\"");
+				appendStringInfo(&s, ", \"db_size\": %ld}",
+								 msg_resp->db_size);
+				appendStringInfoChar(&s, '}');
+
+				break;
+			}
+		case T_NeonGetSlruSegmentResponse:
+			{
+				NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\"");
+				appendStringInfo(&s, ", \"n_blocks\": %u}",
+								 msg_resp->n_blocks);
+				appendStringInfoChar(&s, '}');
+
+				break;
+			}
+
+		default:
+			appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag);
+	}
+	return s.data;
+}
+
+/*
+ *	communicator_init() -- Initialize per-backend private state
+ */
+void
+communicator_init(void)
+{
+	Size		prfs_size;
+
+	if (MyPState != NULL)
+		return;
+
+	/*
+	 * Sanity check that theperf counters array is sized correctly. We got
+	 * this wrong once, and the formula for max number of backends and aux
+	 * processes might well change in the future, so better safe than sorry.
+	 * This is a very cheap check so we do it even without assertions.  On
+	 * v14, this gets called before initializing MyProc, so we cannot perform
+	 * the check here. That's OK, we don't expect the logic to change in old
+	 * releases.
+	 */
+#if PG_VERSION_NUM>=150000
+	if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS])
+		elog(ERROR, "MyNeonCounters points past end of array");
+#endif
+
+	prfs_size = offsetof(PrefetchState, prf_buffer) +
+		sizeof(PrefetchRequest) * readahead_buffer_size;
+
+	MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
+
+	MyPState->n_unused = readahead_buffer_size;
+
+	MyPState->bufctx = SlabContextCreate(TopMemoryContext,
+										 "NeonSMGR/prefetch",
+										 SLAB_DEFAULT_BLOCK_SIZE * 17,
+										 PS_GETPAGERESPONSE_SIZE);
+	MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
+											 "NeonSMGR/errors",
+											 ALLOCSET_DEFAULT_SIZES);
+	MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
+											  "NeonSMGR/prefetch",
+											  ALLOCSET_DEFAULT_SIZES);
+
+	MyPState->prf_hash = prfh_create(MyPState->hashctx,
+									 readahead_buffer_size, NULL);
+}
+
+/*
+ *  neon_prefetch_response_usable -- Can a new request be satisfied by old one?
+ *
+ * This is used to check if the response to a prefetch request can be used to
+ * satisfy a page read now.
+ */
+static bool
+neon_prefetch_response_usable(neon_request_lsns *request_lsns,
+							  PrefetchRequest *slot)
+{
+	/* sanity check the LSN's on the old and the new request */
+	Assert(request_lsns->request_lsn >= request_lsns->not_modified_since);
+	Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since);
+	Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn);
+	Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
+	Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
+	Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn);
+	Assert(slot->status != PRFS_UNUSED);
+
+	/*
+	 * The new request's LSN should never be older than the old one.  This
+	 * could be an Assert, except that for testing purposes, we do provide an
+	 * interface in neon_test_utils to fetch pages at arbitary LSNs, which
+	 * violates this.
+	 *
+	 * Similarly, the not_modified_since value calculated for a page should
+	 * never move backwards. This assumption is a bit fragile; if we updated
+	 * the last-written cache when we read in a page, for example, then it
+	 * might. But as the code stands, it should not.
+	 *
+	 * (If two backends issue a request at the same time, they might race and
+	 * calculate LSNs "out of order" with each other, but the prefetch queue
+	 * is backend-private at the moment.)
+	 */
+	if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn ||
+		request_lsns->not_modified_since < slot->request_lsns.not_modified_since)
+	{
+		ereport(LOG,
+				(errcode(ERRCODE_IO_ERROR),
+				 errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
+				 errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
+						   LSN_FORMAT_ARGS(request_lsns->effective_request_lsn),
+						   LSN_FORMAT_ARGS(request_lsns->not_modified_since),
+						   LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn),
+						   LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since))));
+		return false;
+	}
+
+	/*---
+	 * Each request to the pageserver has three LSN values associated with it:
+	 * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'.
+	 * `not_modified_since` and `request_lsn` are sent to the pageserver, but
+	 * in the primary node, we always use UINT64_MAX as the `request_lsn`, so
+	 * we remember `effective_request_lsn` separately. In a primary,
+	 * `effective_request_lsn` is the same as  `not_modified_since`.
+	 * See comments in neon_get_request_lsns why we can not use last flush WAL position here.
+	 *
+	 * To determine whether a response to a GetPage request issued earlier is
+	 * still valid to satisfy a new page read, we look at the
+	 * (not_modified_since, effective_request_lsn] range of the request. It is
+	 * effectively a claim that the page has not been modified between those
+	 * LSNs.  If the range of the old request in the queue overlaps with the
+	 * new request, we know that the page hasn't been modified in the union of
+	 * the ranges. We can use the response to old request to satisfy the new
+	 * request in that case. For example:
+	 *
+	 *              100      500
+	 * Old request:  +--------+
+	 *
+	 *                     400      800
+	 * New request:         +--------+
+	 *
+	 * The old request claims that the page was not modified between LSNs 100
+	 * and 500, and the second claims that it was not modified between 400 and
+	 * 800. Together they mean that the page was not modified between 100 and
+	 * 800. Therefore the response to the old request is also valid for the
+	 * new request.
+	 *
+	 * This logic also holds at the boundary case that the old request's LSN
+	 * matches the new request's not_modified_since LSN exactly:
+	 *
+	 *              100      500
+	 * Old request:  +--------+
+	 *
+	 *                       500      900
+	 * New request:           +--------+
+	 *
+	 * The response to the old request is the page as it was at LSN 500, and
+	 * the page hasn't been changed in the range (500, 900], therefore the
+	 * response is valid also for the new request.
+	 */
+
+	/* this follows from the checks above */
+	Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since);
+
+	return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn;
+}
+
+/*
+ *	Does the physical file exist?
+ */
+bool
+communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *request_lsns)
+{
+	bool		exists;
+	NeonResponse *resp;
+
+	{
+		NeonExistsRequest request = {
+			.hdr.tag = T_NeonExistsRequest,
+			.hdr.reqid = GENERATE_REQUEST_ID(),
+			.hdr.lsn = request_lsns->request_lsn,
+			.hdr.not_modified_since = request_lsns->not_modified_since,
+			.rinfo = rinfo,
+			.forknum = forkNum
+		};
+
+		resp = page_server_request(&request);
+
+		switch (resp->tag)
+		{
+			case T_NeonExistsResponse:
+			{
+				NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp;
+				if (neon_protocol_version >= 3)
+				{
+					if (!equal_requests(resp, &request.hdr) ||
+						!RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) ||
+						exists_resp->req.forknum != request.forknum)
+					{
+						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
+													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
+													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
+					}
+				}
+				exists = exists_resp->exists;
+				break;
+			}
+			case T_NeonErrorResponse:
+				if (neon_protocol_version >= 3)
+				{
+					if (!equal_requests(resp, &request.hdr))
+					{
+						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
+							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
+					}
+				}
+				ereport(ERROR,
+						(errcode(ERRCODE_IO_ERROR),
+						 errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+								resp->reqid,
+								RelFileInfoFmt(rinfo),
+								forkNum,
+								LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)),
+						 errdetail("page server returned error: %s",
+								   ((NeonErrorResponse *) resp)->message)));
+				break;
+
+			default:
+				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+											"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
+											T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
+		}
+		pfree(resp);
+	}
+	return exists;
+}
+
+/*
+ * Read N pages at a specific LSN.
+ *
+ * *mask is set for pages read at a previous point in time, and which we
+ * should not touch, nor overwrite.
+ * New bits should be set in *mask for the pages we'successfully read.
+ *
+ * The offsets in request_lsns, buffers, and mask are linked.
+ */
+void
+communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno,
+						  neon_request_lsns *request_lsns,
+						  void **buffers, BlockNumber nblocks, const bits8 *mask)
+{
+	NeonResponse *resp;
+	uint64		ring_index;
+	PrfHashEntry *entry;
+	PrefetchRequest *slot;
+	PrefetchRequest hashkey;
+
+	Assert(PointerIsValid(request_lsns));
+	Assert(nblocks >= 1);
+
+	/*
+	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
+	 * correct alignment and that the padding bytes are cleared.
+	 */
+	memset(&hashkey.buftag, 0, sizeof(BufferTag));
+	CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo);
+	hashkey.buftag.forkNum = forkNum;
+	hashkey.buftag.blockNum = base_blockno;
+
+	/*
+	 * The redo process does not lock pages that it needs to replay but are
+	 * not in the shared buffers, so a concurrent process may request the page
+	 * after redo has decided it won't redo that page and updated the LwLSN
+	 * for that page. If we're in hot standby we need to take care that we
+	 * don't return until after REDO has finished replaying up to that LwLSN,
+	 * as the page should have been locked up to that point.
+	 *
+	 * See also the description on neon_redo_read_buffer_filter below.
+	 *
+	 * NOTE: It is possible that the WAL redo process will still do IO due to
+	 * concurrent failed read IOs. Those IOs should never have a request_lsn
+	 * that is as large as the WAL record we're currently replaying, if it
+	 * weren't for the behaviour of the LwLsn cache that uses the highest
+	 * value of the LwLsn cache when the entry is not found.
+	 */
+	(void) prefetch_register_bufferv(hashkey.buftag, request_lsns, nblocks, mask, false);
+
+	for (int i = 0; i < nblocks; i++)
+	{
+		void	   *buffer = buffers[i];
+		BlockNumber blockno = base_blockno + i;
+		neon_request_lsns *reqlsns = &request_lsns[i];
+		TimestampTz		start_ts, end_ts;
+
+		if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
+			continue;
+
+		start_ts = GetCurrentTimestamp();
+
+		if (RecoveryInProgress() && MyBackendType != B_STARTUP)
+			XLogWaitForReplayOf(reqlsns->request_lsn);
+
+		/*
+		 * Try to find prefetched page in the list of received pages.
+		 */
+Retry:
+		hashkey.buftag.blockNum = blockno;
+		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
+
+		if (entry != NULL)
+		{
+			slot = entry->slot;
+			if (neon_prefetch_response_usable(reqlsns, slot))
+			{
+				ring_index = slot->my_ring_index;
+			}
+			else
+			{
+				/*
+				 * Cannot use this prefetch, discard it
+				 *
+				 * We can't drop cache for not-yet-received requested items. It is
+				 * unlikely this happens, but it can happen if prefetch distance
+				 * is large enough and a backend didn't consume all prefetch
+				 * requests.
+				 */
+				if (slot->status == PRFS_REQUESTED)
+				{
+					if (!prefetch_wait_for(slot->my_ring_index))
+						goto Retry;
+				}
+				/* drop caches */
+				prefetch_set_unused(slot->my_ring_index);
+				pgBufferUsage.prefetch.expired += 1;
+				MyNeonCounters->getpage_prefetch_discards_total++;
+				/* make it look like a prefetch cache miss */
+				entry = NULL;
+			}
+		}
+
+		do
+		{
+			if (entry == NULL)
+			{
+				ring_index = prefetch_register_bufferv(hashkey.buftag, reqlsns, 1, NULL, false);
+				Assert(ring_index != UINT64_MAX);
+				slot = GetPrfSlot(ring_index);
+			}
+			else
+			{
+				/*
+				 * Empty our reference to the prefetch buffer's hash entry. When
+				 * we wait for prefetches, the entry reference is invalidated by
+				 * potential updates to the hash, and when we reconnect to the
+				 * pageserver the prefetch we're waiting for may be dropped, in
+				 * which case we need to retry and take the branch above.
+				 */
+				entry = NULL;
+			}
+
+			Assert(slot->my_ring_index == ring_index);
+			Assert(MyPState->ring_last <= ring_index &&
+				   MyPState->ring_unused > ring_index);
+			Assert(slot->status != PRFS_UNUSED);
+			Assert(GetPrfSlot(ring_index) == slot);
+
+		} while (!prefetch_wait_for(ring_index));
+
+		Assert(slot->status == PRFS_RECEIVED);
+		Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0);
+		Assert(hashkey.buftag.blockNum == base_blockno + i);
+
+		resp = slot->response;
+
+		switch (resp->tag)
+		{
+			case T_NeonGetPageResponse:
+			{
+				NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp;
+				if (neon_protocol_version >= 3)
+				{
+					if (resp->reqid != slot->reqid ||
+						resp->lsn != slot->request_lsns.request_lsn ||
+						resp->not_modified_since != slot->request_lsns.not_modified_since ||
+						!RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) ||
+						getpage_resp->req.forknum != forkNum ||
+						getpage_resp->req.blkno != base_blockno + i)
+					{
+						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
+													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
+													slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i);
+					}
+				}
+				memcpy(buffer, getpage_resp->page, BLCKSZ);
+
+				/*
+				 * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received
+				 * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here
+				 * under buffer lock.
+				 */
+				if (!lfc_store_prefetch_result)
+					lfc_write(rinfo, forkNum, blockno, buffer);
+				break;
+			}
+			case T_NeonErrorResponse:
+				if (neon_protocol_version >= 3)
+				{
+					if (resp->reqid != slot->reqid ||
+						resp->lsn != slot->request_lsns.request_lsn ||
+						resp->not_modified_since != slot->request_lsns.not_modified_since)
+					{
+						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
+							 slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
+					}
+				}
+				ereport(ERROR,
+						(errcode(ERRCODE_IO_ERROR),
+						 errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
+								slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo),
+								forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
+						 errdetail("page server returned error: %s",
+								   ((NeonErrorResponse *) resp)->message)));
+				break;
+			default:
+				NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
+											"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
+											T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
+		}
+
+		/* buffer was used, clean up for later reuse */
+		prefetch_set_unused(ring_index);
+		prefetch_cleanup_trailing_unused();
+
+		end_ts = GetCurrentTimestamp();
+		inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0);
+	}
+}
+
+/*
+ *	neon_nblocks() -- Get the number of blocks stored in a relation.
+ */
+BlockNumber
+communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *request_lsns)
+{
+	NeonResponse *resp;
+	BlockNumber n_blocks;
+
+	{
+		NeonNblocksRequest request = {
+			.hdr.tag = T_NeonNblocksRequest,
+			.hdr.reqid = GENERATE_REQUEST_ID(),
+			.hdr.lsn = request_lsns->request_lsn,
+			.hdr.not_modified_since = request_lsns->not_modified_since,
+			.rinfo = rinfo,
+			.forknum = forknum,
+		};
+
+		resp = page_server_request(&request);
+
+		switch (resp->tag)
+		{
+			case T_NeonNblocksResponse:
+			{
+				NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp;
+				if (neon_protocol_version >= 3)
+				{
+					if (!equal_requests(resp, &request.hdr) ||
+						!RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) ||
+						relsize_resp->req.forknum != forknum)
+					{
+						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
+													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
+													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
+					}
+				}
+				n_blocks = relsize_resp->n_blocks;
+				break;
+			}
+			case T_NeonErrorResponse:
+				if (neon_protocol_version >= 3)
+				{
+					if (!equal_requests(resp, &request.hdr))
+					{
+						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
+							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
+					}
+				}
+				ereport(ERROR,
+						(errcode(ERRCODE_IO_ERROR),
+						 errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+								resp->reqid,
+								RelFileInfoFmt(rinfo),
+								forknum,
+								LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)),
+						 errdetail("page server returned error: %s",
+								   ((NeonErrorResponse *) resp)->message)));
+				break;
+
+			default:
+				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+											"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
+											T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
+		}
+
+		pfree(resp);
+	}
+	return n_blocks;
+}
+
+/*
+ *	neon_db_size() -- Get the size of the database in bytes.
+ */
+int64
+communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
+{
+	NeonResponse *resp;
+	int64		db_size;
+
+	{
+		NeonDbSizeRequest request = {
+			.hdr.tag = T_NeonDbSizeRequest,
+			.hdr.reqid = GENERATE_REQUEST_ID(),
+			.hdr.lsn = request_lsns->request_lsn,
+			.hdr.not_modified_since = request_lsns->not_modified_since,
+			.dbNode = dbNode,
+		};
+
+		resp = page_server_request(&request);
+
+		switch (resp->tag)
+		{
+			case T_NeonDbSizeResponse:
+			{
+				NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp;
+				if (neon_protocol_version >= 3)
+				{
+					if (!equal_requests(resp, &request.hdr) ||
+						dbsize_resp->req.dbNode != dbNode)
+					{
+						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
+													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode,
+													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode);
+					}
+				}
+				db_size = dbsize_resp->db_size;
+				break;
+			}
+			case T_NeonErrorResponse:
+				if (neon_protocol_version >= 3)
+				{
+					if (!equal_requests(resp, &request.hdr))
+					{
+						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
+							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
+					}
+				}
+				ereport(ERROR,
+						(errcode(ERRCODE_IO_ERROR),
+						 errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X",
+								resp->reqid,
+								dbNode, LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)),
+						 errdetail("page server returned error: %s",
+								   ((NeonErrorResponse *) resp)->message)));
+				break;
+
+			default:
+				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+											"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
+											T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
+		}
+
+		pfree(resp);
+	}
+	return db_size;
+}
+
+int
+communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *request_lsns,
+							   void *buffer)
+{
+	int			n_blocks;
+	shardno_t	shard_no = 0; /* All SLRUs are at shard 0 */
+	NeonResponse *resp;
+	NeonGetSlruSegmentRequest request;
+
+	request = (NeonGetSlruSegmentRequest) {
+		.hdr.tag = T_NeonGetSlruSegmentRequest,
+		.hdr.reqid = GENERATE_REQUEST_ID(),
+		.hdr.lsn = request_lsns->request_lsn,
+		.hdr.not_modified_since = request_lsns->not_modified_since,
+		.kind = kind,
+		.segno = segno
+	};
+
+	do
+	{
+		while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no));
+
+		consume_prefetch_responses();
+
+		resp = page_server->receive(shard_no);
+	} while (resp == NULL);
+
+	switch (resp->tag)
+	{
+		case T_NeonGetSlruSegmentResponse:
+		{
+			NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp;
+			if (neon_protocol_version >= 3)
+			{
+				if (!equal_requests(resp, &request.hdr) ||
+					slru_resp->req.kind != kind ||
+					slru_resp->req.segno != segno)
+				{
+					NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+												"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}",
+												resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno,
+												request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno);
+				}
+			}
+			n_blocks = slru_resp->n_blocks;
+			memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ);
+			break;
+		}
+		case T_NeonErrorResponse:
+			if (neon_protocol_version >= 3)
+			{
+				if (!equal_requests(resp, &request.hdr))
+				{
+					elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+						 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
+						 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
+				}
+			}
+			ereport(ERROR,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %llu at lsn %X/%08X",
+							resp->reqid,
+							kind,
+							(unsigned long long) segno,
+							LSN_FORMAT_ARGS(request_lsns->request_lsn)),
+					 errdetail("page server returned error: %s",
+							   ((NeonErrorResponse *) resp)->message)));
+			break;
+
+		default:
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
+										T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
+	}
+	pfree(resp);
+
+	communicator_reconfigure_timeout_if_needed();
+	return n_blocks;
+}
+
+void
+communicator_reconfigure_timeout_if_needed(void)
+{
+	bool	needs_set = MyPState->ring_receive != MyPState->ring_unused &&
+						readahead_getpage_pull_timeout_ms > 0;
+
+	if (needs_set != timeout_set)
+	{
+		/* The background writer doens't (shouldn't) read any pages */
+		Assert(!AmBackgroundWriterProcess());
+		/* The checkpointer doens't (shouldn't) read any pages */
+		Assert(!AmCheckpointerProcess());
+
+		if (unlikely(PS_TIMEOUT_ID == 0))
+		{
+			PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler);
+		}
+
+		if (needs_set)
+		{
+#if PG_MAJORVERSION_NUM <= 14
+			enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms);
+#else
+			enable_timeout_every(
+				PS_TIMEOUT_ID,
+				TimestampTzPlusMilliseconds(GetCurrentTimestamp(),
+											readahead_getpage_pull_timeout_ms),
+				readahead_getpage_pull_timeout_ms
+			);
+#endif
+			timeout_set = true;
+		}
+		else
+		{
+			Assert(timeout_set);
+			disable_timeout(PS_TIMEOUT_ID, false);
+			timeout_set = false;
+		}
+	}
+}
+
+static void
+pagestore_timeout_handler(void)
+{
+#if PG_MAJORVERSION_NUM <= 14
+	/*
+	 * PG14: Setting a repeating timeout is not possible, so we signal here
+	 * that the timeout has already been reset, and by telling the system
+	 * that system will re-schedule it later if we need to.
+	 */
+	timeout_set = false;
+#endif
+	timeout_signaled = true;
+	InterruptPending = true;
+}
+
+/*
+ * Process new data received in our active PageStream sockets.
+ *
+ * This relies on the invariant that all pipelined yet-to-be-received requests
+ * are getPage requests managed by MyPState. This is currently true, any
+ * modification will probably require some stuff to make it work again.
+ */
+static bool
+communicator_processinterrupts(void)
+{
+	if (timeout_signaled)
+	{
+		if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0)
+			communicator_prefetch_pump_state(true);
+
+		timeout_signaled = false;
+		communicator_reconfigure_timeout_if_needed();
+	}
+
+	if (!prev_interrupt_cb)
+		return false;
+
+	return prev_interrupt_cb();
+}
diff --git a/pgxn/neon/communicator.h b/pgxn/neon/communicator.h
new file mode 100644
index 0000000000..72cba526c1
--- /dev/null
+++ b/pgxn/neon/communicator.h
@@ -0,0 +1,48 @@
+/*-------------------------------------------------------------------------
+ *
+ * communicator.h
+ *	  internal interface for communicating with remote pageservers
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef COMMUNICATOR_h
+#define COMMUNICATOR_h
+
+#include "neon_pgversioncompat.h"
+
+#include "storage/buf_internals.h"
+
+#include "pagestore_client.h"
+
+/* initialization at postmaster startup */
+extern void pg_init_communicator(void);
+
+/* initialization at backend startup */
+extern void communicator_init(void);
+
+extern bool communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum,
+								neon_request_lsns *request_lsns);
+extern BlockNumber communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum,
+										neon_request_lsns *request_lsns);
+extern int64 communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns);
+extern void communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum,
+									  BlockNumber base_blockno, neon_request_lsns *request_lsns,
+									  void **buffers, BlockNumber nblocks, const bits8 *mask);
+extern int communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum,
+										 neon_request_lsns *lsns,
+										 BlockNumber nblocks, void **buffers, bits8 *mask);
+extern void communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
+												   BlockNumber nblocks, const bits8 *mask);
+extern int communicator_read_slru_segment(SlruKind kind, int64 segno,
+										  neon_request_lsns *request_lsns,
+										  void *buffer);
+
+extern void communicator_reconfigure_timeout_if_needed(void);
+extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts);
+
+
+#endif
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 69da83f3fb..a6a7021756 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -28,6 +28,7 @@
 #include "utils/guc.h"
 #include "utils/guc_tables.h"
 
+#include "communicator.h"
 #include "extension_server.h"
 #include "file_cache.h"
 #include "neon.h"
@@ -439,7 +440,7 @@ _PG_init(void)
 	pg_init_walproposer();
 	init_lwlsncache();
 
-	pagestore_smgr_init();
+	pg_init_communicator();
 	Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
 
 	InitUnstableExtensionsSupport();
diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h
index a4339c9776..a2e81feb5f 100644
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -59,7 +59,6 @@ extern uint32		WAIT_EVENT_NEON_WAL_DL;
 
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);
-extern void pagestore_smgr_init(void);
 
 extern uint64 BackpressureThrottlingTime(void);
 extern void SetNeonCurrentClusterSize(uint64 size);
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 6ddad21362..0ab539fe56 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -233,6 +233,7 @@ extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);
 
+
 /*
  * LSN values associated with each request to the pageserver
  */
@@ -269,6 +270,10 @@ extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum,
 										 neon_request_lsns request_lsns, void *buffer);
 extern int64 neon_dbsize(Oid dbNode);
 
+extern void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum,
+								  BlockNumber blkno, neon_request_lsns *output,
+								  BlockNumber nblocks);
+
 /* utils for neon relsize cache */
 extern void relsize_hash_init(void);
 extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size);
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 0a43f3a6a3..ef6bd038bb 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -49,9 +49,6 @@
 #include "access/xlog_internal.h"
 #include "access/xlogutils.h"
 #include "catalog/pg_class.h"
-#include "common/hashfn.h"
-#include "executor/instrument.h"
-#include "libpq/pqformat.h"
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
 #include "postmaster/interrupt.h"
@@ -62,9 +59,9 @@
 #include "storage/fsm_internals.h"
 #include "storage/md.h"
 #include "storage/smgr.h"
-#include "utils/timeout.h"
 
 #include "bitmap.h"
+#include "communicator.h"
 #include "file_cache.h"
 #include "neon.h"
 #include "neon_lwlsncache.h"
@@ -102,12 +99,6 @@ static char *hexdump_page(char *page);
 
 const int	SmgrTrace = DEBUG5;
 
-#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \
-	neon_shard_log(shard_no, elvl, "Broken connection state: " message, \
-				   ##__VA_ARGS__)
-
-page_server_api *page_server;
-
 /* unlogged relation build states */
 typedef enum
 {
@@ -125,1685 +116,6 @@ static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block
 
 static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum);
 
-static uint32 local_request_counter;
-#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter)
-
-/*
- * Various settings related to prompt (fast) handling of PageStream responses
- * at any CHECK_FOR_INTERRUPTS point.
- */
-int				readahead_getpage_pull_timeout_ms = 0;
-static int		PS_TIMEOUT_ID = 0;
-static bool		timeout_set = false;
-static bool		timeout_signaled = false;
-
-/*
- * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want
- * that to handle any getpage responses if we're already working on the
- * backlog of those, as we'd hit issues with determining which prefetch slot
- * we just got a response for.
- *
- * To protect against that, we have this variable that's set whenever we start
- * receiving data for prefetch slots, so that we don't get confused.
- *
- * Note that in certain error cases during readpage we may leak r_r_g=true,
- * which results in a failure to pick up further responses until we first
- * actively try to receive new getpage responses.
- */
-static bool		readpage_reentrant_guard = false;
-
-static void reconfigure_timeout_if_needed(void);
-static void pagestore_timeout_handler(void);
-
-#define START_PREFETCH_RECEIVE_WORK() \
-	do { \
-		readpage_reentrant_guard = true; \
-	} while (false)
-
-#define END_PREFETCH_RECEIVE_WORK() \
-	do { \
-		readpage_reentrant_guard = false; \
-		if (unlikely(timeout_signaled && !InterruptPending)) \
-			InterruptPending = true; \
-	} while (false)
-
-/*
- * Prefetch implementation:
- *
- * Prefetch is performed locally by each backend.
- *
- * There can be up to readahead_buffer_size active IO requests registered at
- * any time. Requests using smgr_prefetch are sent to the pageserver, but we
- * don't wait on the response. Requests using smgr_read are either read from
- * the buffer, or (if that's not possible) we wait on the response to arrive -
- * this also will allow us to receive other prefetched pages.
- * Each request is immediately written to the output buffer of the pageserver
- * connection, but may not be flushed if smgr_prefetch is used: pageserver
- * flushes sent requests on manual flush, or every neon.flush_output_after
- * unflushed requests; which is not necessarily always and all the time.
- *
- * Once we have received a response, this value will be stored in the response
- * buffer, indexed in a hash table. This allows us to retain our buffered
- * prefetch responses even when we have cache misses.
- *
- * Reading of prefetch responses is delayed until them are actually needed
- * (smgr_read). In case of prefetch miss or any other SMGR request other than
- * smgr_read, all prefetch responses in the pipeline will need to be read from
- * the connection; the responses are stored for later use.
- *
- * NOTE: The current implementation of the prefetch system implements a ring
- * buffer of up to readahead_buffer_size requests. If there are more _read and
- * _prefetch requests between the initial _prefetch and the _read of a buffer,
- * the prefetch request will have been dropped from this prefetch buffer, and
- * your prefetch was wasted.
- */
-
-/*
- * State machine:
- *
- * not in hash : in hash
- *             :
- * UNUSED ------> REQUESTED --> RECEIVED
- *   ^         :      |            |
- *   |         :      v            |
- *   |         : TAG_REMAINS       |
- *   |         :      |            |
- *   +----------------+------------+
- *             :
- */
-typedef enum PrefetchStatus
-{
-	PRFS_UNUSED = 0,			/* unused slot */
-	PRFS_REQUESTED,				/* request was written to the sendbuffer to
-								 * PS, but not necessarily flushed. all fields
-								 * except response valid */
-	PRFS_RECEIVED,				/* all fields valid */
-	PRFS_TAG_REMAINS,			/* only buftag and my_ring_index are still
-								 * valid */
-} PrefetchStatus;
-
-/* must fit in uint8; bits 0x1 are used */
-typedef enum {
-	PRFSF_NONE	= 0x0,
-	PRFSF_LFC	= 0x1  /* received prefetch result is stored in LFC */
-} PrefetchRequestFlags;
-
-typedef struct PrefetchRequest
-{
-	BufferTag	buftag;			/* must be first entry in the struct */
-	shardno_t	shard_no;
-	uint8		status;		/* see PrefetchStatus for valid values */
-	uint8		flags;		/* see PrefetchRequestFlags */
-	neon_request_lsns request_lsns;
-	NeonRequestId reqid;
-	NeonResponse *response;		/* may be null */
-	uint64		my_ring_index;
-} PrefetchRequest;
-
-/* prefetch buffer lookup hash table */
-
-typedef struct PrfHashEntry
-{
-	PrefetchRequest *slot;
-	uint32		status;
-	uint32		hash;
-} PrfHashEntry;
-
-#define SH_PREFIX			prfh
-#define SH_ELEMENT_TYPE		PrfHashEntry
-#define SH_KEY_TYPE			PrefetchRequest *
-#define SH_KEY				slot
-#define SH_STORE_HASH
-#define SH_GET_HASH(tb, a)	((a)->hash)
-#define SH_HASH_KEY(tb, key) hash_bytes( \
-	((const unsigned char *) &(key)->buftag), \
-	sizeof(BufferTag) \
-)
-
-#define SH_EQUAL(tb, a, b)	(BufferTagsEqual(&(a)->buftag, &(b)->buftag))
-#define SH_SCOPE			static inline
-#define SH_DEFINE
-#define SH_DECLARE
-#include "lib/simplehash.h"
-
-/*
- * PrefetchState maintains the state of (prefetch) getPage@LSN requests.
- * It maintains a (ring) buffer of in-flight requests and responses.
- *
- * We maintain several indexes into the ring buffer:
- * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
- *
- * ring_unused points to the first unused slot of the buffer
- * ring_receive is the next request that is to be received
- * ring_last is the oldest received entry in the buffer
- *
- * Apart from being an entry in the ring buffer of prefetch requests, each
- * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
- */
-typedef struct PrefetchState
-{
-	MemoryContext bufctx;		/* context for prf_buffer[].response
-								 * allocations */
-	MemoryContext errctx;		/* context for prf_buffer[].response
-								 * allocations */
-	MemoryContext hashctx;		/* context for prf_buffer */
-
-	/* buffer indexes */
-	uint64		ring_unused;	/* first unused slot */
-	uint64		ring_flush;		/* next request to flush */
-	uint64		ring_receive;	/* next slot that is to receive a response */
-	uint64		ring_last;		/* min slot with a response value */
-
-	/* metrics / statistics  */
-	int			n_responses_buffered;	/* count of PS responses not yet in
-										 * buffers */
-	int			n_requests_inflight;	/* count of PS requests considered in
-										 * flight */
-	int			n_unused;		/* count of buffers < unused, > last, that are
-								 * also unused */
-
-	/* the buffers */
-	prfh_hash	*prf_hash;
-	int			max_shard_no;
-	/* Mark shards involved in prefetch */
-	uint8		shard_bitmap[(MAX_SHARDS + 7)/8];
-	PrefetchRequest prf_buffer[];	/* prefetch buffers */
-} PrefetchState;
-
-static PrefetchState *MyPState;
-
-#define GetPrfSlotNoCheck(ring_index) ( \
-	&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
-)
-
-#define GetPrfSlot(ring_index) ( \
-	( \
-		AssertMacro((ring_index) < MyPState->ring_unused && \
-					(ring_index) >= MyPState->ring_last), \
-		GetPrfSlotNoCheck(ring_index) \
-	) \
-)
-
-#define ReceiveBufferNeedsCompaction() (\
-	(MyPState->n_responses_buffered / 8) < ( \
-		MyPState->ring_receive - \
-			MyPState->ring_last - \
-			MyPState->n_responses_buffered \
-	) \
-)
-
-static bool compact_prefetch_buffers(void);
-static void consume_prefetch_responses(void);
-static bool prefetch_read(PrefetchRequest *slot);
-static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
-static bool prefetch_wait_for(uint64 ring_index);
-static void prefetch_cleanup_trailing_unused(void);
-static inline void prefetch_set_unused(uint64 ring_index);
-
-static void
-neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum,
-					  BlockNumber blkno, neon_request_lsns *output,
-					  BlockNumber nblocks);
-static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns,
-										  PrefetchRequest *slot);
-
-static bool
-compact_prefetch_buffers(void)
-{
-	uint64		empty_ring_index = MyPState->ring_last;
-	uint64		search_ring_index = MyPState->ring_receive;
-	int			n_moved = 0;
-
-	if (MyPState->ring_receive == MyPState->ring_last)
-		return false;
-
-	while (search_ring_index > MyPState->ring_last)
-	{
-		search_ring_index--;
-		if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED)
-		{
-			empty_ring_index = search_ring_index;
-			break;
-		}
-	}
-
-	/*
-	 * Here we have established: slots < search_ring_index have an unknown
-	 * state (not scanned) slots >= search_ring_index and <= empty_ring_index
-	 * are unused slots > empty_ring_index are in use, or outside our buffer's
-	 * range. ... unless search_ring_index <= ring_last
-	 *
-	 * Therefore, there is a gap of at least one unused items between
-	 * search_ring_index and empty_ring_index (both inclusive), which grows as
-	 * we hit more unused items while moving backwards through the array.
-	 */
-
-	while (search_ring_index > MyPState->ring_last)
-	{
-		PrefetchRequest *source_slot;
-		PrefetchRequest *target_slot;
-		bool		found;
-
-		/* update search index to an unprocessed entry */
-		search_ring_index--;
-
-		source_slot = GetPrfSlot(search_ring_index);
-
-		if (source_slot->status == PRFS_UNUSED)
-			continue;
-
-		/* slot is used -- start moving slot */
-		target_slot = GetPrfSlot(empty_ring_index);
-
-		Assert(source_slot->status == PRFS_RECEIVED);
-		Assert(target_slot->status == PRFS_UNUSED);
-
-		target_slot->buftag = source_slot->buftag;
-		target_slot->shard_no = source_slot->shard_no;
-		target_slot->status = source_slot->status;
-		target_slot->flags = source_slot->flags;
-		target_slot->response = source_slot->response;
-		target_slot->reqid = source_slot->reqid;
-		target_slot->request_lsns = source_slot->request_lsns;
-		target_slot->my_ring_index = empty_ring_index;
-
-		prfh_delete(MyPState->prf_hash, source_slot);
-		prfh_insert(MyPState->prf_hash, target_slot, &found);
-
-		Assert(!found);
-
-		/* Adjust the location of our known-empty slot */
-		empty_ring_index--;
-
-		/* empty the moved slot */
-		source_slot->status = PRFS_UNUSED;
-		source_slot->buftag = (BufferTag)
-		{
-			0
-		};
-		source_slot->response = NULL;
-		source_slot->my_ring_index = 0;
-		source_slot->request_lsns = (neon_request_lsns) {
-			InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr
-		};
-
-		/* update bookkeeping */
-		n_moved++;
-	}
-
-	/*
-	 * Only when we've moved slots we can expect trailing unused slots, so
-	 * only then we clean up trailing unused slots.
-	 */
-	if (n_moved > 0)
-	{
-		prefetch_cleanup_trailing_unused();
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * If there might be responses still in the TCP buffer, then we should try to
- * use those, to reduce any TCP backpressure on the OS/PS side.
- *
- * This procedure handles that.
- *
- * Note that this works because we don't pipeline non-getPage requests.
- *
- * NOTE: This procedure is not allowed to throw errors that should be handled
- * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS
- * point inside and outside PostgreSQL.
- *
- * This still does throw errors when it receives malformed responses from PS.
- *
- * When we're not called from CHECK_FOR_INTERRUPTS (indicated by
- * IsHandlingInterrupts) we also report we've ended prefetch receive work,
- * just in case state tracking was lost due to an error in the sync getPage
- * response code.
- */
-static void
-prefetch_pump_state(bool IsHandlingInterrupts)
-{
-	while (MyPState->ring_receive != MyPState->ring_flush)
-	{
-		NeonResponse   *response;
-		PrefetchRequest *slot;
-		MemoryContext	old;
-
-		slot = GetPrfSlot(MyPState->ring_receive);
-
-		old = MemoryContextSwitchTo(MyPState->errctx);
-		response = page_server->try_receive(slot->shard_no);
-		MemoryContextSwitchTo(old);
-
-		if (response == NULL)
-			break;
-
-		/* The slot should still be valid */
-		if (slot->status != PRFS_REQUESTED ||
-			slot->response != NULL ||
-			slot->my_ring_index != MyPState->ring_receive)
-			neon_shard_log(slot->shard_no, ERROR,
-						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
-						   slot->status, slot->response,
-						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
-
-		/* update prefetch state */
-		MyPState->n_responses_buffered += 1;
-		MyPState->n_requests_inflight -= 1;
-		MyPState->ring_receive += 1;
-		MyNeonCounters->getpage_prefetches_buffered =
-			MyPState->n_responses_buffered;
-
-		/* update slot state */
-		slot->status = PRFS_RECEIVED;
-		slot->response = response;
-
-		if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result)
-		{
-			/*
-			 * Store prefetched result in LFC (please read comments to lfc_prefetch
-			 * explaining why it can be done without holding shared buffer lock
-			 */
-			if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since))
-			{
-				slot->flags |= PRFSF_LFC;
-			}
-		}
-	}
-
-	/* We never pump the prefetch state while handling other pages */
-	if (!IsHandlingInterrupts)
-		END_PREFETCH_RECEIVE_WORK();
-
-	reconfigure_timeout_if_needed();
-}
-
-void
-readahead_buffer_resize(int newsize, void *extra)
-{
-	uint64		end,
-				nfree = newsize;
-	PrefetchState *newPState;
-	Size		newprfs_size = offsetof(PrefetchState, prf_buffer) +
-		(sizeof(PrefetchRequest) * newsize);
-
-	/* don't try to re-initialize if we haven't initialized yet */
-	if (MyPState == NULL)
-		return;
-
-	/*
-	 * Make sure that we don't lose track of active prefetch requests by
-	 * ensuring we have received all but the last n requests (n = newsize).
-	 */
-	if (MyPState->n_requests_inflight > newsize)
-	{
-		prefetch_wait_for(MyPState->ring_unused - newsize - 1);
-		Assert(MyPState->n_requests_inflight <= newsize);
-	}
-
-	/* construct the new PrefetchState, and copy over the memory contexts */
-	newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
-
-	newPState->bufctx = MyPState->bufctx;
-	newPState->errctx = MyPState->errctx;
-	newPState->hashctx = MyPState->hashctx;
-	newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL);
-	newPState->n_unused = newsize;
-	newPState->n_requests_inflight = 0;
-	newPState->n_responses_buffered = 0;
-	newPState->ring_last = newsize;
-	newPState->ring_unused = newsize;
-	newPState->ring_receive = newsize;
-	newPState->max_shard_no = MyPState->max_shard_no;
-	memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));
-
-	/*
-	 * Copy over the prefetches.
-	 *
-	 * We populate the prefetch array from the end; to retain the most recent
-	 * prefetches, but this has the benefit of only needing to do one
-	 * iteration on the dataset, and trivial compaction.
-	 */
-	for (end = MyPState->ring_unused - 1;
-		 end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
-		 end -= 1)
-	{
-		PrefetchRequest *slot = GetPrfSlot(end);
-		PrefetchRequest *newslot;
-		bool		found;
-
-		if (slot->status == PRFS_UNUSED)
-			continue;
-
-		nfree -= 1;
-
-		newslot = &newPState->prf_buffer[nfree];
-		*newslot = *slot;
-		newslot->my_ring_index = nfree;
-
-		prfh_insert(newPState->prf_hash, newslot, &found);
-
-		Assert(!found);
-
-		switch (newslot->status)
-		{
-			case PRFS_UNUSED:
-				pg_unreachable();
-			case PRFS_REQUESTED:
-				newPState->n_requests_inflight += 1;
-				newPState->ring_receive -= 1;
-				newPState->ring_last -= 1;
-				break;
-			case PRFS_RECEIVED:
-				newPState->n_responses_buffered += 1;
-				newPState->ring_last -= 1;
-				break;
-			case PRFS_TAG_REMAINS:
-				newPState->ring_last -= 1;
-				break;
-		}
-		newPState->n_unused -= 1;
-	}
-	newPState->ring_flush = newPState->ring_receive;
-
-	MyNeonCounters->getpage_prefetches_buffered =
-		MyPState->n_responses_buffered;
-	MyNeonCounters->pageserver_open_requests =
-		MyPState->n_requests_inflight;
-
-	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
-	{
-		PrefetchRequest *slot = GetPrfSlot(end);
-		Assert(slot->status != PRFS_REQUESTED);
-		if (slot->status == PRFS_RECEIVED)
-		{
-			pfree(slot->response);
-		}
-	}
-
-	prfh_destroy(MyPState->prf_hash);
-	pfree(MyPState);
-	MyPState = newPState;
-}
-
-
-
-/*
- * Make sure that there are no responses still in the buffer.
- *
- * This function may indirectly update MyPState->pfs_hash; which invalidates
- * any active pointers into the hash table.
- */
-static void
-consume_prefetch_responses(void)
-{
-	if (MyPState->ring_receive < MyPState->ring_unused)
-		prefetch_wait_for(MyPState->ring_unused - 1);
-}
-
-static void
-prefetch_cleanup_trailing_unused(void)
-{
-	uint64		ring_index;
-	PrefetchRequest *slot;
-
-	while (MyPState->ring_last < MyPState->ring_receive)
-	{
-		ring_index = MyPState->ring_last;
-		slot = GetPrfSlot(ring_index);
-
-		if (slot->status == PRFS_UNUSED)
-			MyPState->ring_last += 1;
-		else
-			break;
-	}
-}
-
-
-static bool
-prefetch_flush_requests(void)
-{
-	for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++)
-	{
-		if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no))
-		{
-			if (!page_server->flush(shard_no))
-				return false;
-			BITMAP_CLR(MyPState->shard_bitmap, shard_no);
-		}
-	}
-	MyPState->max_shard_no = 0;
-	return true;
-}
-
-/*
- * Wait for slot of ring_index to have received its response.
- * The caller is responsible for making sure the request buffer is flushed.
- *
- * NOTE: this function may indirectly update MyPState->pfs_hash; which
- * invalidates any active pointers into the hash table.
- * NOTE: callers should make sure they can handle query cancellations in this
- * function's call path.
- */
-static bool
-prefetch_wait_for(uint64 ring_index)
-{
-	PrefetchRequest *entry;
-	bool		result = true;
-
-	if (MyPState->ring_flush <= ring_index &&
-		MyPState->ring_unused > MyPState->ring_flush)
-	{
-		if (!prefetch_flush_requests())
-			return false;
-		MyPState->ring_flush = MyPState->ring_unused;
-	}
-
-	Assert(MyPState->ring_unused > ring_index);
-
-	while (MyPState->ring_receive <= ring_index)
-	{
-		START_PREFETCH_RECEIVE_WORK();
-		entry = GetPrfSlot(MyPState->ring_receive);
-
-		Assert(entry->status == PRFS_REQUESTED);
-		if (!prefetch_read(entry))
-		{
-			result = false;
-			break;
-		}
-
-		END_PREFETCH_RECEIVE_WORK();
-		CHECK_FOR_INTERRUPTS();
-	}
-
-	return result;
-}
-
-/*
- * Read the response of a prefetch request into its slot.
- *
- * The caller is responsible for making sure that the request for this buffer
- * was flushed to the PageServer.
- *
- * NOTE: this function may indirectly update MyPState->pfs_hash; which
- * invalidates any active pointers into the hash table.
- *
- * NOTE: this does IO, and can get canceled out-of-line.
- */
-static bool
-prefetch_read(PrefetchRequest *slot)
-{
-	NeonResponse *response;
-	MemoryContext old;
-	BufferTag	buftag;
-	shardno_t	shard_no;
-	uint64		my_ring_index;
-
-	Assert(slot->status == PRFS_REQUESTED);
-	Assert(slot->response == NULL);
-	Assert(slot->my_ring_index == MyPState->ring_receive);
-
-	if (slot->status != PRFS_REQUESTED ||
-		slot->response != NULL ||
-		slot->my_ring_index != MyPState->ring_receive)
-		neon_shard_log(slot->shard_no, ERROR,
-					   "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
-					   slot->status, slot->response,
-					   (long)slot->my_ring_index, (long)MyPState->ring_receive);
-
-	/*
-	 * Copy the request info so that if an error happens and the prefetch
-	 * queue is flushed during the receive call, we can print the original
-	 * values in the error message
-	 */
-	buftag = slot->buftag;
-	shard_no = slot->shard_no;
-	my_ring_index = slot->my_ring_index;
-
-	old = MemoryContextSwitchTo(MyPState->errctx);
-	response = (NeonResponse *) page_server->receive(shard_no);
-	MemoryContextSwitchTo(old);
-	if (response)
-	{
-		/* The slot should still be valid */
-		if (slot->status != PRFS_REQUESTED ||
-			slot->response != NULL ||
-			slot->my_ring_index != MyPState->ring_receive)
-			neon_shard_log(shard_no, ERROR,
-						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
-						   slot->status, slot->response,
-						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
-
-		/* update prefetch state */
-		MyPState->n_responses_buffered += 1;
-		MyPState->n_requests_inflight -= 1;
-		MyPState->ring_receive += 1;
-		MyNeonCounters->getpage_prefetches_buffered =
-			MyPState->n_responses_buffered;
-
-		/* update slot state */
-		slot->status = PRFS_RECEIVED;
-		slot->response = response;
-
-		if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result)
-		{
-			/*
-			 * Store prefetched result in LFC (please read comments to lfc_prefetch
-			 * explaining why it can be done without holding shared buffer lock
-			 */
-			if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since))
-			{
-				slot->flags |= PRFSF_LFC;
-			}
-		}
-		return true;
-	}
-	else
-	{
-		/*
-		 * Note: The slot might no longer be valid, if the connection was lost
-		 * and the prefetch queue was flushed during the receive call
-		 */
-		neon_shard_log(shard_no, LOG,
-					   "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
-					   (long) my_ring_index,
-					   RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
-					   buftag.forkNum, buftag.blockNum);
-		return false;
-	}
-}
-
-/*
- * Disconnect hook - drop prefetches when the connection drops
- *
- * If we don't remove the failed prefetches, we'd be serving incorrect
- * data to the smgr.
- */
-void
-prefetch_on_ps_disconnect(void)
-{
-	MyPState->ring_flush = MyPState->ring_unused;
-
-	while (MyPState->ring_receive < MyPState->ring_unused)
-	{
-		PrefetchRequest *slot;
-		uint64		ring_index = MyPState->ring_receive;
-
-		slot = GetPrfSlot(ring_index);
-
-		Assert(slot->status == PRFS_REQUESTED);
-		Assert(slot->my_ring_index == ring_index);
-
-		/*
-		 * Drop connection to all shards which have prefetch requests.
-		 * It is not a problem to call disconnect multiple times on the same connection
-		 * because disconnect implementation in libpagestore.c will check if connection
-		 * is alive and do nothing of connection was already dropped.
-		 */
-		page_server->disconnect(slot->shard_no);
-
-		/* clean up the request */
-		slot->status = PRFS_TAG_REMAINS;
-		MyPState->n_requests_inflight -= 1;
-		MyPState->ring_receive += 1;
-
-		prefetch_set_unused(ring_index);
-		pgBufferUsage.prefetch.expired += 1;
-		MyNeonCounters->getpage_prefetch_discards_total += 1;
-	}
-
-	/*
-	 * We can have gone into retry due to network error, so update stats with
-	 * the latest available
-	 */
-	MyNeonCounters->pageserver_open_requests =
-		MyPState->n_requests_inflight;
-	MyNeonCounters->getpage_prefetches_buffered =
-		MyPState->n_responses_buffered;
-}
-
-/*
- * prefetch_set_unused() - clear a received prefetch slot
- *
- * The slot at ring_index must be a current member of the ring buffer,
- * and may not be in the PRFS_REQUESTED state.
- *
- * NOTE: this function will update MyPState->pfs_hash; which invalidates any
- * active pointers into the hash table.
- */
-static inline void
-prefetch_set_unused(uint64 ring_index)
-{
-	PrefetchRequest *slot;
-
-	if (ring_index < MyPState->ring_last)
-		return;					/* Should already be unused */
-
-	slot = GetPrfSlot(ring_index);
-	if (slot->status == PRFS_UNUSED)
-		return;
-
-	Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS);
-
-	if (slot->status == PRFS_RECEIVED)
-	{
-		pfree(slot->response);
-		slot->response = NULL;
-
-		MyPState->n_responses_buffered -= 1;
-		MyPState->n_unused += 1;
-
-		MyNeonCounters->getpage_prefetches_buffered =
-			MyPState->n_responses_buffered;
-	}
-	else
-	{
-		Assert(slot->response == NULL);
-	}
-
-	prfh_delete(MyPState->prf_hash, slot);
-
-	/* clear all fields */
-	MemSet(slot, 0, sizeof(PrefetchRequest));
-	slot->status = PRFS_UNUSED;
-
-	/* run cleanup if we're holding back ring_last */
-	if (MyPState->ring_last == ring_index)
-		prefetch_cleanup_trailing_unused();
-
-	/*
-	 * ... and try to store the buffered responses more compactly if > 12.5%
-	 * of the buffer is gaps
-	 */
-	else if (ReceiveBufferNeedsCompaction())
-		compact_prefetch_buffers();
-}
-
-/*
- * Send one prefetch request to the pageserver. To wait for the response, call
- * prefetch_wait_for().
- */
-static void
-prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
-{
-	bool		found;
-	uint64		mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index;
-
-	NeonGetPageRequest request = {
-		.hdr.tag = T_NeonGetPageRequest,
-		.hdr.reqid = GENERATE_REQUEST_ID(),
-		/* lsn and not_modified_since are filled in below */
-		.rinfo = BufTagGetNRelFileInfo(slot->buftag),
-		.forknum = slot->buftag.forkNum,
-		.blkno = slot->buftag.blockNum,
-	};
-
-	Assert(mySlotNo == MyPState->ring_unused);
-
-	slot->reqid = request.hdr.reqid;
-
-	if (force_request_lsns)
-		slot->request_lsns = *force_request_lsns;
-	else
-		neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
-							  slot->buftag.forkNum, slot->buftag.blockNum,
-							  &slot->request_lsns, 1);
-	request.hdr.lsn = slot->request_lsns.request_lsn;
-	request.hdr.not_modified_since = slot->request_lsns.not_modified_since;
-
-	Assert(slot->response == NULL);
-	Assert(slot->my_ring_index == MyPState->ring_unused);
-
-	while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
-	{
-		Assert(mySlotNo == MyPState->ring_unused);
-		/* loop */
-	}
-
-	/* update prefetch state */
-	MyPState->n_requests_inflight += 1;
-	MyPState->n_unused -= 1;
-	MyPState->ring_unused += 1;
-	BITMAP_SET(MyPState->shard_bitmap, slot->shard_no);
-	MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);
-
-	/* update slot state */
-	slot->status = PRFS_REQUESTED;
-	prfh_insert(MyPState->prf_hash, slot, &found);
-	Assert(!found);
-}
-
-/*
- * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted.
- * Present pages are marked in "mask" bitmap and total number of such pages is returned.
- */
-static int
-prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, neon_request_lsns *lsns,
-				 BlockNumber nblocks, void **buffers, bits8 *mask)
-{
-	int hits = 0;
-	PrefetchRequest hashkey;
-
-	/*
-	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
-	 * correct alignment and that the padding bytes are cleared.
-	 */
-	memset(&hashkey.buftag, 0, sizeof(BufferTag));
-	CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo);
-	hashkey.buftag.forkNum = forknum;
-
-	for (int i = 0; i < nblocks; i++)
-	{
-		PrfHashEntry *entry;
-
-		hashkey.buftag.blockNum = blocknum + i;
-		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
-
-		if (entry != NULL)
-		{
-			PrefetchRequest *slot = entry->slot;
-			uint64 ring_index = slot->my_ring_index;
-			Assert(slot == GetPrfSlot(ring_index));
-
-			Assert(slot->status != PRFS_UNUSED);
-			Assert(MyPState->ring_last <= ring_index &&
-				   ring_index < MyPState->ring_unused);
-			Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag));
-
-			if (slot->status != PRFS_RECEIVED)
-				continue;
-
-			/*
-			 * If the caller specified a request LSN to use, only accept
-			 * prefetch responses that satisfy that request.
-			 */
-			if (!neon_prefetch_response_usable(&lsns[i], slot))
-				continue;
-
-			/*
-			 * Ignore errors
-			 */
-			if (slot->response->tag != T_NeonGetPageResponse)
-			{
-				if (slot->response->tag != T_NeonErrorResponse)
-				{
-					NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
-											"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
-											T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag);
-				}
-				continue;
-			}
-			memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ);
-
-
-			/*
-			 * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received
-			 * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here
-			 * under buffer lock.
-			 */
-			if (!lfc_store_prefetch_result)
-				lfc_write(rinfo, forknum, blocknum + i, buffers[i]);
-
-			prefetch_set_unused(ring_index);
-			BITMAP_SET(mask, i);
-
-			hits += 1;
-			inc_getpage_wait(0);
-		}
-	}
-	pgBufferUsage.prefetch.hits += hits;
-	return hits;
-}
-
-#if PG_MAJORVERSION_NUM < 17
-static bool
-prefetch_lookup(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkn, neon_request_lsns *lsns, void *buffer)
-{
-	bits8 present = 0;
-	return prefetch_lookupv(rinfo, forkNum, blkn, lsns, 1, &buffer, &present) != 0;
-}
-#endif
-
-/*
- * prefetch_register_bufferv() - register and prefetch buffers
- *
- * Register that we may want the contents of BufferTag in the near future.
- * This is used when issuing a speculative prefetch request, but also when
- * performing a synchronous request and need the buffer right now.
- *
- * If force_request_lsns is not NULL, those values are sent to the
- * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
- * to calculate the LSNs to send.
- *
- * Bits set in *mask (if present) indicate pages already read; i.e. pages we
- * can skip in this process.
- *
- * When performing a prefetch rather than a synchronous request,
- * is_prefetch==true. Currently, it only affects how the request is accounted
- * in the perf counters.
- *
- * NOTE: this function may indirectly update MyPState->pfs_hash; which
- * invalidates any active pointers into the hash table.
- */
-static uint64
-prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
-						  BlockNumber nblocks, const bits8 *mask,
-						  bool is_prefetch)
-{
-	uint64		min_ring_index;
-	PrefetchRequest hashkey;
-#ifdef USE_ASSERT_CHECKING
-	bool		any_hits = false;
-#endif
-	/* We will never read further ahead than our buffer can store. */
-	nblocks = Max(1, Min(nblocks, readahead_buffer_size));
-
-	/*
-	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
-	 * correct alignment and that the padding bytes are cleared.
-	 */
-	memset(&hashkey.buftag, 0, sizeof(BufferTag));
-	hashkey.buftag = tag;
-
-Retry:
-	/*
-	 * We can have gone into retry due to network error, so update stats with
-	 * the latest available
-	 */
-	MyNeonCounters->pageserver_open_requests =
-		MyPState->ring_unused - MyPState->ring_receive;
-	MyNeonCounters->getpage_prefetches_buffered =
-		MyPState->n_responses_buffered;
-
-	min_ring_index = UINT64_MAX;
-	for (int i = 0; i < nblocks; i++)
-	{
-		PrefetchRequest *slot = NULL;
-		PrfHashEntry *entry = NULL;
-		uint64		ring_index;
-		neon_request_lsns *lsns;
-
-		if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
-			continue;
-
-		if (frlsns)
-			lsns = &frlsns[i];
-		else
-			lsns = NULL;
-
-#ifdef USE_ASSERT_CHECKING
-		any_hits = true;
-#endif
-
-		slot = NULL;
-		entry = NULL;
-
-		hashkey.buftag.blockNum = tag.blockNum + i;
-		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
-
-		if (entry != NULL)
-		{
-			slot = entry->slot;
-			ring_index = slot->my_ring_index;
-			Assert(slot == GetPrfSlot(ring_index));
-
-			Assert(slot->status != PRFS_UNUSED);
-			Assert(MyPState->ring_last <= ring_index &&
-				   ring_index < MyPState->ring_unused);
-			Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag));
-
-			/*
-			 * If the caller specified a request LSN to use, only accept
-			 * prefetch responses that satisfy that request.
-			 */
-			if (lsns)
-			{
-				if (!neon_prefetch_response_usable(lsns, slot))
-				{
-					/* Wait for the old request to finish and discard it */
-					if (!prefetch_wait_for(ring_index))
-						goto Retry;
-					prefetch_set_unused(ring_index);
-					entry = NULL;
-					slot = NULL;
-					pgBufferUsage.prefetch.expired += 1;
-					MyNeonCounters->getpage_prefetch_discards_total += 1;
-				}
-			}
-
-			if (entry != NULL)
-			{
-				/*
-				 * We received a prefetch for a page that was recently read
-				 * and removed from the buffers. Remove that request from the
-				 * buffers.
-				 */
-				if (slot->status == PRFS_TAG_REMAINS)
-				{
-					prefetch_set_unused(ring_index);
-					entry = NULL;
-					slot = NULL;
-				}
-				else
-				{
-					min_ring_index = Min(min_ring_index, ring_index);
-					/* The buffered request is good enough, return that index */
-					if (is_prefetch)
-						pgBufferUsage.prefetch.duplicates++;
-					continue;
-				}
-			}
-		}
-		else if (!is_prefetch)
-		{
-			pgBufferUsage.prefetch.misses += 1;
-			MyNeonCounters->getpage_prefetch_misses_total++;
-		}
-		/*
-		 * We can only leave the block above by finding that there's
-		 * no entry that can satisfy this request, either because there
-		 * was no entry, or because the entry was invalid or didn't satisfy
-		 * the LSNs provided.
-		 *
-		 * The code should've made sure to clear up the data.
-		 */
-		Assert(entry == NULL);
-		Assert(slot == NULL);
-
-		/* There should be no buffer overflow */
-		Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused);
-
-		/*
-		 * If the prefetch queue is full, we need to make room by clearing the
-		 * oldest slot. If the oldest slot holds a buffer that was already
-		 * received, we can just throw it away; we fetched the page
-		 * unnecessarily in that case. If the oldest slot holds a request that
-		 * we haven't received a response for yet, we have to wait for the
-		 * response to that before we can continue. We might not have even
-		 * flushed the request to the pageserver yet, it might be just sitting
-		 * in the output buffer. In that case, we flush it and wait for the
-		 * response. (We could decide not to send it, but it's hard to abort
-		 * when the request is already in the output buffer, and 'not sending'
-		 * a prefetch request kind of goes against the principles of
-		 * prefetching)
-		 */
-		if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused)
-		{
-			uint64		cleanup_index = MyPState->ring_last;
-
-			slot = GetPrfSlot(cleanup_index);
-
-			Assert(slot->status != PRFS_UNUSED);
-
-			/*
-			 * If there is good reason to run compaction on the prefetch buffers,
-			 * try to do that.
-			 */
-			if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers())
-			{
-				Assert(slot->status == PRFS_UNUSED);
-			}
-			else
-			{
-				/*
-				 * We have the slot for ring_last, so that must still be in
-				 * progress
-				 */
-				switch (slot->status)
-				{
-					case PRFS_REQUESTED:
-						Assert(MyPState->ring_receive == cleanup_index);
-						if (!prefetch_wait_for(cleanup_index))
-							goto Retry;
-						prefetch_set_unused(cleanup_index);
-						pgBufferUsage.prefetch.expired += 1;
-						MyNeonCounters->getpage_prefetch_discards_total += 1;
-						break;
-					case PRFS_RECEIVED:
-					case PRFS_TAG_REMAINS:
-						prefetch_set_unused(cleanup_index);
-						pgBufferUsage.prefetch.expired += 1;
-						MyNeonCounters->getpage_prefetch_discards_total += 1;
-						break;
-					default:
-						pg_unreachable();
-				}
-			}
-		}
-
-		/*
-		 * The next buffer pointed to by `ring_unused` is now definitely empty, so
-		 * we can insert the new request to it.
-		 */
-		ring_index = MyPState->ring_unused;
-
-		Assert(MyPState->ring_last <= ring_index &&
-			   ring_index <= MyPState->ring_unused);
-
-		slot = GetPrfSlotNoCheck(ring_index);
-
-		Assert(slot->status == PRFS_UNUSED);
-
-		/*
-		 * We must update the slot data before insertion, because the hash
-		 * function reads the buffer tag from the slot.
-		 */
-		slot->buftag = hashkey.buftag;
-		slot->shard_no = get_shard_number(&tag);
-		slot->my_ring_index = ring_index;
-		slot->flags = 0;
-
-		min_ring_index = Min(min_ring_index, ring_index);
-
-		if (is_prefetch)
-			MyNeonCounters->getpage_prefetch_requests_total++;
-		else
-			MyNeonCounters->getpage_sync_requests_total++;
-
-		prefetch_do_request(slot, lsns);
-	}
-
-	MyNeonCounters->pageserver_open_requests =
-		MyPState->ring_unused - MyPState->ring_receive;
-
-	Assert(any_hits);
-
-	Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED ||
-		   GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED);
-	Assert(MyPState->ring_last <= min_ring_index &&
-		   min_ring_index < MyPState->ring_unused);
-
-	if (flush_every_n_requests > 0 &&
-		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
-	{
-		if (!prefetch_flush_requests())
-		{
-			/*
-			 * Prefetch set is reset in case of error, so we should try to
-			 * register our request once again
-			 */
-			goto Retry;
-		}
-		MyPState->ring_flush = MyPState->ring_unused;
-	}
-
-	return min_ring_index;
-}
-
-static bool
-equal_requests(NeonRequest* a, NeonRequest* b)
-{
-	return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since;
-}
-
-
-/*
- * Note: this function can get canceled and use a long jump to the next catch
- * context. Take care.
- */
-static NeonResponse *
-page_server_request(void const *req)
-{
-	NeonResponse *resp;
-	BufferTag tag = {0};
-	shardno_t shard_no;
-
-	switch (messageTag(req))
-	{
-		case T_NeonExistsRequest:
-			CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
-			break;
-		case T_NeonNblocksRequest:
-			CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
-			break;
-		case T_NeonDbSizeRequest:
-			NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
-			break;
-		case T_NeonGetPageRequest:
-			CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
-			tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
-			break;
-		default:
-			neon_log(ERROR, "Unexpected request tag: %d", messageTag(req));
-	}
-	shard_no = get_shard_number(&tag);
-
-	/*
-	 * Current sharding model assumes that all metadata is present only at shard 0.
-	 * We still need to call get_shard_no() to check if shard map is up-to-date.
-	 */
-	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest)
-	{
-		shard_no = 0;
-	}
-
-	do
-	{
-		PG_TRY();
-		{
-			while (!page_server->send(shard_no, (NeonRequest *) req)
-				   || !page_server->flush(shard_no))
-			{
-				/* do nothing */
-			}
-			MyNeonCounters->pageserver_open_requests++;
-			consume_prefetch_responses();
-			resp = page_server->receive(shard_no);
-			MyNeonCounters->pageserver_open_requests--;
-		}
-		PG_CATCH();
-		{
-			/*
-			 * Cancellation in this code needs to be handled better at some
-			 * point, but this currently seems fine for now.
-			 */
-			page_server->disconnect(shard_no);
-			MyNeonCounters->pageserver_open_requests = 0;
-
-			/*
-			 * We know for sure we're not working on any prefetch pages after
-			 * this.
-			 */
-			END_PREFETCH_RECEIVE_WORK();
-
-			PG_RE_THROW();
-		}
-		PG_END_TRY();
-
-	} while (resp == NULL);
-
-	return resp;
-}
-
-
-StringInfoData
-nm_pack_request(NeonRequest *msg)
-{
-	StringInfoData s;
-
-	initStringInfo(&s);
-
-	pq_sendbyte(&s, msg->tag);
-	if (neon_protocol_version >= 3)
-	{
-		pq_sendint64(&s, msg->reqid);
-	}
-	pq_sendint64(&s, msg->lsn);
-	pq_sendint64(&s, msg->not_modified_since);
-
-	switch (messageTag(msg))
-	{
-			/* pagestore_client -> pagestore */
-		case T_NeonExistsRequest:
-			{
-				NeonExistsRequest *msg_req = (NeonExistsRequest *) msg;
-
-				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
-				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
-				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
-				pq_sendbyte(&s, msg_req->forknum);
-
-				break;
-			}
-		case T_NeonNblocksRequest:
-			{
-				NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg;
-
-				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
-				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
-				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
-				pq_sendbyte(&s, msg_req->forknum);
-
-				break;
-			}
-		case T_NeonDbSizeRequest:
-			{
-				NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;
-
-				pq_sendint32(&s, msg_req->dbNode);
-
-				break;
-			}
-		case T_NeonGetPageRequest:
-			{
-				NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg;
-
-				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
-				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
-				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
-				pq_sendbyte(&s, msg_req->forknum);
-				pq_sendint32(&s, msg_req->blkno);
-
-				break;
-			}
-
-		case T_NeonGetSlruSegmentRequest:
-			{
-				NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
-
-				pq_sendbyte(&s, msg_req->kind);
-				pq_sendint32(&s, msg_req->segno);
-
-				break;
-			}
-
-			/* pagestore -> pagestore_client. We never need to create these. */
-		case T_NeonExistsResponse:
-		case T_NeonNblocksResponse:
-		case T_NeonGetPageResponse:
-		case T_NeonErrorResponse:
-		case T_NeonDbSizeResponse:
-		case T_NeonGetSlruSegmentResponse:
-		default:
-			neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
-			break;
-	}
-	return s;
-}
-
-NeonResponse *
-nm_unpack_response(StringInfo s)
-{
-	NeonMessageTag tag = pq_getmsgbyte(s);
-	NeonResponse resp_hdr = {0}; /* make valgrind happy */
-	NeonResponse *resp = NULL;
-
-	resp_hdr.tag = tag;
-	if (neon_protocol_version >= 3)
-	{
-		resp_hdr.reqid = pq_getmsgint64(s);
-		resp_hdr.lsn = pq_getmsgint64(s);
-		resp_hdr.not_modified_since = pq_getmsgint64(s);
-	}
-	switch (tag)
-	{
-			/* pagestore -> pagestore_client */
-		case T_NeonExistsResponse:
-			{
-				NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse));
-
-				if (neon_protocol_version >= 3)
-				{
-					NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					msg_resp->req.forknum = pq_getmsgbyte(s);
-				}
-				msg_resp->req.hdr = resp_hdr;
-				msg_resp->exists = pq_getmsgbyte(s);
-				pq_getmsgend(s);
-
-				resp = (NeonResponse *) msg_resp;
-				break;
-			}
-
-		case T_NeonNblocksResponse:
-			{
-				NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse));
-
-				if (neon_protocol_version >= 3)
-				{
-					NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					msg_resp->req.forknum = pq_getmsgbyte(s);
-				}
-				msg_resp->req.hdr = resp_hdr;
-				msg_resp->n_blocks = pq_getmsgint(s, 4);
-				pq_getmsgend(s);
-
-				resp = (NeonResponse *) msg_resp;
-				break;
-			}
-
-		case T_NeonGetPageResponse:
-			{
-				NeonGetPageResponse *msg_resp;
-
-				msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE);
-				if (neon_protocol_version >= 3)
-				{
-					NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					msg_resp->req.forknum = pq_getmsgbyte(s);
-					msg_resp->req.blkno = pq_getmsgint(s, 4);
-				}
-				msg_resp->req.hdr = resp_hdr;
-				/* XXX:	should be varlena */
-				memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
-				pq_getmsgend(s);
-
-				Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse);
-
-				resp = (NeonResponse *) msg_resp;
-				break;
-			}
-
-		case T_NeonDbSizeResponse:
-			{
-				NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse));
-
-				if (neon_protocol_version >= 3)
-				{
-					msg_resp->req.dbNode = pq_getmsgint(s, 4);
-				}
-				msg_resp->req.hdr = resp_hdr;
-				msg_resp->db_size = pq_getmsgint64(s);
-				pq_getmsgend(s);
-
-				resp = (NeonResponse *) msg_resp;
-				break;
-			}
-
-		case T_NeonErrorResponse:
-			{
-				NeonErrorResponse *msg_resp;
-				size_t		msglen;
-				const char *msgtext;
-
-				msgtext = pq_getmsgrawstring(s);
-				msglen = strlen(msgtext);
-
-				msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1);
-				msg_resp->req = resp_hdr;
-				memcpy(msg_resp->message, msgtext, msglen + 1);
-				pq_getmsgend(s);
-
-				resp = (NeonResponse *) msg_resp;
-				break;
-			}
-
-		case T_NeonGetSlruSegmentResponse:
-		    {
-				NeonGetSlruSegmentResponse *msg_resp;
-				int n_blocks;
-				msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse));
-
-				if (neon_protocol_version >= 3)
-				{
-					msg_resp->req.kind = pq_getmsgbyte(s);
-					msg_resp->req.segno = pq_getmsgint(s, 4);
-				}
-				msg_resp->req.hdr = resp_hdr;
-
-				n_blocks = pq_getmsgint(s, 4);
-				msg_resp->n_blocks = n_blocks;
-				memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ);
-				pq_getmsgend(s);
-
-				resp = (NeonResponse *) msg_resp;
-				break;
-			}
-
-			/*
-			 * pagestore_client -> pagestore
-			 *
-			 * We create these ourselves, and don't need to decode them.
-			 */
-		case T_NeonExistsRequest:
-		case T_NeonNblocksRequest:
-		case T_NeonGetPageRequest:
-		case T_NeonDbSizeRequest:
-		case T_NeonGetSlruSegmentRequest:
-		default:
-			neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
-			break;
-	}
-
-	return resp;
-}
-
-/* dump to json for debugging / error reporting purposes */
-char *
-nm_to_string(NeonMessage *msg)
-{
-	StringInfoData s;
-
-	initStringInfo(&s);
-
-	switch (messageTag(msg))
-	{
-			/* pagestore_client -> pagestore */
-		case T_NeonExistsRequest:
-			{
-				NeonExistsRequest *msg_req = (NeonExistsRequest *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\"");
-				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
-				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
-				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-
-		case T_NeonNblocksRequest:
-			{
-				NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\"");
-				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
-				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
-				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-
-		case T_NeonGetPageRequest:
-			{
-				NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\"");
-				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
-				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
-				appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
-				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-		case T_NeonDbSizeRequest:
-			{
-				NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
-				appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode);
-				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-		case T_NeonGetSlruSegmentRequest:
-			{
-				NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\"");
-				appendStringInfo(&s, ", \"kind\": %u", msg_req->kind);
-				appendStringInfo(&s, ", \"segno\": %u", msg_req->segno);
-				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-			/* pagestore -> pagestore_client */
-		case T_NeonExistsResponse:
-			{
-				NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\"");
-				appendStringInfo(&s, ", \"exists\": %d}",
-								 msg_resp->exists);
-				appendStringInfoChar(&s, '}');
-
-				break;
-			}
-		case T_NeonNblocksResponse:
-			{
-				NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\"");
-				appendStringInfo(&s, ", \"n_blocks\": %u}",
-								 msg_resp->n_blocks);
-				appendStringInfoChar(&s, '}');
-
-				break;
-			}
-		case T_NeonGetPageResponse:
-			{
-#if 0
-				NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg;
-#endif
-
-				appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\"");
-				appendStringInfo(&s, ", \"page\": \"XXX\"}");
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-		case T_NeonErrorResponse:
-			{
-				NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg;
-
-				/* FIXME: escape double-quotes in the message */
-				appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\"");
-				appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message);
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-		case T_NeonDbSizeResponse:
-			{
-				NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\"");
-				appendStringInfo(&s, ", \"db_size\": %ld}",
-								 msg_resp->db_size);
-				appendStringInfoChar(&s, '}');
-
-				break;
-			}
-		case T_NeonGetSlruSegmentResponse:
-			{
-				NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\"");
-				appendStringInfo(&s, ", \"n_blocks\": %u}",
-								 msg_resp->n_blocks);
-				appendStringInfoChar(&s, '}');
-
-				break;
-			}
-
-		default:
-			appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag);
-	}
-	return s.data;
-}
-
 /*
  * Wrapper around log_newpage() that makes a temporary copy of the block and
  * WAL-logs that. This makes it safe to use while holding only a shared lock
@@ -2149,11 +461,6 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 static void
 neon_init(void)
 {
-	Size		prfs_size;
-
-	if (MyPState != NULL)
-		return;
-
 	/*
 	 * Sanity check that theperf counters array is sized correctly. We got
 	 * this wrong once, and the formula for max number of backends and aux
@@ -2168,27 +475,6 @@ neon_init(void)
 		elog(ERROR, "MyNeonCounters points past end of array");
 #endif
 
-	prfs_size = offsetof(PrefetchState, prf_buffer) +
-		sizeof(PrefetchRequest) * readahead_buffer_size;
-
-	MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
-
-	MyPState->n_unused = readahead_buffer_size;
-
-	MyPState->bufctx = SlabContextCreate(TopMemoryContext,
-										 "NeonSMGR/prefetch",
-										 SLAB_DEFAULT_BLOCK_SIZE * 17,
-										 PS_GETPAGERESPONSE_SIZE);
-	MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
-											 "NeonSMGR/errors",
-											 ALLOCSET_DEFAULT_SIZES);
-	MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
-											  "NeonSMGR/prefetch",
-											  ALLOCSET_DEFAULT_SIZES);
-
-	MyPState->prf_hash = prfh_create(MyPState->hashctx,
-									 readahead_buffer_size, NULL);
-
 	old_redo_read_buffer_filter = redo_read_buffer_filter;
 	redo_read_buffer_filter = neon_redo_read_buffer_filter;
 
@@ -2225,8 +511,10 @@ nm_adjust_lsn(XLogRecPtr lsn)
 
 /*
  * Return LSN for requesting pages and number of blocks from page server
+ *
+ * XXX: exposed so that prefetch_do_request() can call back here.
  */
-static void
+void
 neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 					  neon_request_lsns *output, BlockNumber nblocks)
 {
@@ -2429,112 +717,12 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 	}
 }
 
-/*
- *  neon_prefetch_response_usable -- Can a new request be satisfied by old one?
- *
- * This is used to check if the response to a prefetch request can be used to
- * satisfy a page read now.
- */
-static bool
-neon_prefetch_response_usable(neon_request_lsns *request_lsns,
-							  PrefetchRequest *slot)
-{
-	/* sanity check the LSN's on the old and the new request */
-	Assert(request_lsns->request_lsn >= request_lsns->not_modified_since);
-	Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since);
-	Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn);
-	Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
-	Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
-	Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn);
-	Assert(slot->status != PRFS_UNUSED);
-
-	/*
-	 * The new request's LSN should never be older than the old one.  This
-	 * could be an Assert, except that for testing purposes, we do provide an
-	 * interface in neon_test_utils to fetch pages at arbitary LSNs, which
-	 * violates this.
-	 *
-	 * Similarly, the not_modified_since value calculated for a page should
-	 * never move backwards. This assumption is a bit fragile; if we updated
-	 * the last-written cache when we read in a page, for example, then it
-	 * might. But as the code stands, it should not.
-	 *
-	 * (If two backends issue a request at the same time, they might race and
-	 * calculate LSNs "out of order" with each other, but the prefetch queue
-	 * is backend-private at the moment.)
-	 */
-	if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn ||
-		request_lsns->not_modified_since < slot->request_lsns.not_modified_since)
-	{
-		ereport(LOG,
-				(errcode(ERRCODE_IO_ERROR),
-				 errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
-				 errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
-						   LSN_FORMAT_ARGS(request_lsns->effective_request_lsn),
-						   LSN_FORMAT_ARGS(request_lsns->not_modified_since),
-						   LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn),
-						   LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since))));
-		return false;
-	}
-
-	/*---
-	 * Each request to the pageserver has three LSN values associated with it:
-	 * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'.
-	 * `not_modified_since` and `request_lsn` are sent to the pageserver, but
-	 * in the primary node, we always use UINT64_MAX as the `request_lsn`, so
-	 * we remember `effective_request_lsn` separately. In a primary,
-	 * `effective_request_lsn` is the same as  `not_modified_since`.
-	 * See comments in neon_get_request_lsns why we can not use last flush WAL position here.
-	 *
-	 * To determine whether a response to a GetPage request issued earlier is
-	 * still valid to satisfy a new page read, we look at the
-	 * (not_modified_since, effective_request_lsn] range of the request. It is
-	 * effectively a claim that the page has not been modified between those
-	 * LSNs.  If the range of the old request in the queue overlaps with the
-	 * new request, we know that the page hasn't been modified in the union of
-	 * the ranges. We can use the response to old request to satisfy the new
-	 * request in that case. For example:
-	 *
-	 *              100      500
-	 * Old request:  +--------+
-	 *
-	 *                     400      800
-	 * New request:         +--------+
-	 *
-	 * The old request claims that the page was not modified between LSNs 100
-	 * and 500, and the second claims that it was not modified between 400 and
-	 * 800. Together they mean that the page was not modified between 100 and
-	 * 800. Therefore the response to the old request is also valid for the
-	 * new request.
-	 *
-	 * This logic also holds at the boundary case that the old request's LSN
-	 * matches the new request's not_modified_since LSN exactly:
-	 *
-	 *              100      500
-	 * Old request:  +--------+
-	 *
-	 *                       500      900
-	 * New request:           +--------+
-	 *
-	 * The response to the old request is the page as it was at LSN 500, and
-	 * the page hasn't been changed in the range (500, 900], therefore the
-	 * response is valid also for the new request.
-	 */
-
-	/* this follows from the checks above */
-	Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since);
-
-	return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn;
-}
-
 /*
  *	neon_exists() -- Does the physical file exist?
  */
 static bool
 neon_exists(SMgrRelation reln, ForkNumber forkNum)
 {
-	bool		exists;
-	NeonResponse *resp;
 	BlockNumber n_blocks;
 	neon_request_lsns request_lsns;
 
@@ -2593,67 +781,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 
 	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
 						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
-	{
-		NeonExistsRequest request = {
-			.hdr.tag = T_NeonExistsRequest,
-			.hdr.reqid = GENERATE_REQUEST_ID(),
-			.hdr.lsn = request_lsns.request_lsn,
-			.hdr.not_modified_since = request_lsns.not_modified_since,
-			.rinfo = InfoFromSMgrRel(reln),
-			.forknum = forkNum
-		};
 
-		resp = page_server_request(&request);
-
-		switch (resp->tag)
-		{
-			case T_NeonExistsResponse:
-			{
-				NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp;
-				if (neon_protocol_version >= 3)
-				{
-					if (!equal_requests(resp, &request.hdr) ||
-						!RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) ||
-						exists_resp->req.forknum != request.forknum)
-					{
-						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
-													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
-													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
-					}
-				}
-				exists = exists_resp->exists;
-				break;
-			}
-			case T_NeonErrorResponse:
-				if (neon_protocol_version >= 3)
-				{
-					if (!equal_requests(resp, &request.hdr))
-					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
-							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
-							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
-					}
-				}
-				ereport(ERROR,
-						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
-								resp->reqid,
-								RelFileInfoFmt(InfoFromSMgrRel(reln)),
-								forkNum,
-								LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
-						 errdetail("page server returned error: %s",
-								   ((NeonErrorResponse *) resp)->message)));
-				break;
-
-			default:
-				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-											"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
-											T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
-		}
-		pfree(resp);
-	}
-	return exists;
+	return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
 }
 
 /*
@@ -3002,7 +1131,6 @@ static bool
 neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			  int nblocks)
 {
-	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;
 	BufferTag	tag;
 
 	switch (reln->smgr_relpersistence)
@@ -3039,17 +1167,13 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 		tag.blockNum = blocknum;
 
-		ring_index = prefetch_register_bufferv(tag, NULL, iterblocks,
-											   lfc_present, true);
+		communicator_prefetch_register_bufferv(tag, NULL, iterblocks, lfc_present);
 
 		nblocks -= iterblocks;
 		blocknum += iterblocks;
-
-		Assert(ring_index < MyPState->ring_unused &&
-			   MyPState->ring_last <= ring_index);
 	}
 
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 	return false;
 }
@@ -3062,7 +1186,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 static bool
 neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
-	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;
 	BufferTag	tag;
 
 	switch (reln->smgr_relpersistence)
@@ -3087,12 +1210,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
 	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
 
-	ring_index = prefetch_register_bufferv(tag, NULL, 1, NULL, true);
+	communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
 
-	Assert(ring_index < MyPState->ring_unused &&
-		   MyPState->ring_last <= ring_index);
-
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 	return false;
 }
@@ -3136,7 +1256,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 	 */
 	neon_log(SmgrTrace, "writeback noop");
 
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -3144,208 +1264,6 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 #endif
 }
 
-/*
- * Read N pages at a specific LSN.
- *
- * *mask is set for pages read at a previous point in time, and which we
- * should not touch, nor overwrite.
- * New bits should be set in *mask for the pages we'successfully read.
- *
- * The offsets in request_lsns, buffers, and mask are linked.
- */
-static void
-neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns,
-				  void **buffers, BlockNumber nblocks, const bits8 *mask)
-{
-	NeonResponse *resp;
-	uint64		ring_index;
-	PrfHashEntry *entry;
-	PrefetchRequest *slot;
-	PrefetchRequest hashkey;
-
-	Assert(PointerIsValid(request_lsns));
-	Assert(nblocks >= 1);
-
-	/*
-	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
-	 * correct alignment and that the padding bytes are cleared.
-	 */
-	memset(&hashkey.buftag, 0, sizeof(BufferTag));
-	CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo);
-	hashkey.buftag.forkNum = forkNum;
-	hashkey.buftag.blockNum = base_blockno;
-
-	/*
-	 * The redo process does not lock pages that it needs to replay but are
-	 * not in the shared buffers, so a concurrent process may request the page
-	 * after redo has decided it won't redo that page and updated the LwLSN
-	 * for that page. If we're in hot standby we need to take care that we
-	 * don't return until after REDO has finished replaying up to that LwLSN,
-	 * as the page should have been locked up to that point.
-	 *
-	 * See also the description on neon_redo_read_buffer_filter below.
-	 *
-	 * NOTE: It is possible that the WAL redo process will still do IO due to
-	 * concurrent failed read IOs. Those IOs should never have a request_lsn
-	 * that is as large as the WAL record we're currently replaying, if it
-	 * weren't for the behaviour of the LwLsn cache that uses the highest
-	 * value of the LwLsn cache when the entry is not found.
-	 */
-	prefetch_register_bufferv(hashkey.buftag, request_lsns, nblocks, mask, false);
-
-	for (int i = 0; i < nblocks; i++)
-	{
-		void	   *buffer = buffers[i];
-		BlockNumber blockno = base_blockno + i;
-		neon_request_lsns *reqlsns = &request_lsns[i];
-		TimestampTz		start_ts, end_ts;
-
-		if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
-			continue;
-
-		start_ts = GetCurrentTimestamp();
-
-		if (RecoveryInProgress() && MyBackendType != B_STARTUP)
-			XLogWaitForReplayOf(reqlsns->request_lsn);
-
-		/*
-		 * Try to find prefetched page in the list of received pages.
-		 */
-Retry:
-		hashkey.buftag.blockNum = blockno;
-		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
-
-		if (entry != NULL)
-		{
-			slot = entry->slot;
-			if (neon_prefetch_response_usable(reqlsns, slot))
-			{
-				ring_index = slot->my_ring_index;
-			}
-			else
-			{
-				/*
-				 * Cannot use this prefetch, discard it
-				 *
-				 * We can't drop cache for not-yet-received requested items. It is
-				 * unlikely this happens, but it can happen if prefetch distance
-				 * is large enough and a backend didn't consume all prefetch
-				 * requests.
-				 */
-				if (slot->status == PRFS_REQUESTED)
-				{
-					if (!prefetch_wait_for(slot->my_ring_index))
-						goto Retry;
-				}
-				/* drop caches */
-				prefetch_set_unused(slot->my_ring_index);
-				pgBufferUsage.prefetch.expired += 1;
-				MyNeonCounters->getpage_prefetch_discards_total++;
-				/* make it look like a prefetch cache miss */
-				entry = NULL;
-			}
-		}
-
-		do
-		{
-			if (entry == NULL)
-			{
-				ring_index = prefetch_register_bufferv(hashkey.buftag, reqlsns, 1, NULL, false);
-				Assert(ring_index != UINT64_MAX);
-				slot = GetPrfSlot(ring_index);
-			}
-			else
-			{
-				/*
-				 * Empty our reference to the prefetch buffer's hash entry. When
-				 * we wait for prefetches, the entry reference is invalidated by
-				 * potential updates to the hash, and when we reconnect to the
-				 * pageserver the prefetch we're waiting for may be dropped, in
-				 * which case we need to retry and take the branch above.
-				 */
-				entry = NULL;
-			}
-
-			Assert(slot->my_ring_index == ring_index);
-			Assert(MyPState->ring_last <= ring_index &&
-				   MyPState->ring_unused > ring_index);
-			Assert(slot->status != PRFS_UNUSED);
-			Assert(GetPrfSlot(ring_index) == slot);
-
-		} while (!prefetch_wait_for(ring_index));
-
-		Assert(slot->status == PRFS_RECEIVED);
-		Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0);
-		Assert(hashkey.buftag.blockNum == base_blockno + i);
-
-		resp = slot->response;
-
-		switch (resp->tag)
-		{
-			case T_NeonGetPageResponse:
-			{
-				NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp;
-				if (neon_protocol_version >= 3)
-				{
-					if (resp->reqid != slot->reqid ||
-						resp->lsn != slot->request_lsns.request_lsn ||
-						resp->not_modified_since != slot->request_lsns.not_modified_since ||
-						!RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) ||
-						getpage_resp->req.forknum != forkNum ||
-						getpage_resp->req.blkno != base_blockno + i)
-					{
-						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
-													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
-													slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i);
-					}
-				}
-				memcpy(buffer, getpage_resp->page, BLCKSZ);
-
-				/*
-				 * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received
-				 * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here
-				 * under buffer lock.
-				 */
-				if (!lfc_store_prefetch_result)
-					lfc_write(rinfo, forkNum, blockno, buffer);
-				break;
-			}
-			case T_NeonErrorResponse:
-				if (neon_protocol_version >= 3)
-				{
-					if (resp->reqid != slot->reqid ||
-						resp->lsn != slot->request_lsns.request_lsn ||
-						resp->not_modified_since != slot->request_lsns.not_modified_since)
-					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
-							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
-							 slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
-					}
-				}
-				ereport(ERROR,
-						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
-								slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo),
-								forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
-						 errdetail("page server returned error: %s",
-								   ((NeonErrorResponse *) resp)->message)));
-				break;
-			default:
-				NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
-											"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
-											T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
-		}
-
-		/* buffer was used, clean up for later reuse */
-		prefetch_set_unused(ring_index);
-		prefetch_cleanup_trailing_unused();
-
-		end_ts = GetCurrentTimestamp();
-		inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0);
-	}
-}
-
 /*
  * While function is defined in the neon extension it's used within neon_test_utils directly.
  * To avoid breaking tests in the runtime please keep function signature in sync.
@@ -3354,7 +1272,7 @@ void
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 neon_request_lsns request_lsns, void *buffer)
 {
-	neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
+	communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
 }
 
 #if PG_MAJORVERSION_NUM < 17
@@ -3370,6 +1288,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 #endif
 {
 	neon_request_lsns request_lsns;
+	bits8		present;
+	void	   *bufferp;
 
 	switch (reln->smgr_relpersistence)
 	{
@@ -3389,11 +1309,13 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	}
 
 	/* Try to read PS results if they are available */
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
 
-	if (prefetch_lookup(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, buffer))
+	present = 0;
+	bufferp = buffer;
+	if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
 	{
 		/* Prefetch hit */
 		return;
@@ -3411,7 +1333,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	/*
 	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
 	 */
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -3521,16 +1443,16 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				 nblocks, PG_IOV_MAX);
 
 	/* Try to read PS results if they are available */
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
 						  request_lsns, nblocks);
 
 	memset(read_pages, 0, sizeof(read_pages));
 
-	prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
-									   blocknum, request_lsns, nblocks,
-									   buffers, read_pages);
+	prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
+													blocknum, request_lsns, nblocks,
+													buffers, read_pages);
 
 	if (prefetch_result == nblocks)
 		return;
@@ -3546,13 +1468,13 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	if (prefetch_result + lfc_result == nblocks)
 		return;
 
-	neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
-					  buffers, nblocks, read_pages);
+	communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
+							  buffers, nblocks, read_pages);
 
 	/*
 	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
 	 */
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -3737,7 +1659,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 
 	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
 
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -3799,7 +1721,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 
 	lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
 
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -3815,7 +1737,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 static BlockNumber
 neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 {
-	NeonResponse *resp;
 	BlockNumber n_blocks;
 	neon_request_lsns request_lsns;
 
@@ -3847,74 +1768,15 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
 						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
 
-	{
-		NeonNblocksRequest request = {
-			.hdr.tag = T_NeonNblocksRequest,
-			.hdr.reqid = GENERATE_REQUEST_ID(),
-			.hdr.lsn = request_lsns.request_lsn,
-			.hdr.not_modified_since = request_lsns.not_modified_since,
-			.rinfo = InfoFromSMgrRel(reln),
-			.forknum = forknum,
-		};
+	n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
+	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
 
-		resp = page_server_request(&request);
+	neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
+			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
+			 forknum,
+			 LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
+			 n_blocks);
 
-		switch (resp->tag)
-		{
-			case T_NeonNblocksResponse:
-			{
-				NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp;
-				if (neon_protocol_version >= 3)
-				{
-					if (!equal_requests(resp, &request.hdr) ||
-						!RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) ||
-						relsize_resp->req.forknum != forknum)
-					{
-						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
-													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
-													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
-					}
-				}
-				n_blocks = relsize_resp->n_blocks;
-				break;
-			}
-			case T_NeonErrorResponse:
-				if (neon_protocol_version >= 3)
-				{
-					if (!equal_requests(resp, &request.hdr))
-					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
-							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
-							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
-					}
-				}
-				ereport(ERROR,
-						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
-								resp->reqid,
-								RelFileInfoFmt(InfoFromSMgrRel(reln)),
-								forknum,
-								LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
-						 errdetail("page server returned error: %s",
-								   ((NeonErrorResponse *) resp)->message)));
-				break;
-
-			default:
-				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-											"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
-											T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
-		}
-		update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
-
-		neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
-				 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-				 forknum,
-				 LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
-				 n_blocks);
-
-		pfree(resp);
-	}
 	return n_blocks;
 }
 
@@ -3924,7 +1786,6 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 int64
 neon_dbsize(Oid dbNode)
 {
-	NeonResponse *resp;
 	int64		db_size;
 	neon_request_lsns request_lsns;
 	NRelFileInfo dummy_node = {0};
@@ -3932,66 +1793,11 @@ neon_dbsize(Oid dbNode)
 	neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
 						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
 
-	{
-		NeonDbSizeRequest request = {
-			.hdr.tag = T_NeonDbSizeRequest,
-			.hdr.reqid = GENERATE_REQUEST_ID(),
-			.hdr.lsn = request_lsns.request_lsn,
-			.hdr.not_modified_since = request_lsns.not_modified_since,
-			.dbNode = dbNode,
-		};
+	db_size = communicator_dbsize(dbNode, &request_lsns);
 
-		resp = page_server_request(&request);
+	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
+			 dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
 
-		switch (resp->tag)
-		{
-			case T_NeonDbSizeResponse:
-			{
-				NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp;
-				if (neon_protocol_version >= 3)
-				{
-					if (!equal_requests(resp, &request.hdr) ||
-						dbsize_resp->req.dbNode != dbNode)
-					{
-						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
-													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode,
-													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode);
-					}
-				}
-				db_size = dbsize_resp->db_size;
-				break;
-			}
-			case T_NeonErrorResponse:
-				if (neon_protocol_version >= 3)
-				{
-					if (!equal_requests(resp, &request.hdr))
-					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
-							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
-							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
-					}
-				}
-				ereport(ERROR,
-						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X",
-								resp->reqid,
-								dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
-						 errdetail("page server returned error: %s",
-								   ((NeonErrorResponse *) resp)->message)));
-				break;
-
-			default:
-				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-											"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
-											T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
-		}
-
-		neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
-				 dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
-
-		pfree(resp);
-	}
 	return db_size;
 }
 
@@ -4090,7 +1896,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
 
 	neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
 
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -4291,9 +2097,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 				not_modified_since;
 	SlruKind	kind;
 	int			n_blocks;
-	shardno_t	shard_no = 0; /* All SLRUs are at shard 0 */
-	NeonResponse *resp;
-	NeonGetSlruSegmentRequest request;
+	neon_request_lsns request_lsns;
 
 	/*
 	 * Compute a request LSN to use, similar to neon_get_request_lsns() but the
@@ -4332,74 +2136,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	else
 		return -1;
 
-	request = (NeonGetSlruSegmentRequest) {
-		.hdr.tag = T_NeonGetSlruSegmentRequest,
-		.hdr.reqid = GENERATE_REQUEST_ID(),
-		.hdr.lsn = request_lsn,
-		.hdr.not_modified_since = not_modified_since,
-		.kind = kind,
-		.segno = segno
-	};
+	request_lsns.request_lsn = request_lsn;
+	request_lsns.not_modified_since = not_modified_since;
+	request_lsns.effective_request_lsn = request_lsn;
 
-	do
-	{
-		while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no));
+	n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
 
-		consume_prefetch_responses();
-
-		resp = page_server->receive(shard_no);
-	} while (resp == NULL);
-
-	switch (resp->tag)
-	{
-		case T_NeonGetSlruSegmentResponse:
-		{
-			NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp;
-			if (neon_protocol_version >= 3)
-			{
-				if (!equal_requests(resp, &request.hdr) ||
-					slru_resp->req.kind != kind ||
-					slru_resp->req.segno != segno)
-				{
-					NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-												"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u}",
-												resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno,
-												request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, segno);
-				}
-			}
-			n_blocks = slru_resp->n_blocks;
-			memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ);
-			break;
-		}
-		case T_NeonErrorResponse:
-			if (neon_protocol_version >= 3)
-			{
-				if (!equal_requests(resp, &request.hdr))
-				{
-					elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
-						 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
-						 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
-				}
-			}
-			ereport(ERROR,
-					(errcode(ERRCODE_IO_ERROR),
-					 errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %d at lsn %X/%08X",
-							resp->reqid,
-							kind,
-							segno,
-							LSN_FORMAT_ARGS(request_lsn)),
-					 errdetail("page server returned error: %s",
-							   ((NeonErrorResponse *) resp)->message)));
-			break;
-
-		default:
-			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-										"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
-										T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
-	}
-	pfree(resp);
-
-	reconfigure_timeout_if_needed();
 	return n_blocks;
 }
 
@@ -4435,7 +2177,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 			}
 			break;
 	}
-	reconfigure_timeout_if_needed();
+	communicator_reconfigure_timeout_if_needed();
 }
 
 static const struct f_smgr neon_smgr =
@@ -4493,6 +2235,7 @@ smgr_init_neon(void)
 
 	smgr_init_standard();
 	neon_init();
+	communicator_init();
 }
 
 
@@ -4522,25 +2265,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		 * This length is later reused when we open the smgr to read the
 		 * block, which is fine and expected.
 		 */
-		NeonResponse *response;
-		NeonNblocksResponse *nbresponse;
-		NeonNblocksRequest request = {
-			.hdr = (NeonRequest) {
-				.tag = T_NeonNblocksRequest,
-				.reqid = GENERATE_REQUEST_ID(),
-				.lsn = end_recptr,
-				.not_modified_since = end_recptr,
-			},
-			.rinfo = rinfo,
-			.forknum = forknum,
-		};
+		neon_request_lsns request_lsns;
 
-		response = page_server_request(&request);
+		neon_get_request_lsns(rinfo, forknum,
+							  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
 
-		Assert(response->tag == T_NeonNblocksResponse);
-		nbresponse = (NeonNblocksResponse *) response;
+		relsize = communicator_nblocks(rinfo, forknum, &request_lsns);
 
-		relsize = Max(nbresponse->n_blocks, blkno + 1);
+		relsize = Max(relsize, blkno + 1);
 
 		set_cached_relsize(rinfo, forknum, relsize);
 		neon_set_lwlsn_relation(end_recptr, rinfo, forknum);
@@ -4692,94 +2424,3 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	}
 	return no_redo_needed;
 }
-
-static void
-reconfigure_timeout_if_needed(void)
-{
-	bool	needs_set = MyPState->ring_receive != MyPState->ring_unused &&
-						readahead_getpage_pull_timeout_ms > 0;
-
-	if (needs_set != timeout_set)
-	{
-		/* The background writer doens't (shouldn't) read any pages */
-		Assert(!AmBackgroundWriterProcess());
-		/* The checkpointer doens't (shouldn't) read any pages */
-		Assert(!AmCheckpointerProcess());
-
-		if (unlikely(PS_TIMEOUT_ID == 0))
-		{
-			PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler);
-		}
-
-		if (needs_set)
-		{
-#if PG_MAJORVERSION_NUM <= 14
-			enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms);
-#else
-			enable_timeout_every(
-				PS_TIMEOUT_ID,
-				TimestampTzPlusMilliseconds(GetCurrentTimestamp(),
-											readahead_getpage_pull_timeout_ms),
-				readahead_getpage_pull_timeout_ms
-			);
-#endif
-			timeout_set = true;
-		}
-		else
-		{
-			Assert(timeout_set);
-			disable_timeout(PS_TIMEOUT_ID, false);
-			timeout_set = false;
-		}
-	}
-}
-
-static void
-pagestore_timeout_handler(void)
-{
-#if PG_MAJORVERSION_NUM <= 14
-	/*
-	 * PG14: Setting a repeating timeout is not possible, so we signal here
-	 * that the timeout has already been reset, and by telling the system
-	 * that system will re-schedule it later if we need to.
-	 */
-	timeout_set = false;
-#endif
-	timeout_signaled = true;
-	InterruptPending = true;
-}
-
-static process_interrupts_callback_t prev_interrupt_cb;
-
-/*
- * Process new data received in our active PageStream sockets.
- *
- * This relies on the invariant that all pipelined yet-to-be-received requests
- * are getPage requests managed by MyPState. This is currently true, any
- * modification will probably require some stuff to make it work again.
- */
-static bool
-pagestore_smgr_processinterrupts(void)
-{
-	if (timeout_signaled)
-	{
-		if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0)
-			prefetch_pump_state(true);
-
-		timeout_signaled = false;
-		reconfigure_timeout_if_needed();
-	}
-
-	if (!prev_interrupt_cb)
-		return false;
-
-	return prev_interrupt_cb();
-}
-
-
-void
-pagestore_smgr_init(void)
-{
-	prev_interrupt_cb = ProcessInterruptsCallback;
-	ProcessInterruptsCallback = pagestore_smgr_processinterrupts;
-}

From e7502a3d637932a59ee502ababb1df3d0e3bca26 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Wed, 9 Apr 2025 17:16:15 +0400
Subject: [PATCH 085/140] pageserver: return 412 PreconditionFailed in
 get_timestamp_of_lsn if timestamp is not found (#11491)

## Problem
Now `get_timestamp_of_lsn` returns `404 NotFound` if there is no clog
pages for given LSN, and it's difficult to distinguish from other 404
errors. A separate status code for this error will allow the control
plane to handle this case.
- Closes: https://github.com/neondatabase/neon/issues/11439
- Corresponding PR in control plane:
https://github.com/neondatabase/cloud/pull/27125

## Summary of changes
- Return `412 PreconditionFailed` instead of `404 NotFound` if no
timestamp is fond for given LSN.

I looked briefly through the current error handling code in cloud.git
and the status code change should not affect anything for the existing
code. Change from the corresponding PR also looks fine and should work
with the current PS status code. Additionally, here is OK to merge it
from control plane team:
https://github.com/neondatabase/neon/issues/11439#issuecomment-2789327552

---------

Co-authored-by: John Spray <john@neon.tech>
---
 pageserver/src/http/openapi_spec.yml    |  6 +++++
 pageserver/src/http/routes.rs           |  8 +++----
 pageserver/src/pgdatadir_mapping.rs     |  6 ++---
 test_runner/regress/test_lsn_mapping.py | 31 +++++++++++++++++++++++++
 4 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 566086c527..7ea148971f 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -212,6 +212,12 @@ paths:
               schema:
                 type: string
                 format: date-time
+        "412":
+          description: No timestamp is found for given LSN, e.g. if there had been no commits till LSN
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PreconditionFailedError"
 
   /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
     parameters:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index bce590016e..e979a70aec 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -989,7 +989,7 @@ async fn get_lsn_by_timestamp_handler(
     if !tenant_shard_id.is_shard_zero() {
         // Requires SLRU contents, which are only stored on shard zero
         return Err(ApiError::BadRequest(anyhow!(
-            "Size calculations are only available on shard zero"
+            "Lsn calculations by timestamp are only available on shard zero"
         )));
     }
 
@@ -1064,7 +1064,7 @@ async fn get_timestamp_of_lsn_handler(
     if !tenant_shard_id.is_shard_zero() {
         // Requires SLRU contents, which are only stored on shard zero
         return Err(ApiError::BadRequest(anyhow!(
-            "Size calculations are only available on shard zero"
+            "Timestamp calculations by lsn are only available on shard zero"
         )));
     }
 
@@ -1090,8 +1090,8 @@ async fn get_timestamp_of_lsn_handler(
             .to_string();
             json_response(StatusCode::OK, time)
         }
-        None => Err(ApiError::NotFound(
-            anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
+        None => Err(ApiError::PreconditionFailed(
+            format!("Timestamp for lsn {} not found", lsn).into(),
         )),
     }
 }
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index e3e06ab91a..4c5a07ba57 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -691,7 +691,7 @@ impl Timeline {
         Ok(buf.get_u32_le())
     }
 
-    /// Get size of an SLRU segment
+    /// Does the slru segment exist?
     pub(crate) async fn get_slru_segment_exists(
         &self,
         kind: SlruKind,
@@ -844,9 +844,9 @@ impl Timeline {
         .await
     }
 
-    /// Obtain the possible timestamp range for the given lsn.
+    /// Obtain the timestamp for the given lsn.
     ///
-    /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
+    /// If the lsn has no timestamps (e.g. no commits), returns None.
     pub(crate) async fn get_timestamp_for_lsn(
         &self,
         probe_lsn: Lsn,
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
index 7280a91a12..c5a1bf0d16 100644
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -276,3 +276,34 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
             if i > 1:
                 before_timestamp = tbl[i - step_size][1]
                 assert timestamp >= before_timestamp, "before_timestamp before timestamp"
+
+
+def test_timestamp_of_lsn_empty_branch(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that getting the timestamp of the head LSN of a newly created branch works.
+    This verifies that we don't get a 404 error when trying to get the timestamp
+    of the head LSN of a branch that was just created.
+    We now return a special status code 412 to indicate if there is no timestamp found for lsn.
+
+    Reproducer for https://github.com/neondatabase/neon/issues/11439
+    """
+    env = neon_env_builder.init_start()
+
+    # Create a new branch
+    new_timeline_id = env.create_branch("test_timestamp_of_lsn_empty_branch")
+
+    # Retrieve the commit LSN of the empty branch, which we have never run postgres on
+    detail = env.pageserver.http_client().timeline_detail(
+        tenant_id=env.initial_tenant, timeline_id=new_timeline_id
+    )
+    head_lsn = detail["last_record_lsn"]
+
+    # Verify that we get 412 status code
+    with env.pageserver.http_client() as client:
+        with pytest.raises(PageserverApiException) as err:
+            client.timeline_get_timestamp_of_lsn(
+                env.initial_tenant,
+                new_timeline_id,
+                head_lsn,
+            )
+        assert err.value.status_code == 412

From d11f23a3419a5b8eef62bc5736a4dd9d413bdab8 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 9 Apr 2025 14:17:02 +0100
Subject: [PATCH 086/140] pageserver: refactor read path for multi LSN batching
 support (#11463)

## Problem

We wish to improve pageserver batching such that one batch can contain
requests for
pages at different LSNs. The current shape of the code doesn't lend
itself to the change.

## Summary of changes

Refactor the read path such that the fringe gets initialized upfront.
This is where the multi LSN
change will plug in. A couple other small changes fell out of this.

There should be NO behaviour change here. If you smell one, shout!

I recommend reviewing commits individually (intentionally made them as
small as possible).

Related: https://github.com/neondatabase/neon/issues/10765
---
 pageserver/src/tenant/timeline.rs             | 87 ++++++++-----------
 .../src/tenant/timeline/layer_manager.rs      | 37 +++++++-
 2 files changed, 72 insertions(+), 52 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 5174da0f43..ca34cbaf48 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -115,7 +115,7 @@ use crate::pgdatadir_mapping::{
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::AttachmentMode;
 use crate::tenant::gc_result::GcResult;
-use crate::tenant::layer_map::{LayerMap, SearchResult};
+use crate::tenant::layer_map::LayerMap;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
@@ -4104,12 +4104,6 @@ impl Timeline {
         cancel: &CancellationToken,
         ctx: &RequestContext,
     ) -> Result<TimelineVisitOutcome, GetVectoredError> {
-        let mut unmapped_keyspace = keyspace.clone();
-        let mut fringe = LayerFringe::new();
-
-        let mut completed_keyspace = KeySpace::default();
-        let mut image_covered_keyspace = KeySpaceRandomAccum::new();
-
         // Prevent GC from progressing while visiting the current timeline.
         // If we are GC-ing because a new image layer was added while traversing
         // the timeline, then it will remove layers that are required for fulfilling
@@ -4120,11 +4114,44 @@ impl Timeline {
         // See `compaction::compact_with_gc` for why we need this.
         let _guard = timeline.gc_compaction_layer_update_lock.read().await;
 
-        loop {
+        // Initialize the fringe
+        let mut fringe = {
+            let mut fringe = LayerFringe::new();
+
+            let guard = timeline.layers.read().await;
+            guard.update_search_fringe(&keyspace, cont_lsn, &mut fringe)?;
+
+            fringe
+        };
+
+        let mut completed_keyspace = KeySpace::default();
+        let mut image_covered_keyspace = KeySpaceRandomAccum::new();
+
+        while let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
             if cancel.is_cancelled() {
                 return Err(GetVectoredError::Cancelled);
             }
 
+            if let Some(ref mut read_path) = reconstruct_state.read_path {
+                read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
+            }
+
+            // Visit the layer and plan IOs for it
+            let next_cont_lsn = lsn_range.start;
+            layer_to_read
+                .get_values_reconstruct_data(
+                    keyspace_to_read.clone(),
+                    lsn_range,
+                    reconstruct_state,
+                    ctx,
+                )
+                .await?;
+
+            let mut unmapped_keyspace = keyspace_to_read;
+            cont_lsn = next_cont_lsn;
+
+            reconstruct_state.on_layer_visited(&layer_to_read);
+
             let (keys_done_last_step, keys_with_image_coverage) =
                 reconstruct_state.consume_done_keys();
             unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
@@ -4135,31 +4162,15 @@ impl Timeline {
                 image_covered_keyspace.add_range(keys_with_image_coverage);
             }
 
+            // Query the layer map for the next layers to read.
+            //
             // Do not descent any further if the last layer we visited
             // completed all keys in the keyspace it inspected. This is not
             // required for correctness, but avoids visiting extra layers
             // which turns out to be a perf bottleneck in some cases.
             if !unmapped_keyspace.is_empty() {
                 let guard = timeline.layers.read().await;
-                let layers = guard.layer_map()?;
-
-                for range in unmapped_keyspace.ranges.iter() {
-                    let results = layers.range_search(range.clone(), cont_lsn);
-
-                    results
-                        .found
-                        .into_iter()
-                        .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
-                            (
-                                guard.upgrade(layer),
-                                keyspace_accum.to_keyspace(),
-                                lsn_floor..cont_lsn,
-                            )
-                        })
-                        .for_each(|(layer, keyspace, lsn_range)| {
-                            fringe.update(layer, keyspace, lsn_range)
-                        });
-                }
+                guard.update_search_fringe(&unmapped_keyspace, cont_lsn, &mut fringe)?;
 
                 // It's safe to drop the layer map lock after planning the next round of reads.
                 // The fringe keeps readable handles for the layers which are safe to read even
@@ -4173,28 +4184,6 @@ impl Timeline {
                 // at two different time points.
                 drop(guard);
             }
-
-            if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
-                if let Some(ref mut read_path) = reconstruct_state.read_path {
-                    read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
-                }
-                let next_cont_lsn = lsn_range.start;
-                layer_to_read
-                    .get_values_reconstruct_data(
-                        keyspace_to_read.clone(),
-                        lsn_range,
-                        reconstruct_state,
-                        ctx,
-                    )
-                    .await?;
-
-                unmapped_keyspace = keyspace_to_read;
-                cont_lsn = next_cont_lsn;
-
-                reconstruct_state.on_layer_visited(&layer_to_read);
-            } else {
-                break;
-            }
         }
 
         Ok(TimelineVisitOutcome {
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index ed92ea28ce..ae898260d2 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -3,17 +3,18 @@ use std::sync::Arc;
 
 use anyhow::{Context, bail, ensure};
 use itertools::Itertools;
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::TenantShardId;
 use tokio_util::sync::CancellationToken;
 use tracing::trace;
 use utils::id::TimelineId;
 use utils::lsn::{AtomicLsn, Lsn};
 
-use super::{ReadableLayer, TimelineWriterState};
+use super::{LayerFringe, ReadableLayer, TimelineWriterState};
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::metrics::TimelineMetrics;
-use crate::tenant::layer_map::{BatchedUpdates, LayerMap};
+use crate::tenant::layer_map::{BatchedUpdates, LayerMap, SearchResult};
 use crate::tenant::storage_layer::{
     AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc,
     PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
@@ -38,7 +39,7 @@ impl Default for LayerManager {
 }
 
 impl LayerManager {
-    pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
+    fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
         match weak {
             ReadableLayerWeak::PersistentLayer(desc) => {
                 ReadableLayer::PersistentLayer(self.get_from_desc(&desc))
@@ -147,6 +148,36 @@ impl LayerManager {
         self.layers().keys().cloned().collect_vec()
     }
 
+    /// Update the [`LayerFringe`] of a read request
+    ///
+    /// Take a key space at a given LSN and query the layer map below each range
+    /// of the key space to find the next layers to visit.
+    pub(crate) fn update_search_fringe(
+        &self,
+        keyspace: &KeySpace,
+        cont_lsn: Lsn,
+        fringe: &mut LayerFringe,
+    ) -> Result<(), Shutdown> {
+        let map = self.layer_map()?;
+
+        for range in keyspace.ranges.iter() {
+            let results = map.range_search(range.clone(), cont_lsn);
+            results
+                .found
+                .into_iter()
+                .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
+                    (
+                        self.upgrade(layer),
+                        keyspace_accum.to_keyspace(),
+                        lsn_floor..cont_lsn,
+                    )
+                })
+                .for_each(|(layer, keyspace, lsn_range)| fringe.update(layer, keyspace, lsn_range));
+        }
+
+        Ok(())
+    }
+
     fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
         use LayerManager::*;
         match self {

From 72832b32140a78db7612af626d7c69079d73f445 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 9 Apr 2025 16:04:42 +0100
Subject: [PATCH 087/140] chore: fix clippy lints from nightly-2025-03-16
 (#11273)

I like to run nightly clippy every so often to make our future rust
upgrades easier. Some notable changes:

* Prefer `next_back()` over `last()`. Generic iterators will implement
`last()` to run forward through the iterator until the end.

* Prefer `io::Error::other()`.

* Use implicit returns

One case where I haven't dealt with the issues is the now
[more-sensitive "large enum variant"
lint](https://github.com/rust-lang/rust-clippy/pull/13833). I chose not
to take any decisions around it here, and simply marked them as allow
for now.
---
 compute_tools/src/catalog.rs                    | 17 +++++++++--------
 control_plane/storcon_cli/src/main.rs           |  2 +-
 libs/postgres_backend/src/lib.rs                |  7 +++----
 libs/postgres_backend/tests/simple_select.rs    |  4 ++--
 libs/pq_proto/src/framed.rs                     |  2 +-
 libs/pq_proto/src/lib.rs                        |  2 +-
 .../src/authentication/sasl.rs                  |  9 +++------
 libs/remote_storage/src/azure_blob.rs           |  5 ++---
 libs/utils/src/crashsafe.rs                     |  9 +++------
 pageserver/src/http/routes.rs                   | 12 ++++++------
 pageserver/src/tenant.rs                        |  7 +++----
 pageserver/src/tenant/blob_io.rs                |  7 ++-----
 pageserver/src/tenant/block_io.rs               |  8 ++------
 .../tenant/storage_layer/batch_split_writer.rs  |  2 +-
 .../inmemory_layer/vectored_dio_read.rs         |  2 +-
 .../src/tenant/storage_layer/merge_iterator.rs  |  1 +
 pageserver/src/tenant/timeline.rs               |  1 +
 pageserver/src/tenant/upload_queue.rs           |  1 +
 proxy/src/cancellation.rs                       |  7 +------
 proxy/src/proxy/tests/mod.rs                    |  5 ++---
 proxy/src/serverless/conn_pool_lib.rs           |  1 +
 safekeeper/src/timeline.rs                      |  1 +
 safekeeper/src/timeline_eviction.rs             |  5 ++---
 storage_broker/src/bin/storage_broker.rs        |  6 ++++++
 storage_broker/src/lib.rs                       |  1 +
 25 files changed, 57 insertions(+), 67 deletions(-)

diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs
index db3e07e086..082ba62b8e 100644
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -98,13 +98,15 @@ pub async fn get_database_schema(
         .kill_on_drop(true)
         .spawn()?;
 
-    let stdout = cmd.stdout.take().ok_or_else(|| {
-        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
-    })?;
+    let stdout = cmd
+        .stdout
+        .take()
+        .ok_or_else(|| std::io::Error::other("Failed to capture stdout."))?;
 
-    let stderr = cmd.stderr.take().ok_or_else(|| {
-        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
-    })?;
+    let stderr = cmd
+        .stderr
+        .take()
+        .ok_or_else(|| std::io::Error::other("Failed to capture stderr."))?;
 
     let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
     let stderr_reader = BufReader::new(stderr);
@@ -128,8 +130,7 @@ pub async fn get_database_schema(
                 }
             });
 
-            return Err(SchemaDumpError::IO(std::io::Error::new(
-                std::io::ErrorKind::Other,
+            return Err(SchemaDumpError::IO(std::io::Error::other(
                 "failed to start pg_dump",
             )));
         }
diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index b7e479d90c..19c686dcfd 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -941,7 +941,7 @@ async fn main() -> anyhow::Result<()> {
             let mut node_to_fill_descs = Vec::new();
 
             for desc in node_descs {
-                let to_drain = nodes.iter().any(|id| *id == desc.id);
+                let to_drain = nodes.contains(&desc.id);
                 if to_drain {
                     node_to_drain_descs.push(desc);
                 } else {
diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index a0a891f0dc..654dde8da6 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -5,7 +5,6 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]
 use std::future::Future;
-use std::io::ErrorKind;
 use std::net::SocketAddr;
 use std::os::fd::{AsRawFd, RawFd};
 use std::pin::Pin;
@@ -227,7 +226,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
         match self {
             MaybeWriteOnly::Full(framed) => framed.read_startup_message().await,
             MaybeWriteOnly::WriteOnly(_) => {
-                Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
+                Err(io::Error::other("reading from write only half").into())
             }
             MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
         }
@@ -237,7 +236,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
         match self {
             MaybeWriteOnly::Full(framed) => framed.read_message().await,
             MaybeWriteOnly::WriteOnly(_) => {
-                Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
+                Err(io::Error::other("reading from write only half").into())
             }
             MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
         }
@@ -975,7 +974,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'_, IO> {
             .write_message_noflush(&BeMessage::CopyData(buf))
             // write_message only writes to the buffer, so it can fail iff the
             // message is invaid, but CopyData can't be invalid.
-            .map_err(|_| io::Error::new(ErrorKind::Other, "failed to serialize CopyData"))?;
+            .map_err(|_| io::Error::other("failed to serialize CopyData"))?;
 
         Poll::Ready(Ok(buf.len()))
     }
diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs
index 907ef9eed3..75ca123014 100644
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -85,8 +85,8 @@ static KEY: Lazy<rustls::pki_types::PrivateKeyDer<'static>> = Lazy::new(|| {
 
 static CERT: Lazy<rustls::pki_types::CertificateDer<'static>> = Lazy::new(|| {
     let mut cursor = Cursor::new(include_bytes!("cert.pem"));
-    let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap();
-    cert
+
+    rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap()
 });
 
 // test that basic select with ssl works
diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs
index 8e216d0f44..4e5e48ecf5 100644
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -35,7 +35,7 @@ impl ConnectionError {
     pub fn into_io_error(self) -> io::Error {
         match self {
             ConnectionError::Io(io) => io,
-            ConnectionError::Protocol(pe) => io::Error::new(io::ErrorKind::Other, pe.to_string()),
+            ConnectionError::Protocol(pe) => io::Error::other(pe.to_string()),
         }
     }
 }
diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index e435ffbf7e..e7afc64564 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -257,7 +257,7 @@ pub enum ProtocolError {
 impl ProtocolError {
     /// Proxy stream.rs uses only io::Error; provide it.
     pub fn into_io_error(self) -> io::Error {
-        io::Error::new(io::ErrorKind::Other, self.to_string())
+        io::Error::other(self.to_string())
     }
 }
 
diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
index 27e05e24ec..2daf9a80d4 100644
--- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
+++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
@@ -212,7 +212,7 @@ impl ScramSha256 {
                     password,
                     channel_binding,
                 } => (nonce, password, channel_binding),
-                _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
+                _ => return Err(io::Error::other("invalid SCRAM state")),
             };
 
         let message =
@@ -291,7 +291,7 @@ impl ScramSha256 {
                 server_key,
                 auth_message,
             } => (server_key, auth_message),
-            _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
+            _ => return Err(io::Error::other("invalid SCRAM state")),
         };
 
         let message =
@@ -301,10 +301,7 @@ impl ScramSha256 {
 
         let verifier = match parsed {
             ServerFinalMessage::Error(e) => {
-                return Err(io::Error::new(
-                    io::ErrorKind::Other,
-                    format!("SCRAM error: {}", e),
-                ));
+                return Err(io::Error::other(format!("SCRAM error: {}", e)));
             }
             ServerFinalMessage::Verifier(verifier) => verifier,
         };
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index dee61a410d..18146c5464 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -801,8 +801,7 @@ where
             // that support needs to be hacked in.
             //
             // including {self:?} into the message would be useful, but unsure how to unproject.
-            _ => std::task::Poll::Ready(Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
+            _ => std::task::Poll::Ready(Err(std::io::Error::other(
                 "cloned or initial values cannot be read",
             ))),
         }
@@ -855,7 +854,7 @@ where
         };
         Err(azure_core::error::Error::new(
             azure_core::error::ErrorKind::Io,
-            std::io::Error::new(std::io::ErrorKind::Other, msg),
+            std::io::Error::other(msg),
         ))
     }
 
diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 290a5b2686..215fa36df4 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -81,12 +81,9 @@ pub fn path_with_suffix_extension(
 }
 
 pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
-    let parent = file_path.parent().ok_or_else(|| {
-        io::Error::new(
-            io::ErrorKind::Other,
-            format!("File {file_path:?} has no parent"),
-        )
-    })?;
+    let parent = file_path
+        .parent()
+        .ok_or_else(|| io::Error::other(format!("File {file_path:?} has no parent")))?;
 
     fsync(file_path)?;
     fsync(parent)?;
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index e979a70aec..200b91fc82 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3381,11 +3381,11 @@ async fn put_tenant_timeline_import_basebackup(
 
         let broker_client = state.broker_client.clone();
 
-        let mut body = StreamReader::new(request.into_body().map(|res| {
-            res.map_err(|error| {
-                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
-            })
-        }));
+        let mut body = StreamReader::new(
+            request
+                .into_body()
+                .map(|res| res.map_err(|error| std::io::Error::other(anyhow::anyhow!(error)))),
+        );
 
         tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
 
@@ -3459,7 +3459,7 @@ async fn put_tenant_timeline_import_wal(
 
         let mut body = StreamReader::new(request.into_body().map(|res| {
             res.map_err(|error| {
-                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
+                std::io::Error::other( anyhow::anyhow!(error))
             })
         }));
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 900e98d7e9..2ac2fd0b81 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -920,6 +920,7 @@ enum StartCreatingTimelineResult {
     Idempotent(Arc<Timeline>),
 }
 
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 enum TimelineInitAndSyncResult {
     ReadyToActivate(Arc<Timeline>),
     NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata),
@@ -1006,6 +1007,7 @@ enum CreateTimelineCause {
     Delete,
 }
 
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 enum LoadTimelineCause {
     Attach,
     Unoffload,
@@ -4399,10 +4401,7 @@ impl Tenant {
         .to_string();
 
         fail::fail_point!("tenant-config-before-write", |_| {
-            Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                "tenant-config-before-write",
-            ))
+            Err(std::io::Error::other("tenant-config-before-write"))
         });
 
         // Convert the config to a toml file.
diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs
index ff9a7e57b6..b0b2a16c2f 100644
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -15,7 +15,7 @@
 //! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
 use std::cmp::min;
-use std::io::{Error, ErrorKind};
+use std::io::Error;
 
 use async_compression::Level;
 use bytes::{BufMut, BytesMut};
@@ -331,10 +331,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                     return (
                         (
                             io_buf.slice_len(),
-                            Err(Error::new(
-                                ErrorKind::Other,
-                                format!("blob too large ({len} bytes)"),
-                            )),
+                            Err(Error::other(format!("blob too large ({len} bytes)"))),
                         ),
                         srcbuf,
                     );
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs
index 66c586daff..6723155626 100644
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -216,12 +216,8 @@ impl<'a> FileBlockReader<'a> {
         match cache
             .read_immutable_buf(self.file_id, blknum, ctx)
             .await
-            .map_err(|e| {
-                std::io::Error::new(
-                    std::io::ErrorKind::Other,
-                    format!("Failed to read immutable buf: {e:#}"),
-                )
-            })? {
+            .map_err(|e| std::io::Error::other(format!("Failed to read immutable buf: {e:#}")))?
+        {
             ReadBufResult::Found(guard) => Ok(guard.into()),
             ReadBufResult::NotFound(write_guard) => {
                 // Read the page from disk into the buffer
diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
index fd50e4805d..29ada15c36 100644
--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -366,7 +366,7 @@ impl SplitDeltaLayerWriter {
                 )
                 .await?;
                 let (start_key, prev_delta_writer) =
-                    std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
+                    self.inner.replace((key, next_delta_writer)).unwrap();
                 self.batches.add_unfinished_delta_writer(
                     prev_delta_writer,
                     start_key..key,
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
index 90455fd0ca..ea354fc716 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
@@ -766,7 +766,7 @@ mod tests {
                     rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs
                     Ok((dst, len))
                 }
-                Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
+                Err(e) => Err(std::io::Error::other(e)),
             }
         }
     }
diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs
index 76cdddd06a..55db9fe06a 100644
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -59,6 +59,7 @@ impl LayerIterRef<'_> {
 /// 1. Unified iterator for image and delta layers.
 /// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
 /// 3. Lazy creation of the real delta/image iterator.
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 pub(crate) enum IteratorWrapper<'a> {
     NotLoaded {
         ctx: &'a RequestContext,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index ca34cbaf48..b10409689c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1039,6 +1039,7 @@ pub(crate) enum ShutdownMode {
     Hard,
 }
 
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 enum ImageLayerCreationOutcome {
     /// We generated an image layer
     Generated {
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index d5dc9666ce..be1b55ffa3 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -302,6 +302,7 @@ pub struct UploadQueueStoppedDeletable {
     pub(super) deleted_at: SetDeletedFlagProgress,
 }
 
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 pub enum UploadQueueStopped {
     Deletable(UploadQueueStoppedDeletable),
     Uninitialized,
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 8263e5aa2a..d6a7406f67 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -425,12 +425,7 @@ impl CancelClosure {
             &mut mk_tls,
             &self.hostname,
         )
-        .map_err(|e| {
-            CancelError::IO(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                e.to_string(),
-            ))
-        })?;
+        .map_err(|e| CancelError::IO(std::io::Error::other(e.to_string())))?;
 
         self.cancel_token.cancel_query_raw(socket, tls).await?;
         debug!("query was cancelled");
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs
index 2c3e70138d..2268e60d25 100644
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -568,7 +568,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn
 fn helper_create_connect_info(
     mechanism: &TestConnectMechanism,
 ) -> auth::Backend<'static, ComputeCredentials> {
-    let user_info = auth::Backend::ControlPlane(
+    auth::Backend::ControlPlane(
         MaybeOwned::Owned(ControlPlaneClient::Test(Box::new(mechanism.clone()))),
         ComputeCredentials {
             info: ComputeUserInfo {
@@ -578,8 +578,7 @@ fn helper_create_connect_info(
             },
             keys: ComputeCredentialKeys::Password("password".into()),
         },
-    );
-    user_info
+    )
 }
 
 fn config() -> ComputeConfig {
diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs
index 77b548cc43..42a3ea17a2 100644
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -47,6 +47,7 @@ impl ConnInfo {
 }
 
 #[derive(Clone)]
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 pub(crate) enum ClientDataEnum {
     Remote(ClientDataRemote),
     Local(ClientDataLocal),
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index e6a7ade9f2..b7ba28f435 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -138,6 +138,7 @@ impl Drop for WriteGuardSharedState<'_> {
 /// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this
 /// case, SafeKeeper is not available (because WAL is not present on disk) and all
 /// operations can be done only with control file.
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 pub enum StateSK {
     Loaded(SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>),
     Offloaded(Box<TimelineState<control_file::FileStorage>>),
diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs
index 06ccb32d03..84c636daf6 100644
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -35,7 +35,7 @@ impl Manager {
         next_event: &Option<tokio::time::Instant>,
         state: &StateSnapshot,
     ) -> bool {
-        let ready = self.backup_task.is_none()
+        self.backup_task.is_none()
             && self.recovery_task.is_none()
             && self.wal_removal_task.is_none()
             && self.partial_backup_task.is_none()
@@ -61,8 +61,7 @@ impl Manager {
                 .unwrap()
                 .flush_lsn
                 .segment_number(self.wal_seg_size)
-                == self.last_removed_segno + 1;
-        ready
+                == self.last_removed_segno + 1
     }
 
     /// Evict the timeline to remote storage. Returns whether the eviction was successful.
diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs
index f1bd7ba708..a7e0c986e6 100644
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -96,6 +96,7 @@ enum Message {
 
 impl Message {
     /// Convert proto message to internal message.
+    #[allow(clippy::result_large_err, reason = "TODO")]
     pub fn from(proto_msg: TypedMessage) -> Result<Self, Status> {
         match proto_msg.r#type() {
             MessageType::SafekeeperTimelineInfo => Ok(Message::SafekeeperTimelineInfo(
@@ -127,6 +128,7 @@ impl Message {
     }
 
     /// Get the tenant_timeline_id from the message.
+    #[allow(clippy::result_large_err, reason = "TODO")]
     pub fn tenant_timeline_id(&self) -> Result<Option<TenantTimelineId>, Status> {
         match self {
             Message::SafekeeperTimelineInfo(msg) => Ok(msg
@@ -185,6 +187,7 @@ enum SubscriptionKey {
 
 impl SubscriptionKey {
     /// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors).
+    #[allow(clippy::result_large_err, reason = "TODO")]
     pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result<Self, Status> {
         match key {
             ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All),
@@ -195,6 +198,7 @@ impl SubscriptionKey {
     }
 
     /// Parse from FilterTenantTimelineId
+    #[allow(clippy::result_large_err, reason = "TODO")]
     pub fn from_proto_filter_tenant_timeline_id(
         opt: Option<&FilterTenantTimelineId>,
     ) -> Result<Self, Status> {
@@ -385,6 +389,7 @@ impl Registry {
     }
 
     /// Send msg to relevant subscribers.
+    #[allow(clippy::result_large_err, reason = "TODO")]
     pub fn send_msg(&self, msg: &Message) -> Result<(), Status> {
         PROCESSED_MESSAGES_TOTAL.inc();
 
@@ -436,6 +441,7 @@ struct Publisher {
 
 impl Publisher {
     /// Send msg to relevant subscribers.
+    #[allow(clippy::result_large_err, reason = "TODO")]
     pub fn send_msg(&mut self, msg: &Message) -> Result<(), Status> {
         self.registry.send_msg(msg)
     }
diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs
index 55d411f607..7b36f5e948 100644
--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -79,6 +79,7 @@ impl BrokerClientChannel {
 }
 
 // parse variable length bytes from protobuf
+#[allow(clippy::result_large_err, reason = "TODO")]
 pub fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result<TenantTimelineId, Status> {
     let tenant_id = TenantId::from_slice(&proto_ttid.tenant_id)
         .map_err(|e| Status::new(Code::InvalidArgument, format!("malformed tenant_id: {}", e)))?;

From 66f80e77ba418c6ecab1016b6b8f13beff9f3ce3 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 9 Apr 2025 17:32:19 +0100
Subject: [PATCH 088/140] tests/performance: reconcile until idle before
 benchmark (#11435)

We'd like to run benchmarks starting from a steady state. To this end,
do a reconciliation round before proceeding with the benchmark.

This is useful for benchmarks that use tenant dir snapshots since a
non-standard tenant configuration is used to generate the snapshot. The
storage controller is not aware of the non default tenant configuration
and will reconcile while the bench is running.
---
 test_runner/performance/pageserver/util.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py
index 7a6d88f79c..b50659defc 100644
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -40,6 +40,8 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
             for layer in info.historic_layers:
                 assert not layer.remote
 
+    env.storage_controller.reconcile_until_idle(timeout_secs=60)
+
     log.info("ready")
 
 
From afd34291ca5152c151e067c12af50b98a65d6832 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 9 Apr 2025 11:41:29 -0500
Subject: [PATCH 089/140] Make neon_local token generation generic over claims
 (#11507)

Instead of encoding a certain structure for claims, let's allow the
caller to specify what claims be encoded.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 control_plane/src/local_env.rs | 4 ++--
 libs/utils/src/auth.rs         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 2616afbb16..8e2a110366 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -15,7 +15,7 @@ use clap::ValueEnum;
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
-use utils::auth::{Claims, encode_from_key_file};
+use utils::auth::encode_from_key_file;
 use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
 
 use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage};
@@ -757,7 +757,7 @@ impl LocalEnv {
     }
 
     // this function is used only for testing purposes in CLI e g generate tokens during init
-    pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
+    pub fn generate_auth_token<S: Serialize>(&self, claims: &S) -> anyhow::Result<String> {
         let private_key_path = self.get_private_key_path();
         let key_data = fs::read(private_key_path)?;
         encode_from_key_file(claims, &key_data)
diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs
index cc5b0b1d13..db4fc5685c 100644
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -173,7 +173,7 @@ impl std::fmt::Debug for JwtAuth {
 }
 
 // this function is used only for testing purposes in CLI e g generate tokens during init
-pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result<String> {
+pub fn encode_from_key_file<S: Serialize>(claims: &S, key_data: &[u8]) -> Result<String> {
     let key = EncodingKey::from_ed_pem(key_data)?;
     Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
 }

From 1c237d0c6d5eca08134f18282aa352e4d67bc4c0 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 9 Apr 2025 11:58:44 -0500
Subject: [PATCH 090/140] Move compute_ctl claims struct into public API
 (#11505)

This is preparatory work for teaching neon_local to pass the
Authorization header to compute_ctl.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute_tools/src/http/middleware/authorize.rs | 15 +++++++--------
 libs/compute_api/src/requests.rs               |  8 ++++++++
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs
index 89d55e1af3..ee3a5cb953 100644
--- a/compute_tools/src/http/middleware/authorize.rs
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -6,20 +6,15 @@ use axum_extra::{
     TypedHeader,
     headers::{Authorization, authorization::Bearer},
 };
+use compute_api::requests::ComputeClaims;
 use futures::future::BoxFuture;
 use http::{Request, Response, StatusCode};
 use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
-use serde::Deserialize;
 use tower_http::auth::AsyncAuthorizeRequest;
 use tracing::warn;
 
 use crate::http::{JsonResponse, extract::RequestId};
 
-#[derive(Clone, Debug, Deserialize)]
-pub(in crate::http) struct Claims {
-    compute_id: String,
-}
-
 #[derive(Clone, Debug)]
 pub(in crate::http) struct Authorize {
     compute_id: String,
@@ -112,7 +107,11 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
 
 impl Authorize {
     /// Verify the token using the JSON Web Key set and return the token data.
-    fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
+    fn verify(
+        jwks: &JwkSet,
+        token: &str,
+        validation: &Validation,
+    ) -> Result<TokenData<ComputeClaims>> {
         for jwk in jwks.keys.iter() {
             let decoding_key = match DecodingKey::from_jwk(jwk) {
                 Ok(key) => key,
@@ -127,7 +126,7 @@ impl Authorize {
                 }
             };
 
-            match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
+            match jsonwebtoken::decode::<ComputeClaims>(token, &decoding_key, validation) {
                 Ok(data) => return Ok(data),
                 Err(e) => {
                     warn!(
diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs
index 3fbdfcf83f..98f2fc297c 100644
--- a/libs/compute_api/src/requests.rs
+++ b/libs/compute_api/src/requests.rs
@@ -5,6 +5,14 @@ use crate::privilege::Privilege;
 use crate::responses::ComputeCtlConfig;
 use crate::spec::{ComputeSpec, ExtVersion, PgIdent};
 
+/// When making requests to the `compute_ctl` external HTTP server, the client
+/// must specify a set of claims in `Authorization` header JWTs such that
+/// `compute_ctl` can authorize the request.
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub struct ComputeClaims {
+    pub compute_id: String,
+}
+
 /// Request of the /configure API
 ///
 /// We now pass only `spec` in the configuration request, but later we can

From af12647b9dc29811bbd31716d6ccd8b52809a20a Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Wed, 9 Apr 2025 19:11:00 +0200
Subject: [PATCH 091/140] large tenant oltp benchmark: reindex with downtime
 (remove concurrently) (#11498)

## Problem

our large oltp benchmark runs very long - we want to remove the duration
of the reindex step.
we don't run concurrent workload anyhow but added "concurrently" only to
have a "prod-like" approach. But if it just doubles the time we report
because it requires two instead of one full table scan we can remove it

## Summary of changes

remove keyword concurrently from the reindex step
---
 .../performance/test_perf_oltp_large_tenant.py      | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py
index 957a4ec796..b45394d627 100644
--- a/test_runner/performance/test_perf_oltp_large_tenant.py
+++ b/test_runner/performance/test_perf_oltp_large_tenant.py
@@ -145,11 +145,14 @@ def run_database_maintenance(env: PgCompare):
                 END $$;
                 """
             )
-
-            log.info("start REINDEX TABLE CONCURRENTLY transaction.transaction")
-            with env.zenbenchmark.record_duration("reindex concurrently"):
-                cur.execute("REINDEX TABLE CONCURRENTLY transaction.transaction;")
-            log.info("finished REINDEX TABLE CONCURRENTLY transaction.transaction")
+            # in production a customer would likely use reindex concurrently
+            # but for our test we don't care about the downtime
+            # and it would just about double the time we report in the test
+            # because we need one more table scan for each index
+            log.info("start REINDEX TABLE transaction.transaction")
+            with env.zenbenchmark.record_duration("reindex"):
+                cur.execute("REINDEX TABLE transaction.transaction;")
+            log.info("finished REINDEX TABLE transaction.transaction")
 
 
 @pytest.mark.parametrize("custom_scripts", get_custom_scripts())

From ec66b788e2c66b883e9e4450a2571e25902014f2 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 9 Apr 2025 14:01:31 -0400
Subject: [PATCH 092/140] fix(pageserver): use different walredo retry setting
 for gc-compaction (#11497)

## Problem

Not a complete fix for https://github.com/neondatabase/neon/issues/11492
but should work for a short term.

Our current retry strategy for walredo is to retry every request exactly
once. This retry doesn't make sense because it retries all requests
exactly once and each error is expected to cause process restart and
cause future requests to fail. I'll explain it with a scenario of two
threads requesting redos: one with an invalid history (that will cause
walredo to panic) and another that has a correct redo sequence.

First let's look at how we handle retries right now in
do_with_walredo_process. At the beginning of the function it will spawn
a new process if there's no existing one. Then it will continue to redo.
If the process fails, the first process that encounters the error will
remove the walredo process object from the OnceCell, so that the next
time it gets accessed, a new process will be spawned; if it is the last
one that uses the old walredo process, it will kill and wait the process
in `drop(proc)`. I'm skeptical whether this works under races but I
think this is not the root cause of the problem. In this retry handler,
if there are N requests attached to a walredo process and the i-th
request fails (panics the walredo), all other N-i requests will fail and
they need to retry so that they can access a new walredo process.

```
time       ---->
proc        A                 None   B
request 1   ^-----------------^ fail
            uses A for redo   replace with None
request 2      ^-------------------- fail
               uses A for redo
request 3             ^----------------^ fail
                      uses A for redo  last ref, wait for A to be killed
request 4                            ^---------------
                                     None, spawn new process B
```

The problem is with our retry strategy. Normally, for a system that we
want to retry on, the probability of errors for each of the requests are
uncorrelated. However, in walredo, a prior request that panics the
walredo process will cause all future walredo on that process to fail
(that's correlated).

So, back to the situation where we have 2 requests where one will
definitely fail and the other will succeed and we get the following
sequence, where retry attempts = 1,

* new walredo process A starts.
* request 1 (invalid) being processed on A and panics A, waiting for
retry, remove process A from the process object.
* request 2 (valid) being processed on A and receives pipe broken /
poisoned process error, waiting for retry, wait for A to be killed --
this very likely takes a while and cannot finish before request 1 gets
processed again
* new walredo process B starts.
* request 1 (invalid) being processed again on B and panics B, the whole
request fail.
* request 2 (valid) being processed again on B, and get a poisoned error
again.

```
time       ---->
proc        A                 None           B                    None
request 1   ^-----------------^--------------^--------------------^
            spawn A for redo  fail          spawn B for redo     fail
request 2      ^--------------------^-------------------------^------------^
               use A for redo       fail, wait to kill A      B for redo   fail again
```

In such cases, no matter how we set n_attempts, as long as the retry
count applies to all requests, this sequence is bound to fail both
requests because of how they get sequenced; while we could potentially
make request 2 successful.

There are many solutions to this -- like having a separate walredo
manager for compactions, or define which errors are retryable (i.e.,
broken pipe can be retried, while real walredo error won't be retried),
or having a exclusive big lock over the whole redo process (the current
one is very fine-grained). In this patch, we go with a simple approach:
use different retry attempts for different types of requests.

For gc-compaction, the attempt count is set to 0, so that it never
retries and consequently stops the compaction process -- no more redo
will be issued from gc-compaction. Once the walredo process gets
restarted, the normal read requests will proceed normally.

## Summary of changes

Add redo_attempt for each reconstruct value request to set different
retry policies.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Erik Grinaker <erik@neon.tech>
---
 pageserver/benches/bench_walredo.rs          | 11 ++++-
 pageserver/src/tenant.rs                     |  8 ++--
 pageserver/src/tenant/timeline.rs            | 50 ++++++++++----------
 pageserver/src/tenant/timeline/compaction.rs |  6 +--
 pageserver/src/walredo.rs                    | 27 ++++++++++-
 5 files changed, 66 insertions(+), 36 deletions(-)

diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index 77b3f90b3e..215682d90c 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -65,7 +65,7 @@ use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
 use once_cell::sync::Lazy;
 use pageserver::config::PageServerConf;
-use pageserver::walredo::PostgresRedoManager;
+use pageserver::walredo::{PostgresRedoManager, RedoAttemptType};
 use pageserver_api::key::Key;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::shard::TenantShardId;
@@ -223,7 +223,14 @@ impl Request {
 
         // TODO: avoid these clones
         manager
-            .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
+            .request_redo(
+                *key,
+                *lsn,
+                base_img.clone(),
+                records.clone(),
+                *pg_version,
+                RedoAttemptType::ReadPage,
+            )
             .await
             .context("request_redo")
     }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 2ac2fd0b81..d3623fc3b9 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -100,7 +100,7 @@ use crate::tenant::timeline::delete::DeleteTimelineFlow;
 use crate::tenant::timeline::uninit::cleanup_timeline_directory;
 use crate::virtual_file::VirtualFile;
 use crate::walingest::WalLagCooldown;
-use crate::walredo::PostgresRedoManager;
+use crate::walredo::{PostgresRedoManager, RedoAttemptType};
 use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo};
 
 static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
@@ -473,15 +473,16 @@ impl WalRedoManager {
         base_img: Option<(Lsn, bytes::Bytes)>,
         records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>,
         pg_version: u32,
+        redo_attempt_type: RedoAttemptType,
     ) -> Result<bytes::Bytes, walredo::Error> {
         match self {
             Self::Prod(_, mgr) => {
-                mgr.request_redo(key, lsn, base_img, records, pg_version)
+                mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type)
                     .await
             }
             #[cfg(test)]
             Self::Test(mgr) => {
-                mgr.request_redo(key, lsn, base_img, records, pg_version)
+                mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type)
                     .await
             }
         }
@@ -5879,6 +5880,7 @@ pub(crate) mod harness {
             base_img: Option<(Lsn, Bytes)>,
             records: Vec<(Lsn, NeonWalRecord)>,
             _pg_version: u32,
+            _redo_attempt_type: RedoAttemptType,
         ) -> Result<Bytes, walredo::Error> {
             let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
             if records_neon {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index b10409689c..8a4a6f4b40 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -24,6 +24,7 @@ use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 
 use crate::PERF_TRACE_TARGET;
+use crate::walredo::RedoAttemptType;
 use anyhow::{Context, Result, anyhow, bail, ensure};
 use arc_swap::{ArcSwap, ArcSwapOption};
 use bytes::Bytes;
@@ -1293,6 +1294,12 @@ impl Timeline {
         };
         reconstruct_state.read_path = read_path;
 
+        let redo_attempt_type = if ctx.task_kind() == TaskKind::Compaction {
+            RedoAttemptType::LegacyCompaction
+        } else {
+            RedoAttemptType::ReadPage
+        };
+
         let traversal_res: Result<(), _> = {
             let ctx = RequestContextBuilder::from(ctx)
                 .perf_span(|crnt_perf_span| {
@@ -1380,7 +1387,7 @@ impl Timeline {
 
                     let walredo_deltas = converted.num_deltas();
                     let walredo_res = walredo_self
-                        .reconstruct_value(key, lsn, converted)
+                        .reconstruct_value(key, lsn, converted, redo_attempt_type)
                         .maybe_perf_instrument(&ctx, |crnt_perf_span| {
                             info_span!(
                                 target: PERF_TRACE_TARGET,
@@ -6343,37 +6350,21 @@ impl Timeline {
 
     /// Reconstruct a value, using the given base image and WAL records in 'data'.
     async fn reconstruct_value(
-        &self,
-        key: Key,
-        request_lsn: Lsn,
-        data: ValueReconstructState,
-    ) -> Result<Bytes, PageReconstructError> {
-        self.reconstruct_value_inner(key, request_lsn, data, false)
-            .await
-    }
-
-    /// Reconstruct a value, using the given base image and WAL records in 'data'. It does not fire critical errors because
-    /// sometimes it is expected to fail due to unreplayable history described in <https://github.com/neondatabase/neon/issues/10395>.
-    async fn reconstruct_value_wo_critical_error(
-        &self,
-        key: Key,
-        request_lsn: Lsn,
-        data: ValueReconstructState,
-    ) -> Result<Bytes, PageReconstructError> {
-        self.reconstruct_value_inner(key, request_lsn, data, true)
-            .await
-    }
-
-    async fn reconstruct_value_inner(
         &self,
         key: Key,
         request_lsn: Lsn,
         mut data: ValueReconstructState,
-        no_critical_error: bool,
+        redo_attempt_type: RedoAttemptType,
     ) -> Result<Bytes, PageReconstructError> {
         // Perform WAL redo if needed
         data.records.reverse();
 
+        let fire_critical_error = match redo_attempt_type {
+            RedoAttemptType::ReadPage => true,
+            RedoAttemptType::LegacyCompaction => true,
+            RedoAttemptType::GcCompaction => false,
+        };
+
         // If we have a page image, and no WAL, we're all set
         if data.records.is_empty() {
             if let Some((img_lsn, img)) = &data.img {
@@ -6420,13 +6411,20 @@ impl Timeline {
                     .as_ref()
                     .context("timeline has no walredo manager")
                     .map_err(PageReconstructError::WalRedo)?
-                    .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
+                    .request_redo(
+                        key,
+                        request_lsn,
+                        data.img,
+                        data.records,
+                        self.pg_version,
+                        redo_attempt_type,
+                    )
                     .await;
                 let img = match res {
                     Ok(img) => img,
                     Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
                     Err(walredo::Error::Other(err)) => {
-                        if !no_critical_error {
+                        if fire_critical_error {
                             critical!("walredo failure during page reconstruction: {err:?}");
                         }
                         return Err(PageReconstructError::WalRedo(
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 8403c0a7d9..5f969a4e77 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -16,6 +16,8 @@ use super::{
     Timeline,
 };
 
+use crate::tenant::timeline::DeltaEntry;
+use crate::walredo::RedoAttemptType;
 use anyhow::{Context, anyhow};
 use bytes::Bytes;
 use enumset::EnumSet;
@@ -2411,7 +2413,7 @@ impl Timeline {
                     lsn_split_points[i]
                 };
                 let img = self
-                    .reconstruct_value_wo_critical_error(key, request_lsn, state)
+                    .reconstruct_value(key, request_lsn, state, RedoAttemptType::GcCompaction)
                     .await?;
                 Some((request_lsn, img))
             } else {
@@ -3909,8 +3911,6 @@ impl CompactionLayer<Key> for OwnArc<DeltaLayer> {
     }
 }
 
-use crate::tenant::timeline::DeltaEntry;
-
 impl CompactionLayer<Key> for ResidentDeltaLayer {
     fn key_range(&self) -> &Range<Key> {
         &self.0.layer_desc().key_range
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 22d8d83811..ed8a954369 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -136,6 +136,16 @@ macro_rules! bail {
     }
 }
 
+#[derive(Debug, Clone, Copy)]
+pub enum RedoAttemptType {
+    /// Used for the read path. Will fire critical errors and retry twice if failure.
+    ReadPage,
+    // Used for legacy compaction (only used in image compaction). Will fire critical errors and retry once if failure.
+    LegacyCompaction,
+    // Used for gc compaction. Will not fire critical errors and not retry.
+    GcCompaction,
+}
+
 ///
 /// Public interface of WAL redo manager
 ///
@@ -156,11 +166,18 @@ impl PostgresRedoManager {
         base_img: Option<(Lsn, Bytes)>,
         records: Vec<(Lsn, NeonWalRecord)>,
         pg_version: u32,
+        redo_attempt_type: RedoAttemptType,
     ) -> Result<Bytes, Error> {
         if records.is_empty() {
             bail!("invalid WAL redo request with no records");
         }
 
+        let max_retry_attempts = match redo_attempt_type {
+            RedoAttemptType::ReadPage => 2,
+            RedoAttemptType::LegacyCompaction => 1,
+            RedoAttemptType::GcCompaction => 0,
+        };
+
         let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
         let mut img = base_img.map(|p| p.1);
         let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1);
@@ -180,6 +197,7 @@ impl PostgresRedoManager {
                         &records[batch_start..i],
                         self.conf.wal_redo_timeout,
                         pg_version,
+                        max_retry_attempts,
                     )
                     .await
                 };
@@ -201,6 +219,7 @@ impl PostgresRedoManager {
                 &records[batch_start..],
                 self.conf.wal_redo_timeout,
                 pg_version,
+                max_retry_attempts,
             )
             .await
         }
@@ -424,11 +443,11 @@ impl PostgresRedoManager {
         records: &[(Lsn, NeonWalRecord)],
         wal_redo_timeout: Duration,
         pg_version: u32,
+        max_retry_attempts: u32,
     ) -> Result<Bytes, Error> {
         *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
 
         let (rel, blknum) = key.to_rel_block().context("invalid record")?;
-        const MAX_RETRY_ATTEMPTS: u32 = 1;
         let mut n_attempts = 0u32;
         loop {
             let base_img = &base_img;
@@ -486,7 +505,7 @@ impl PostgresRedoManager {
                 info!(n_attempts, "retried walredo succeeded");
             }
             n_attempts += 1;
-            if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
+            if n_attempts > max_retry_attempts || result.is_ok() {
                 return result;
             }
         }
@@ -560,6 +579,7 @@ mod tests {
 
     use super::PostgresRedoManager;
     use crate::config::PageServerConf;
+    use crate::walredo::RedoAttemptType;
 
     #[tokio::test]
     async fn test_ping() {
@@ -593,6 +613,7 @@ mod tests {
                 None,
                 short_records(),
                 14,
+                RedoAttemptType::ReadPage,
             )
             .instrument(h.span())
             .await
@@ -621,6 +642,7 @@ mod tests {
                 None,
                 short_records(),
                 14,
+                RedoAttemptType::ReadPage,
             )
             .instrument(h.span())
             .await
@@ -642,6 +664,7 @@ mod tests {
                 None,
                 short_records(),
                 16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
+                RedoAttemptType::ReadPage,
             )
             .instrument(h.span())
             .await

From 2c21a65b0b3776b2ed938fb2404f99ea312df148 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 9 Apr 2025 14:07:58 -0400
Subject: [PATCH 093/140] feat(pageserver): add gc-compaction
 time-to-first-item stats (#11475)

## Problem

In some cases gc-compaction doesn't respond to the L0 compaction yield
notifier. I suspect it's stuck on getting the first item, and if so, we
probably need to let L0 yield notifier preempt `next_with_trace`.

## Summary of changes

- Add `time_to_first_kv_pair` to gc-compaction statistics.
- Inverse the ratio so that smaller ratio -> better compaction ratio.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/timeline/compaction.rs | 30 +++++++++++++-------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 5f969a4e77..8e3be8e7f4 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -7,7 +7,7 @@
 use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
 use std::ops::{Deref, Range};
 use std::sync::Arc;
-use std::time::Instant;
+use std::time::{Duration, Instant};
 
 use super::layer_manager::LayerManager;
 use super::{
@@ -821,15 +821,16 @@ pub struct CompactionStatistics {
     time_acquire_lock_secs: f64,
     time_analyze_secs: f64,
     time_download_layer_secs: f64,
+    time_to_first_kv_pair_secs: f64,
     time_main_loop_secs: f64,
     time_final_phase_secs: f64,
     time_total_secs: f64,
 
     // Summary
-    /// Ratio of the key-value size before/after gc-compaction.
-    uncompressed_size_ratio: f64,
-    /// Ratio of the physical size before/after gc-compaction.
-    physical_size_ratio: f64,
+    /// Ratio of the key-value size after/before gc-compaction.
+    uncompressed_retention_ratio: f64,
+    /// Ratio of the physical size after/before gc-compaction.
+    compressed_retention_ratio: f64,
 }
 
 impl CompactionStatistics {
@@ -898,15 +899,15 @@ impl CompactionStatistics {
     fn finalize(&mut self) {
         let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size;
         let produced_key_value_size = self.image_produced.size + self.wal_produced.size;
-        self.uncompressed_size_ratio =
-            original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0
+        self.uncompressed_retention_ratio =
+            produced_key_value_size as f64 / (original_key_value_size as f64 + 1.0); // avoid div by 0
         let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size;
         let produced_physical_size = self.image_layer_produced.size
             + self.delta_layer_produced.size
             + self.image_layer_discarded.size
             + self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate
-        self.physical_size_ratio =
-            original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0
+        self.compressed_retention_ratio =
+            produced_physical_size as f64 / (original_physical_size as f64 + 1.0); // avoid div by 0
     }
 }
 
@@ -3034,7 +3035,7 @@ impl Timeline {
         .map_err(CompactionError::Other)?;
 
         let time_download_layer = timer.elapsed();
-        let timer = Instant::now();
+        let mut timer = Instant::now();
 
         // Step 2: Produce images+deltas.
         let mut accumulated_values = Vec::new();
@@ -3109,6 +3110,7 @@ impl Timeline {
         // Actually, we can decide not to write to the image layer at all at this point because
         // the key and LSN range are determined. However, to keep things simple here, we still
         // create this writer, and discard the writer in the end.
+        let mut time_to_first_kv_pair = None;
 
         while let Some(((key, lsn, val), desc)) = merge_iter
             .next_with_trace()
@@ -3116,6 +3118,11 @@ impl Timeline {
             .context("failed to get next key-value pair")
             .map_err(CompactionError::Other)?
         {
+            if time_to_first_kv_pair.is_none() {
+                time_to_first_kv_pair = Some(timer.elapsed());
+                timer = Instant::now();
+            }
+
             if cancel.is_cancelled() {
                 return Err(CompactionError::ShuttingDown);
             }
@@ -3451,6 +3458,9 @@ impl Timeline {
         let time_final_phase = timer.elapsed();
 
         stat.time_final_phase_secs = time_final_phase.as_secs_f64();
+        stat.time_to_first_kv_pair_secs = time_to_first_kv_pair
+            .unwrap_or(Duration::ZERO)
+            .as_secs_f64();
         stat.time_main_loop_secs = time_main_loop.as_secs_f64();
         stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64();
         stat.time_download_layer_secs = time_download_layer.as_secs_f64();

From 63ee8e218195e42daa305085ad847a38ceda93cb Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 9 Apr 2025 21:03:49 +0200
Subject: [PATCH 094/140] test_runner: ignore `.___temp` files in
 `evict_random_layers` (#11509)

## Problem

`test_location_conf_churn` often fails with `neither image nor delta
layer`, but doesn't say what the file actually is. However, past local
failures have indicated that it might be `.___temp` files.

Touches https://github.com/neondatabase/neon/issues/11348.

## Summary of changes

Ignore `.___temp` files when evicting local layers, and include the file
name in the error message.
---
 test_runner/fixtures/pageserver/common_types.py  | 2 +-
 test_runner/regress/test_pageserver_secondary.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_runner/fixtures/pageserver/common_types.py b/test_runner/fixtures/pageserver/common_types.py
index 0e068db593..0a92883e96 100644
--- a/test_runner/fixtures/pageserver/common_types.py
+++ b/test_runner/fixtures/pageserver/common_types.py
@@ -105,7 +105,7 @@ def parse_layer_file_name(file_name: str) -> LayerName:
     except InvalidFileName:
         pass
 
-    raise InvalidFileName("neither image nor delta layer")
+    raise InvalidFileName(f"neither image nor delta layer: {file_name}")
 
 
 def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn):
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index c73a592d98..d03d05d33d 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -61,7 +61,7 @@ def evict_random_layers(
     )
     client = pageserver.http_client()
     for layer in initial_local_layers:
-        if "ephemeral" in layer.name or "temp_download" in layer.name:
+        if "ephemeral" in layer.name or "temp_download" in layer.name or ".___temp" in layer.name:
             continue
 
         layer_name = parse_layer_file_name(layer.name)

From 405a17bf0b3bf2aa646bf5be561112408733e36c Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 9 Apr 2025 16:57:50 -0400
Subject: [PATCH 095/140] fix(pageserver): ensure gc-compaction gets preempted
 by L0 (#11512)

## Problem

Part of #9114

## Summary of changes

Gc-compaction flag was not correctly set, causing it not getting
preempted by L0.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/http/routes.rs                |  1 +
 pageserver/src/tenant/timeline/compaction.rs |  3 +
 test_runner/regress/test_compaction.py       | 59 +++++++++++++++++++-
 3 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 200b91fc82..9bb761dc48 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2274,6 +2274,7 @@ async fn timeline_compact_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? {
         flags |= CompactFlags::DryRun;
     }
+    // Manual compaction does not yield for L0.
 
     let wait_until_uploaded =
         parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 8e3be8e7f4..7b1969f209 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -317,6 +317,9 @@ impl GcCompactionQueue {
                     flags: {
                         let mut flags = EnumSet::new();
                         flags |= CompactFlags::EnhancedGcBottomMostCompaction;
+                        if timeline.get_compaction_l0_first() {
+                            flags |= CompactFlags::YieldForL0;
+                        }
                         flags
                     },
                     sub_compaction: true,
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 6789939e0c..087fafb327 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -38,12 +38,34 @@ PREEMPT_COMPACTION_TENANT_CONF = {
     "compaction_target_size": 1024**2,
     "image_creation_threshold": 1,
     "image_creation_preempt_threshold": 1,
-    # compact more frequently
+    # Compact more frequently
     "compaction_threshold": 3,
     "compaction_upper_limit": 6,
     "lsn_lease_length": "0s",
 }
 
+PREEMPT_GC_COMPACTION_TENANT_CONF = {
+    "gc_period": "5s",
+    "compaction_period": "5s",
+    # Small checkpoint distance to create many layers
+    "checkpoint_distance": 1024**2,
+    # Compact small layers
+    "compaction_target_size": 1024**2,
+    "image_creation_threshold": 10000,  # Do not create image layers at all
+    "image_creation_preempt_threshold": 10000,
+    # Compact more frequently
+    "compaction_threshold": 3,
+    "compaction_upper_limit": 6,
+    "lsn_lease_length": "0s",
+    # Enable gc-compaction
+    "gc_compaction_enabled": "true",
+    "gc_compaction_initial_threshold_kb": 1024,  # At a small threshold
+    "gc_compaction_ratio_percent": 1,
+    # No PiTR interval and small GC horizon
+    "pitr_interval": "0s",
+    "gc_horizon": f"{1024**2}",
+}
+
 
 @skip_in_debug_build("only run with release build")
 @pytest.mark.parametrize(
@@ -165,6 +187,41 @@ def test_pageserver_compaction_preempt(
     env.pageserver.assert_log_contains("resuming image layer creation")
 
 
+@skip_in_debug_build("only run with release build")
+def test_pageserver_gc_compaction_preempt(
+    neon_env_builder: NeonEnvBuilder,
+):
+    # Ideally we should be able to do unit tests for this, but we need real Postgres
+    # WALs in order to do unit testing...
+
+    conf = PREEMPT_GC_COMPACTION_TENANT_CONF.copy()
+    env = neon_env_builder.init_start(initial_tenant_conf=conf)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    row_count = 200000
+    churn_rounds = 10
+
+    ps_http = env.pageserver.http_client()
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageserver.id)
+
+    log.info("Writing initial data ...")
+    workload.write_rows(row_count, env.pageserver.id)
+
+    for i in range(1, churn_rounds + 1):
+        log.info(f"Running churn round {i}/{churn_rounds} ...")
+        workload.churn_rows(row_count, env.pageserver.id, upload=False)
+        workload.validate(env.pageserver.id)
+    ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True)
+    log.info("Validating at workload end ...")
+    workload.validate(env.pageserver.id)
+    # ensure gc_compaction gets preempted and then resumed
+    env.pageserver.assert_log_contains("preempt gc-compaction")
+
+
 @skip_in_debug_build("only run with release build")
 @pytest.mark.timeout(900)  # This test is slow with sanitizers enabled, especially on ARM
 @pytest.mark.parametrize(

From af0be11503dc94e78b9f664d1d1096004eab9a4a Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 9 Apr 2025 17:41:11 -0400
Subject: [PATCH 096/140] fix(pageserver): ensure gc-compaction gets preempted
 by L0 (#11512)

## Problem

Part of #9114

## Summary of changes

Gc-compaction flag was not correctly set, causing it not getting
preempted by L0.

Signed-off-by: Alex Chi Z <chi@neon.tech>

From a04e33ceb638a3ee5fef8d642b57ffc3a4543c98 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 9 Apr 2025 17:39:54 -0500
Subject: [PATCH 097/140] Remove --spec-json argument from compute_ctl (#11510)

It isn't used by the production control plane or neon_local. The removal
simplifies compute spec logic just a little bit more since we can remove
any notion of whether we should allow live reconfigurations.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute_tools/src/bin/compute_ctl.rs       | 28 +++++++---------------
 compute_tools/src/compute.rs               | 14 -----------
 compute_tools/src/http/routes/configure.rs |  7 ------
 3 files changed, 9 insertions(+), 40 deletions(-)

diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index da11ac2860..4796a07d92 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -118,16 +118,18 @@ struct Cli {
     #[arg(long)]
     pub set_disk_quota_for_fs: Option<String>,
 
-    #[arg(short = 's', long = "spec", group = "spec")]
-    pub spec_json: Option<String>,
-
     #[arg(short = 'S', long, group = "spec-path")]
     pub spec_path: Option<OsString>,
 
     #[arg(short = 'i', long, group = "compute-id")]
     pub compute_id: String,
 
-    #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")]
+    #[arg(
+        short = 'p',
+        long,
+        conflicts_with = "spec-path",
+        value_name = "CONTROL_PLANE_API_BASE_URL"
+    )]
     pub control_plane_uri: Option<String>,
 }
 
@@ -172,7 +174,6 @@ fn main() -> Result<()> {
             cgroup: cli.cgroup,
             #[cfg(target_os = "linux")]
             vm_monitor_addr: cli.vm_monitor_addr,
-            live_config_allowed: cli_spec.live_config_allowed,
         },
         cli_spec.spec,
         cli_spec.compute_ctl_config,
@@ -201,23 +202,12 @@ async fn init() -> Result<()> {
 }
 
 fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
-    // First, try to get cluster spec from the cli argument
-    if let Some(ref spec_json) = cli.spec_json {
-        info!("got spec from cli argument {}", spec_json);
-        return Ok(CliSpecParams {
-            spec: Some(serde_json::from_str(spec_json)?),
-            compute_ctl_config: ComputeCtlConfig::default(),
-            live_config_allowed: false,
-        });
-    }
-
-    // Second, try to read it from the file if path is provided
+    // First, read spec from the path if provided
     if let Some(ref spec_path) = cli.spec_path {
         let file = File::open(Path::new(spec_path))?;
         return Ok(CliSpecParams {
             spec: Some(serde_json::from_reader(file)?),
             compute_ctl_config: ComputeCtlConfig::default(),
-            live_config_allowed: true,
         });
     }
 
@@ -225,11 +215,12 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
         panic!("must specify --control-plane-uri");
     };
 
+    // If the spec wasn't provided in the CLI arguments, then retrieve it from
+    // the control plane
     match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
         Ok(resp) => Ok(CliSpecParams {
             spec: resp.0,
             compute_ctl_config: resp.1,
-            live_config_allowed: true,
         }),
         Err(e) => {
             error!(
@@ -247,7 +238,6 @@ struct CliSpecParams {
     spec: Option<ComputeSpec>,
     #[allow(dead_code)]
     compute_ctl_config: ComputeCtlConfig,
-    live_config_allowed: bool,
 }
 
 fn deinit_and_exit(exit_code: Option<i32>) -> ! {
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 9dfcde1dbc..ad8925e7ab 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -93,20 +93,6 @@ pub struct ComputeNodeParams {
 
     /// the address of extension storage proxy gateway
     pub ext_remote_storage: Option<String>,
-
-    /// We should only allow live re- / configuration of the compute node if
-    /// it uses 'pull model', i.e. it can go to control-plane and fetch
-    /// the latest configuration. Otherwise, there could be a case:
-    /// - we start compute with some spec provided as argument
-    /// - we push new spec and it does reconfiguration
-    /// - but then something happens and compute pod / VM is destroyed,
-    ///   so k8s controller starts it again with the **old** spec
-    ///
-    /// and the same for empty computes:
-    /// - we started compute without any spec
-    /// - we push spec and it does configuration
-    /// - but then it is restarted without any spec again
-    pub live_config_allowed: bool,
 }
 
 /// Compute node info shared across several `compute_ctl` threads.
diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs
index 3c5a6a6d41..f7a19da611 100644
--- a/compute_tools/src/http/routes/configure.rs
+++ b/compute_tools/src/http/routes/configure.rs
@@ -22,13 +22,6 @@ pub(in crate::http) async fn configure(
     State(compute): State<Arc<ComputeNode>>,
     request: Json<ConfigurationRequest>,
 ) -> Response {
-    if !compute.params.live_config_allowed {
-        return JsonResponse::error(
-            StatusCode::PRECONDITION_FAILED,
-            "live configuration is not allowed for this compute node".to_string(),
-        );
-    }
-
     let pspec = match ParsedSpec::try_from(request.spec.clone()) {
         Ok(p) => p,
         Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),

From 8a72e6f8884c975c05457a280af2f02056360a6e Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Thu, 10 Apr 2025 12:45:17 +0400
Subject: [PATCH 098/140] pageserver: add enable_tls_page_service_api (#11508)

## Problem
Page service doesn't use TLS for incoming requests.
- Closes: https://github.com/neondatabase/cloud/issues/27236

## Summary of changes
- Add option `enable_tls_page_service_api` to pageserver config
- Propagate `tls_server_config` to `page_service` if the option is
enabled

No integration tests for now because I didn't find out how to call page
service API from python and AFAIK computes don't support TLS yet
---
 libs/pageserver_api/src/config.rs |  2 ++
 pageserver/src/bin/pageserver.rs  | 36 ++++++++++++++++++++++---------
 pageserver/src/config.rs          |  7 ++++++
 pageserver/src/page_service.rs    |  8 ++++++-
 4 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 8f56d60a4a..bd9f7efb7f 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -180,6 +180,7 @@ pub struct ConfigToml {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub generate_unarchival_heatmap: Option<bool>,
     pub tracing: Option<Tracing>,
+    pub enable_tls_page_service_api: bool,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -631,6 +632,7 @@ impl Default for ConfigToml {
             load_previous_heatmap: None,
             generate_unarchival_heatmap: None,
             tracing: None,
+            enable_tls_page_service_api: false,
         }
     }
 }
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 54fecee588..2740f81758 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -452,6 +452,23 @@ fn start_pageserver(
     info!("Using auth for http API: {:#?}", conf.http_auth_type);
     info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);
 
+    let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
+    {
+        let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
+            &conf.ssl_key_file,
+            &conf.ssl_cert_file,
+            conf.ssl_cert_reload_period,
+        ))?;
+
+        let server_config = rustls::ServerConfig::builder()
+            .with_no_client_auth()
+            .with_cert_resolver(resolver);
+
+        Some(Arc::new(server_config))
+    } else {
+        None
+    };
+
     match var("NEON_AUTH_TOKEN") {
         Ok(v) => {
             info!("Loaded JWT token for authentication with Safekeeper");
@@ -670,17 +687,11 @@ fn start_pageserver(
 
         let https_task = match https_listener {
             Some(https_listener) => {
-                let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new(
-                    &conf.ssl_key_file,
-                    &conf.ssl_cert_file,
-                    conf.ssl_cert_reload_period,
-                ))?;
+                let tls_server_config = tls_server_config
+                    .clone()
+                    .expect("tls_server_config is set earlier if https is enabled");
 
-                let server_config = rustls::ServerConfig::builder()
-                    .with_no_client_auth()
-                    .with_cert_resolver(resolver);
-
-                let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
+                let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config);
 
                 let server =
                     http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
@@ -736,6 +747,11 @@ fn start_pageserver(
             tokio::net::TcpListener::from_std(pageserver_listener)
                 .context("create tokio listener")?
         },
+        if conf.enable_tls_page_service_api {
+            tls_server_config
+        } else {
+            None
+        },
     );
 
     // All started up! Now just sit and wait for shutdown signal.
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index ccc29e59d4..26ae6af70e 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -219,6 +219,11 @@ pub struct PageServerConf {
     pub generate_unarchival_heatmap: bool,
 
     pub tracing: Option<pageserver_api::config::Tracing>,
+
+    /// Enable TLS in page service API.
+    /// Does not force TLS: the client negotiates TLS usage during the handshake.
+    /// Uses key and certificate from ssl_key_file/ssl_cert_file.
+    pub enable_tls_page_service_api: bool,
 }
 
 /// Token for authentication to safekeepers
@@ -391,6 +396,7 @@ impl PageServerConf {
             load_previous_heatmap,
             generate_unarchival_heatmap,
             tracing,
+            enable_tls_page_service_api,
         } = config_toml;
 
         let mut conf = PageServerConf {
@@ -441,6 +447,7 @@ impl PageServerConf {
             page_service_pipelining,
             get_vectored_concurrent_io,
             tracing,
+            enable_tls_page_service_api,
 
             // ------------------------------------------------------------
             // fields that require additional validation or custom handling
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 7e3991dbdc..61f524fc29 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -105,6 +105,7 @@ pub fn spawn(
     pg_auth: Option<Arc<SwappableJwtAuth>>,
     perf_trace_dispatch: Option<Dispatch>,
     tcp_listener: tokio::net::TcpListener,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
 ) -> Listener {
     let cancel = CancellationToken::new();
     let libpq_ctx = RequestContext::todo_child(
@@ -124,6 +125,7 @@ pub fn spawn(
             perf_trace_dispatch,
             tcp_listener,
             conf.pg_auth_type,
+            tls_config,
             conf.page_service_pipelining.clone(),
             libpq_ctx,
             cancel.clone(),
@@ -181,6 +183,7 @@ pub async fn libpq_listener_main(
     perf_trace_dispatch: Option<Dispatch>,
     listener: tokio::net::TcpListener,
     auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
     pipelining_config: PageServicePipeliningConfig,
     listener_ctx: RequestContext,
     listener_cancel: CancellationToken,
@@ -223,6 +226,7 @@ pub async fn libpq_listener_main(
                     local_auth,
                     socket,
                     auth_type,
+                    tls_config.clone(),
                     pipelining_config.clone(),
                     connection_ctx,
                     connections_cancel.child_token(),
@@ -264,6 +268,7 @@ async fn page_service_conn_main(
     auth: Option<Arc<SwappableJwtAuth>>,
     socket: tokio::net::TcpStream,
     auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
     pipelining_config: PageServicePipeliningConfig,
     connection_ctx: RequestContext,
     cancel: CancellationToken,
@@ -334,7 +339,8 @@ async fn page_service_conn_main(
         cancel.clone(),
         gate_guard,
     );
-    let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
+    let pgbackend =
+        PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?;
 
     match pgbackend.run(&mut conn_handler, &cancel).await {
         Ok(()) => {

From fae7528adb7a497d04a37f61aeccbbfb8e207f9e Mon Sep 17 00:00:00 2001
From: Arseny Sher <ars@neon.tech>
Date: Thu, 10 Apr 2025 12:55:37 +0300
Subject: [PATCH 099/140] walproposer: make it aware of membership (#11407)

## Problem

Walproposer should get elected and commit WAL on safekeepers specified
by the membership configuration.

## Summary of changes

- Add to wp `members_safekeepers` and `new_members_safekeepers` arrays
mapping configuration members to connection slots. Establish this
mapping (by node id) when safekeeper sends greeting, giving its id and
when mconf becomes known / changes.
- Add to TermsCollected, VotesCollected,
GetAcknowledgedByQuorumWALPosition membership aware logic. Currently it
partially duplicates existing one, but we'll drop the latter eventually.
- In python, rename Configuration to MembershipConfiguration for
clarity.
- Add test_quorum_sanity testing new logic.

ref https://github.com/neondatabase/neon/issues/10851
---
 pgxn/neon/walproposer.c                       | 459 ++++++++++++++++--
 pgxn/neon/walproposer.h                       |  15 +-
 test_runner/fixtures/neon_fixtures.py         |  51 +-
 test_runner/fixtures/safekeeper/http.py       |  28 +-
 test_runner/regress/test_wal_acceptor.py      |  40 +-
 .../regress/test_wal_acceptor_async.py        | 173 +++++--
 6 files changed, 644 insertions(+), 122 deletions(-)

diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 0336d63e8d..6b133e4dc4 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -99,6 +99,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	wp->config = config;
 	wp->api = api;
 	wp->state = WPS_COLLECTING_TERMS;
+	wp->mconf.generation = INVALID_GENERATION;
+	wp->mconf.members.len = 0;
+	wp->mconf.new_members.len = 0;
 
 	wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);
 
@@ -170,6 +173,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 
 	if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
 		wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
+	if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3)
+		wp_log(FATAL, "enabling generations requires protocol version 3");
 	wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);
 
 	/* Fill the greeting package */
@@ -214,7 +219,7 @@ WalProposerFree(WalProposer *wp)
 static bool
 WalProposerGenerationsEnabled(WalProposer *wp)
 {
-	return wp->safekeepers_generation != 0;
+	return wp->safekeepers_generation != INVALID_GENERATION;
 }
 
 /*
@@ -723,13 +728,176 @@ SendProposerGreeting(Safekeeper *sk)
 	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
 }
 
+/*
+ * Assuming `sk` sent its node id, find such member(s) in wp->mconf and set ptr in
+ * members_safekeepers & new_members_safekeepers to sk.
+ */
+static void
+UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk)
+{
+	/* members_safekeepers etc are fixed size, sanity check mconf size */
+	if (wp->mconf.members.len > MAX_SAFEKEEPERS)
+		wp_log(FATAL, "too many members %d in mconf", wp->mconf.members.len);
+	if (wp->mconf.new_members.len > MAX_SAFEKEEPERS)
+		wp_log(FATAL, "too many new_members %d in mconf", wp->mconf.new_members.len);
+
+	/* node id is not known until greeting is received */
+	if (sk->state < SS_WAIT_VOTING)
+		return;
+
+	/* 0 is assumed to be invalid node id, should never happen */
+	if (sk->greetResponse.nodeId == 0)
+	{
+		wp_log(WARNING, "safekeeper %s:%s sent zero node id", sk->host, sk->port);
+		return;
+	}
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		SafekeeperId *sk_id = &wp->mconf.members.m[i];
+
+		if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId)
+		{
+			/*
+			 * If mconf or list of safekeepers to connect to changed (the
+			 * latter always currently goes through restart though),
+			 * ResetMemberSafekeeperPtrs is expected to be called before
+			 * UpdateMemberSafekeeperPtr. So, other value suggests that we are
+			 * connected to the same sk under different host name, complain
+			 * about that.
+			 */
+			if (wp->members_safekeepers[i] != NULL && wp->members_safekeepers[i] != sk)
+			{
+				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in members[%u] is already mapped to connection slot %lu",
+					   sk_id->node_id, sk_id->host, sk_id->port, i, wp->members_safekeepers[i] - wp->safekeeper);
+			}
+			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in members[%u] mapped to connection slot %lu",
+				   sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
+			wp->members_safekeepers[i] = sk;
+		}
+	}
+	/* repeat for new_members */
+	for (uint32 i = 0; i < wp->mconf.new_members.len; i++)
+	{
+		SafekeeperId *sk_id = &wp->mconf.new_members.m[i];
+
+		if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId)
+		{
+			if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk)
+			{
+				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] is already mapped to connection slot %lu",
+					   sk_id->node_id, sk_id->host, sk_id->port, i, wp->new_members_safekeepers[i] - wp->safekeeper);
+			}
+			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] mapped to connection slot %lu",
+				   sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
+			wp->new_members_safekeepers[i] = sk;
+		}
+	}
+}
+
+/*
+ * Reset wp->members_safekeepers & new_members_safekeepers and refill them.
+ * Called after wp changes mconf.
+ */
+static void
+ResetMemberSafekeeperPtrs(WalProposer *wp)
+{
+	memset(&wp->members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
+	memset(&wp->new_members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
+	for (int i = 0; i < wp->n_safekeepers; i++)
+	{
+		if (wp->safekeeper[i].state >= SS_WAIT_VOTING)
+			UpdateMemberSafekeeperPtr(wp, &wp->safekeeper[i]);
+	}
+}
+
+static uint32
+MsetQuorum(MemberSet *mset)
+{
+	Assert(mset->len > 0);
+	return mset->len / 2 + 1;
+}
+
+/* Does n forms quorum in mset? */
+static bool
+MsetHasQuorum(MemberSet *mset, uint32 n)
+{
+	return n >= MsetQuorum(mset);
+}
+
+/*
+ * TermsCollected helper for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
+ */
+static bool
+TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
+{
+	uint32		n_greeted = 0;
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		Safekeeper *sk = msk[i];
+
+		if (sk != NULL && sk->state == SS_WAIT_VOTING)
+		{
+			if (n_greeted > 0)
+				appendStringInfoString(s, ", ");
+			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
+			n_greeted++;
+		}
+	}
+	appendStringInfo(s, ", %u/%u total", n_greeted, mset->len);
+	return MsetHasQuorum(mset, n_greeted);
+}
+
 /*
  * Have we received greeting from enough (quorum) safekeepers to start voting?
  */
 static bool
 TermsCollected(WalProposer *wp)
 {
-	return wp->n_connected >= wp->quorum;
+	StringInfoData s;			/* str for logging */
+	bool		collected = false;
+
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
+	{
+		collected = wp->n_connected >= wp->quorum;
+		if (collected)
+		{
+			wp->propTerm++;
+			wp_log(LOG, "walproposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT ", starting voting", wp->quorum, wp->propTerm);
+		}
+		return collected;
+	}
+
+	/*
+	 * With generations enabled, we start campaign only when 1) some mconf is
+	 * actually received 2) we have greetings from majority of members as well
+	 * as from majority of new_members if it exists.
+	 */
+	if (wp->mconf.generation == INVALID_GENERATION)
+		return false;
+
+	initStringInfo(&s);
+	appendStringInfoString(&s, "mset greeters: ");
+	if (!TermsCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
+		goto res;
+	if (wp->mconf.new_members.len > 0)
+	{
+		appendStringInfoString(&s, ", new_mset greeters: ");
+		if (!TermsCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
+			goto res;
+	}
+	wp->propTerm++;
+	wp_log(LOG, "walproposer connected to quorum of safekeepers: %s, propTerm=" INT64_FORMAT ", starting voting", s.data, wp->propTerm);
+	collected = true;
+
+res:
+	pfree(s.data);
+	return collected;
 }
 
 static void
@@ -753,13 +921,41 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	pfree(mconf_toml);
 
 	/*
-	 * Adopt mconf of safekeepers if it is higher. TODO: mconf change should
-	 * restart wp if it started voting.
+	 * Adopt mconf of safekeepers if it is higher.
 	 */
 	if (sk->greetResponse.mconf.generation > wp->mconf.generation)
 	{
+		/* sanity check before adopting, should never happen */
+		if (sk->greetResponse.mconf.members.len == 0)
+		{
+			wp_log(FATAL, "mconf %u has zero members", sk->greetResponse.mconf.generation);
+		}
+
+		/*
+		 * If we at least started campaign, restart wp to get elected in the
+		 * new mconf. Note: in principle once wp is already elected
+		 * re-election is not required, but being conservative here is not
+		 * bad.
+		 *
+		 * TODO: put mconf to shmem to immediately pick it up on start,
+		 * otherwise if some safekeeper(s) misses latest mconf and gets
+		 * connected the first, it may cause redundant restarts here.
+		 *
+		 * More generally, it would be nice to restart walproposer (wiping
+		 * election state) without restarting the process. In particular, that
+		 * would allow sync-safekeepers not to die here if it intersected with
+		 * sk migration (as well as remove 1s delay).
+		 *
+		 * Note that assign_neon_safekeepers also currently restarts the
+		 * process, so during normal migration walproposer may restart twice.
+		 */
+		if (wp->state >= WPS_CAMPAIGN)
+		{
+			wp_log(FATAL, "restarting to adopt mconf generation %d", sk->greetResponse.mconf.generation);
+		}
 		MembershipConfigurationFree(&wp->mconf);
 		MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf);
+		ResetMemberSafekeeperPtrs(wp);
 		/* full conf was just logged above */
 		wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation);
 	}
@@ -767,6 +963,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	/* Protocol is all good, move to voting. */
 	sk->state = SS_WAIT_VOTING;
 
+	/* In greeting safekeeper sent its id; update mappings accordingly. */
+	UpdateMemberSafekeeperPtr(wp, sk);
+
 	/*
 	 * Note: it would be better to track the counter on per safekeeper basis,
 	 * but at worst walproposer would restart with 'term rejected', so leave
@@ -778,12 +977,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 		/* We're still collecting terms from the majority. */
 		wp->propTerm = Max(sk->greetResponse.term, wp->propTerm);
 
-		/* Quorum is acquried, prepare the vote request. */
+		/* Quorum is acquired, prepare the vote request. */
 		if (TermsCollected(wp))
 		{
-			wp->propTerm++;
-			wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
-
 			wp->state = WPS_CAMPAIGN;
 			wp->voteRequest.pam.tag = 'v';
 			wp->voteRequest.generation = wp->mconf.generation;
@@ -832,8 +1028,8 @@ SendVoteRequest(Safekeeper *sk)
 					   &sk->outbuf, wp->config->proto_version);
 
 	/* We have quorum for voting, send our vote request */
-	wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
-		   wp->voteRequest.generation, wp->voteRequest.term);
+	wp_log(LOG, "requesting vote from sk {id = %lu, ep = %s:%s} for generation %u term " UINT64_FORMAT,
+		   sk->greetResponse.nodeId, sk->host, sk->port, wp->voteRequest.generation, wp->voteRequest.term);
 	/* On failure, logging & resetting is handled */
 	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
 	/* If successful, wait for read-ready with SS_WAIT_VERDICT */
@@ -851,8 +1047,8 @@ RecvVoteResponse(Safekeeper *sk)
 		return;
 
 	wp_log(LOG,
-		   "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
-		   sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
+		   "got VoteResponse from sk {id = %lu, ep = %s:%s}, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
+		   sk->greetResponse.nodeId, sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
 		   sk->voteResponse.voteGiven,
 		   GetHighestTerm(&sk->voteResponse.termHistory),
 		   LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
@@ -899,6 +1095,53 @@ RecvVoteResponse(Safekeeper *sk)
 	}
 }
 
+/*
+ * VotesCollected helper for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
+ */
+static bool
+VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
+{
+	uint32		n_votes = 0;
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		Safekeeper *sk = msk[i];
+
+		if (sk != NULL && sk->state == SS_WAIT_ELECTED)
+		{
+			Assert(sk->voteResponse.voteGiven);
+
+			/*
+			 * Find the highest vote. NULL check is for the legacy case where
+			 * safekeeper might be not initialized with LSN at all and return
+			 * 0 LSN in the vote response; we still want to set donor to
+			 * something in this case.
+			 */
+			if (GetLastLogTerm(sk) > wp->donorLastLogTerm ||
+				(GetLastLogTerm(sk) == wp->donorLastLogTerm &&
+				 sk->voteResponse.flushLsn > wp->propTermStartLsn) ||
+				wp->donor == NULL)
+			{
+				wp->donorLastLogTerm = GetLastLogTerm(sk);
+				wp->propTermStartLsn = sk->voteResponse.flushLsn;
+				wp->donor = sk;
+			}
+			wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
+
+			if (n_votes > 0)
+				appendStringInfoString(s, ", ");
+			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
+			n_votes++;
+		}
+	}
+	appendStringInfo(s, ", %u/%u total", n_votes, mset->len);
+	return MsetHasQuorum(mset, n_votes);
+}
+
+
 /*
  * Checks if enough votes has been collected to get elected and if that's the
  * case finds the highest vote, setting donor, donorLastLogTerm,
@@ -907,7 +1150,8 @@ RecvVoteResponse(Safekeeper *sk)
 static bool
 VotesCollected(WalProposer *wp)
 {
-	int			n_ready = 0;
+	StringInfoData s;			/* str for logging */
+	bool		collected = false;
 
 	/* assumed to be called only when not elected yet */
 	Assert(wp->state == WPS_CAMPAIGN);
@@ -916,25 +1160,62 @@ VotesCollected(WalProposer *wp)
 	wp->donorLastLogTerm = 0;
 	wp->truncateLsn = InvalidXLogRecPtr;
 
-	for (int i = 0; i < wp->n_safekeepers; i++)
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
 	{
-		if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
-		{
-			n_ready++;
+		int			n_ready = 0;
 
-			if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
-				(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
-				 wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn))
+		for (int i = 0; i < wp->n_safekeepers; i++)
+		{
+			if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
 			{
-				wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
-				wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
-				wp->donor = i;
+				n_ready++;
+
+				if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
+					(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
+					 wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn) ||
+					wp->donor == NULL)
+				{
+					wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
+					wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
+					wp->donor = &wp->safekeeper[i];
+				}
+				wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
 			}
-			wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
 		}
+		collected = n_ready >= wp->quorum;
+		if (collected)
+		{
+			wp_log(LOG, "walproposer elected with %d/%d votes", n_ready, wp->n_safekeepers);
+		}
+		return collected;
 	}
 
-	return n_ready >= wp->quorum;
+	/*
+	 * if generations are enabled we're expected to get to voting only when
+	 * mconf is established.
+	 */
+	Assert(wp->mconf.generation != INVALID_GENERATION);
+
+	/*
+	 * We must get votes from both msets if both are present.
+	 */
+	initStringInfo(&s);
+	appendStringInfoString(&s, "mset voters: ");
+	if (!VotesCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
+		goto res;
+	if (wp->mconf.new_members.len > 0)
+	{
+		appendStringInfoString(&s, ", new_mset voters: ");
+		if (!VotesCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
+			goto res;
+	}
+	wp_log(LOG, "walproposer elected, %s", s.data);
+	collected = true;
+
+res:
+	pfree(s.data);
+	return collected;
 }
 
 /*
@@ -955,7 +1236,7 @@ HandleElectedProposer(WalProposer *wp)
 	 * that only for logical replication (and switching logical walsenders to
 	 * neon_walreader is a todo.)
 	 */
-	if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
+	if (!wp->api.recovery_download(wp, wp->donor))
 	{
 		wp_log(FATAL, "failed to download WAL for logical replicaiton");
 	}
@@ -1078,7 +1359,7 @@ ProcessPropStartPos(WalProposer *wp)
 	/*
 	 * Proposer's term history is the donor's + its own entry.
 	 */
-	dth = &wp->safekeeper[wp->donor].voteResponse.termHistory;
+	dth = &wp->donor->voteResponse.termHistory;
 	wp->propTermHistory.n_entries = dth->n_entries + 1;
 	wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries);
 	if (dth->n_entries > 0)
@@ -1086,11 +1367,10 @@ ProcessPropStartPos(WalProposer *wp)
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn;
 
-	wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
-		   wp->quorum,
+	wp_log(LOG, "walproposer elected in term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
 		   wp->propTerm,
 		   LSN_FORMAT_ARGS(wp->propTermStartLsn),
-		   wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
+		   wp->donor->host, wp->donor->port,
 		   LSN_FORMAT_ARGS(wp->truncateLsn));
 
 	/*
@@ -1508,6 +1788,14 @@ RecvAppendResponses(Safekeeper *sk)
 
 		readAnything = true;
 
+		/* should never happen: sk is expected to send ERROR instead */
+		if (sk->appendResponse.generation != wp->mconf.generation)
+		{
+			wp_log(FATAL, "safekeeper {id = %lu, ep = %s:%s} sent response with generation %u, expected %u",
+				   sk->greetResponse.nodeId, sk->host, sk->port,
+				   sk->appendResponse.generation, wp->mconf.generation);
+		}
+
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/*
@@ -1624,30 +1912,101 @@ CalculateMinFlushLsn(WalProposer *wp)
 }
 
 /*
- * Calculate WAL position acknowledged by quorum
+ * GetAcknowledgedByQuorumWALPosition for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
  */
 static XLogRecPtr
-GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
+GetCommittedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk)
 {
 	XLogRecPtr	responses[MAX_SAFEKEEPERS];
 
 	/*
-	 * Sort acknowledged LSNs
+	 * Ascending sort acknowledged LSNs.
 	 */
-	for (int i = 0; i < wp->n_safekeepers; i++)
+	Assert(mset->len <= MAX_SAFEKEEPERS);
+	for (uint32 i = 0; i < mset->len; i++)
 	{
+		Safekeeper *sk = msk[i];
+
 		/*
 		 * Like in Raft, we aren't allowed to commit entries from previous
-		 * terms, so ignore reported LSN until it gets to epochStartLsn.
+		 * terms, so ignore reported LSN until it gets to propTermStartLsn.
+		 *
+		 * Note: we ignore sk state, which is ok: before first ack flushLsn is
+		 * 0, and later we just preserve value across reconnections. It would
+		 * be ok to check for SS_ACTIVE as well.
 		 */
-		responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
+		if (sk != NULL && sk->appendResponse.flushLsn >= wp->propTermStartLsn)
+		{
+			responses[i] = sk->appendResponse.flushLsn;
+		}
+		else
+		{
+			responses[i] = 0;
+		}
 	}
-	qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
+	qsort(responses, mset->len, sizeof(XLogRecPtr), CompareLsn);
 
 	/*
-	 * Get the smallest LSN committed by quorum
+	 * And get value committed by the quorum. A way to view this: to get the
+	 * highest value committed on the quorum, in the ordered array we skip n -
+	 * n_quorum elements to get to the first (lowest) value present on all sks
+	 * of the highest quorum.
 	 */
-	return responses[wp->n_safekeepers - wp->quorum];
+	return responses[mset->len - MsetQuorum(mset)];
+}
+
+/*
+ * Calculate WAL position acknowledged by quorum, i.e. which may be regarded
+ * committed.
+ *
+ * Zero may be returned when there is no quorum of nodes recovered to term start
+ * lsn which sent feedback yet.
+ */
+static XLogRecPtr
+GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
+{
+	XLogRecPtr	committed;
+
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
+	{
+		XLogRecPtr	responses[MAX_SAFEKEEPERS];
+
+		/*
+		 * Sort acknowledged LSNs
+		 */
+		for (int i = 0; i < wp->n_safekeepers; i++)
+		{
+			/*
+			 * Like in Raft, we aren't allowed to commit entries from previous
+			 * terms, so ignore reported LSN until it gets to
+			 * propTermStartLsn.
+			 *
+			 * Note: we ignore sk state, which is ok: before first ack
+			 * flushLsn is 0, and later we just preserve value across
+			 * reconnections. It would be ok to check for SS_ACTIVE as well.
+			 */
+			responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
+		}
+		qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
+
+		/*
+		 * Get the smallest LSN committed by quorum
+		 */
+		return responses[wp->n_safekeepers - wp->quorum];
+	}
+
+	committed = GetCommittedMset(wp, &wp->mconf.members, wp->members_safekeepers);
+	if (wp->mconf.new_members.len > 0)
+	{
+		XLogRecPtr	new_mset_committed = GetCommittedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers);
+
+		committed = Min(committed, new_mset_committed);
+	}
+	return committed;
 }
 
 /*
@@ -1662,7 +2021,7 @@ UpdateDonorShmem(WalProposer *wp)
 	int			i;
 	XLogRecPtr	donor_lsn = InvalidXLogRecPtr;
 
-	if (wp->n_votes < wp->quorum)
+	if (wp->state < WPS_ELECTED)
 	{
 		wp_log(WARNING, "UpdateDonorShmem called before elections are won");
 		return;
@@ -1673,9 +2032,9 @@ UpdateDonorShmem(WalProposer *wp)
 	 * about its position immediately after election before any feedbacks are
 	 * sent.
 	 */
-	if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED)
+	if (wp->donor->state >= SS_WAIT_ELECTED)
 	{
-		donor = &wp->safekeeper[wp->donor];
+		donor = wp->donor;
 		donor_lsn = wp->propTermStartLsn;
 	}
 
@@ -1746,13 +2105,13 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 	}
 
 	/*
-	 * Generally sync is done when majority switched the epoch so we committed
-	 * epochStartLsn and made the majority aware of it, ensuring they are
-	 * ready to give all WAL to pageserver. It would mean whichever majority
-	 * is alive, there will be at least one safekeeper who is able to stream
-	 * WAL to pageserver to make basebackup possible. However, since at the
-	 * moment we don't have any good mechanism of defining the healthy and
-	 * most advanced safekeeper who should push the wal into pageserver and
+	 * Generally sync is done when majority reached propTermStartLsn so we
+	 * committed it and made the majority aware of it, ensuring they are ready
+	 * to give all WAL to pageserver. It would mean whichever majority is
+	 * alive, there will be at least one safekeeper who is able to stream WAL
+	 * to pageserver to make basebackup possible. However, since at the moment
+	 * we don't have any good mechanism of defining the healthy and most
+	 * advanced safekeeper who should push the wal into pageserver and
 	 * basically the random one gets connected, to prevent hanging basebackup
 	 * (due to pageserver connecting to not-synced-safekeeper) we currently
 	 * wait for all seemingly alive safekeepers to get synced.
@@ -1774,7 +2133,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 				n_synced++;
 		}
 
-		if (n_synced >= wp->quorum)
+		if (newCommitLsn >= wp->propTermStartLsn)
 		{
 			/* A quorum of safekeepers has been synced! */
 
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index d116bce806..648b0015ad 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -145,6 +145,7 @@ typedef uint64 NNodeId;
  * This and following structs pair ones in membership.rs.
  */
 typedef uint32 Generation;
+#define INVALID_GENERATION 0
 
 typedef struct SafekeeperId
 {
@@ -771,7 +772,17 @@ typedef struct WalProposer
 	/* Current walproposer membership configuration */
 	MembershipConfiguration mconf;
 
-	/* (n_safekeepers / 2) + 1 */
+	/*
+	 * Parallels mconf.members with pointers to the member's slot in
+	 * safekeepers array of connections, or NULL if such member is not
+	 * connected. Helps to avoid looking slot per id through all
+	 * .safekeepers[] when doing quorum checks.
+	 */
+	Safekeeper *members_safekeepers[MAX_SAFEKEEPERS];
+	/* As above, but for new_members. */
+	Safekeeper *new_members_safekeepers[MAX_SAFEKEEPERS];
+
+	/* (n_safekeepers / 2) + 1. Used for static pre-generations quorum checks. */
 	int			quorum;
 
 	/*
@@ -829,7 +840,7 @@ typedef struct WalProposer
 	term_t		donorLastLogTerm;
 
 	/* Most advanced acceptor */
-	int			donor;
+	Safekeeper *donor;
 
 	/* timeline globally starts at this LSN */
 	XLogRecPtr	timelineStartLsn;
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d000dcb69f..ba8de1c01c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -79,7 +79,12 @@ from fixtures.remote_storage import (
     default_remote_storage,
     remote_storage_to_toml_dict,
 )
-from fixtures.safekeeper.http import SafekeeperHttpClient
+from fixtures.safekeeper.http import (
+    MembershipConfiguration,
+    SafekeeperHttpClient,
+    SafekeeperId,
+    TimelineCreateRequest,
+)
 from fixtures.safekeeper.utils import wait_walreceivers_absent
 from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
@@ -4839,6 +4844,50 @@ class Safekeeper(LogUtils):
 
         wait_until(paused)
 
+    @staticmethod
+    def sks_to_safekeeper_ids(sks: list[Safekeeper]) -> list[SafekeeperId]:
+        return [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in sks]
+
+    @staticmethod
+    def mconf_sks(env: NeonEnv, mconf: MembershipConfiguration) -> list[Safekeeper]:
+        """
+        List of Safekeepers which are members in `mconf`.
+        """
+        members_ids = [m.id for m in mconf.members]
+        new_members_ids = [m.id for m in mconf.new_members] if mconf.new_members is not None else []
+        return [sk for sk in env.safekeepers if sk.id in members_ids or sk.id in new_members_ids]
+
+    @staticmethod
+    def create_timeline(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        ps: NeonPageserver,
+        mconf: MembershipConfiguration,
+        members_sks: list[Safekeeper],
+    ):
+        """
+        Manually create timeline on safekeepers with given (presumably inital)
+        mconf: figure out LSN from pageserver, bake request and execute it on
+        given safekeepers.
+
+        Normally done by storcon, but some tests want to do it manually so far.
+        """
+        ps_http_cli = ps.http_client()
+        # figure out initial LSN.
+        ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
+        init_lsn = ps_timeline_detail["last_record_lsn"]
+        log.info(f"initial LSN: {init_lsn}")
+        # sk timeline creation request expects minor version
+        pg_version = ps_timeline_detail["pg_version"] * 10000
+        # create inital mconf
+        create_r = TimelineCreateRequest(
+            tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
+        )
+        log.info(f"sending timeline create: {create_r.to_json()}")
+
+        for sk in members_sks:
+            sk.http_client().timeline_create(create_r)
+
 
 class NeonBroker(LogUtils):
     """An object managing storage_broker instance"""
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index e409151b76..839e985419 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -25,7 +25,7 @@ class Walreceiver:
 
 @dataclass
 class SafekeeperTimelineStatus:
-    mconf: Configuration | None
+    mconf: MembershipConfiguration | None
     term: int
     last_log_term: int
     pg_version: int  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
@@ -78,17 +78,17 @@ class SafekeeperId:
 
 
 @dataclass
-class Configuration:
+class MembershipConfiguration:
     generation: int
     members: list[SafekeeperId]
     new_members: list[SafekeeperId] | None
 
     @classmethod
-    def from_json(cls, d: dict[str, Any]) -> Configuration:
+    def from_json(cls, d: dict[str, Any]) -> MembershipConfiguration:
         generation = d["generation"]
         members = d["members"]
         new_members = d.get("new_members")
-        return Configuration(generation, members, new_members)
+        return MembershipConfiguration(generation, members, new_members)
 
     def to_json(self) -> str:
         return json.dumps(self, cls=EnhancedJSONEncoder)
@@ -98,7 +98,7 @@ class Configuration:
 class TimelineCreateRequest:
     tenant_id: TenantId
     timeline_id: TimelineId
-    mconf: Configuration
+    mconf: MembershipConfiguration
     # not exactly PgVersion, for example 150002 for 15.2
     pg_version: int
     start_lsn: Lsn
@@ -110,13 +110,13 @@ class TimelineCreateRequest:
 
 @dataclass
 class TimelineMembershipSwitchResponse:
-    previous_conf: Configuration
-    current_conf: Configuration
+    previous_conf: MembershipConfiguration
+    current_conf: MembershipConfiguration
 
     @classmethod
     def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
-        previous_conf = Configuration.from_json(d["previous_conf"])
-        current_conf = Configuration.from_json(d["current_conf"])
+        previous_conf = MembershipConfiguration.from_json(d["previous_conf"])
+        current_conf = MembershipConfiguration.from_json(d["current_conf"])
         return TimelineMembershipSwitchResponse(previous_conf, current_conf)
 
 
@@ -194,7 +194,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         resj = res.json()
         walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
         # It is always normally not None, it is allowed only to make forward compat tests happy.
-        mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None
+        mconf = MembershipConfiguration.from_json(resj["mconf"]) if "mconf" in resj else None
         return SafekeeperTimelineStatus(
             mconf=mconf,
             term=resj["acceptor_state"]["term"],
@@ -223,7 +223,9 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         return self.timeline_status(tenant_id, timeline_id).commit_lsn
 
     # Get timeline membership configuration.
-    def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration:
+    def get_membership(
+        self, tenant_id: TenantId, timeline_id: TimelineId
+    ) -> MembershipConfiguration:
         # make mypy happy
         return self.timeline_status(tenant_id, timeline_id).mconf  # type: ignore
 
@@ -275,7 +277,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         return res_json
 
     def timeline_exclude(
-        self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
+        self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
     ) -> dict[str, Any]:
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude",
@@ -287,7 +289,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         return res_json
 
     def membership_switch(
-        self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
+        self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
     ) -> TimelineMembershipSwitchResponse:
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership",
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index e3d39f9315..a9a6699e5c 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -45,7 +45,7 @@ from fixtures.remote_storage import (
     s3_storage,
 )
 from fixtures.safekeeper.http import (
-    Configuration,
+    MembershipConfiguration,
     SafekeeperHttpClient,
     SafekeeperId,
     TimelineCreateRequest,
@@ -589,7 +589,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
     for sk in env.safekeepers:
         sk.start()
         cli = sk.http_client()
-        mconf = Configuration(generation=0, members=[], new_members=None)
+        mconf = MembershipConfiguration(generation=0, members=[], new_members=None)
         # set start_lsn to the beginning of the first segment to allow reading
         # WAL from there (could you intidb LSN as well).
         r = TimelineCreateRequest(
@@ -1948,7 +1948,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
     sk_id_2 = SafekeeperId(11, "localhost", 5434)  # just a mock
 
     # Request to switch before timeline creation should fail.
-    init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None)
+    init_conf = MembershipConfiguration(generation=1, members=[sk_id_1], new_members=None)
     with pytest.raises(requests.exceptions.HTTPError):
         http_cli.membership_switch(tenant_id, timeline_id, init_conf)
 
@@ -1960,7 +1960,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
     http_cli.timeline_create(create_r)
 
     # Switch into some conf.
-    joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2])
+    joint_conf = MembershipConfiguration(generation=4, members=[sk_id_1], new_members=[sk_id_2])
     resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf)
     log.info(f"joint switch resp: {resp}")
     assert resp.previous_conf.generation == 1
@@ -1973,24 +1973,26 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
     assert after_restart.generation == 4
 
     # Switch into non joint conf of which sk is not a member, must fail.
-    non_joint_not_member = Configuration(generation=5, members=[sk_id_2], new_members=None)
+    non_joint_not_member = MembershipConfiguration(
+        generation=5, members=[sk_id_2], new_members=None
+    )
     with pytest.raises(requests.exceptions.HTTPError):
         resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint_not_member)
 
     # Switch into good non joint conf.
-    non_joint = Configuration(generation=6, members=[sk_id_1], new_members=None)
+    non_joint = MembershipConfiguration(generation=6, members=[sk_id_1], new_members=None)
     resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint)
     log.info(f"non joint switch resp: {resp}")
     assert resp.previous_conf.generation == 4
     assert resp.current_conf.generation == 6
 
     # Switch request to lower conf should be rejected.
-    lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None)
+    lower_conf = MembershipConfiguration(generation=3, members=[sk_id_1], new_members=None)
     with pytest.raises(requests.exceptions.HTTPError):
         http_cli.membership_switch(tenant_id, timeline_id, lower_conf)
 
     # Now, exclude sk from the membership, timeline should be deleted.
-    excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None)
+    excluded_conf = MembershipConfiguration(generation=7, members=[sk_id_2], new_members=None)
     http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf)
     with pytest.raises(requests.exceptions.HTTPError):
         http_cli.timeline_status(tenant_id, timeline_id)
@@ -2010,11 +2012,6 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
-    ps = env.pageservers[0]
-    ps_http_cli = ps.http_client()
-
-    http_clis = [sk.http_client() for sk in env.safekeepers]
-
     config_lines = [
         "neon.safekeeper_proto_version = 3",
     ]
@@ -2023,22 +2020,11 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
     # expected to fail because timeline is not created on safekeepers
     with pytest.raises(Exception, match=r".*timed out.*"):
         ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s")
-    # figure out initial LSN.
-    ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
-    init_lsn = ps_timeline_detail["last_record_lsn"]
-    log.info(f"initial LSN: {init_lsn}")
-    # sk timeline creation request expects minor version
-    pg_version = ps_timeline_detail["pg_version"] * 10000
     # create inital mconf
-    sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers]
-    mconf = Configuration(generation=1, members=sk_ids, new_members=None)
-    create_r = TimelineCreateRequest(
-        tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
+    mconf = MembershipConfiguration(
+        generation=1, members=Safekeeper.sks_to_safekeeper_ids(env.safekeepers), new_members=None
     )
-    log.info(f"sending timeline create: {create_r.to_json()}")
-
-    for sk_http_cli in http_clis:
-        sk_http_cli.timeline_create(create_r)
+    Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, env.safekeepers)
     # Once timeline created endpoint should start.
     ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
     ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index b7c7478e78..c5dd34f64f 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -18,6 +18,7 @@ from fixtures.neon_fixtures import (
     Safekeeper,
 )
 from fixtures.remote_storage import RemoteStorageKind
+from fixtures.safekeeper.http import MembershipConfiguration
 from fixtures.utils import skip_in_debug_build
 
 if TYPE_CHECKING:
@@ -452,20 +453,24 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder):
     asyncio.run(run_concurrent_computes(env))
 
 
+async def assert_query_hangs(endpoint: Endpoint, query: str):
+    """
+    Start on endpoint query which is expected to hang and check that it does.
+    """
+    conn = await endpoint.connect_async()
+    bg_query = asyncio.create_task(conn.execute(query))
+    await asyncio.sleep(2)
+    assert not bg_query.done()
+    return bg_query
+
+
 # Stop safekeeper and check that query cannot be executed while safekeeper is down.
 # Query will insert a single row into a table.
-async def check_unavailability(
-    sk: Safekeeper, conn: asyncpg.Connection, key: int, start_delay_sec: int = 2
-):
+async def check_unavailability(sk: Safekeeper, ep: Endpoint, key: int, start_delay_sec: int = 2):
     # shutdown one of two acceptors, that is, majority
     sk.stop()
 
-    bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')"))
-
-    await asyncio.sleep(start_delay_sec)
-    # ensure that the query has not been executed yet
-    assert not bg_query.done()
-
+    bg_query = await assert_query_hangs(ep, f"INSERT INTO t values ({key}, 'payload')")
     # start safekeeper and await the query
     sk.start()
     await bg_query
@@ -480,10 +485,10 @@ async def run_unavailability(env: NeonEnv, endpoint: Endpoint):
     await conn.execute("INSERT INTO t values (1, 'payload')")
 
     # stop safekeeper and check that query cannot be executed while safekeeper is down
-    await check_unavailability(env.safekeepers[0], conn, 2)
+    await check_unavailability(env.safekeepers[0], endpoint, 2)
 
     # for the world's balance, do the same with second safekeeper
-    await check_unavailability(env.safekeepers[1], conn, 3)
+    await check_unavailability(env.safekeepers[1], endpoint, 3)
 
     # check that we can execute queries after restart
     await conn.execute("INSERT INTO t values (4, 'payload')")
@@ -514,15 +519,7 @@ async def run_recovery_uncommitted(env: NeonEnv):
     # insert with only one safekeeper up to create tail of flushed but not committed WAL
     sk1.stop()
     sk2.stop()
-    conn = await ep.connect_async()
-    # query should hang, so execute in separate task
-    bg_query = asyncio.create_task(
-        conn.execute("insert into t select generate_series(1, 2000), 'payload'")
-    )
-    sleep_sec = 2
-    await asyncio.sleep(sleep_sec)
-    # it must still be not finished
-    assert not bg_query.done()
+    await assert_query_hangs(ep, "insert into t select generate_series(1, 2000), 'payload'")
     # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
     ep.stop_and_destroy()
 
@@ -559,15 +556,7 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int):
     # insert with only one sk3 up to create tail of flushed but not committed WAL on it
     sk1.stop()
     sk2.stop()
-    conn = await ep.connect_async()
-    # query should hang, so execute in separate task
-    bg_query = asyncio.create_task(
-        conn.execute("insert into t select generate_series(1, 180000), 'Papaya'")
-    )
-    sleep_sec = 2
-    await asyncio.sleep(sleep_sec)
-    # it must still be not finished
-    assert not bg_query.done()
+    await assert_query_hangs(ep, "insert into t select generate_series(1, 180000), 'Papaya'")
     # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
     ep.stop_and_destroy()
 
@@ -607,6 +596,132 @@ def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_versi
     asyncio.run(run_wal_truncation(env, safekeeper_proto_version))
 
 
+async def quorum_sanity_single(
+    env: NeonEnv,
+    compute_sks_ids: list[int],
+    members_sks_ids: list[int],
+    new_members_sks_ids: list[int] | None,
+    sks_to_stop_ids: list[int],
+    should_work_when_stopped: bool,
+):
+    """
+    *_ids params contain safekeeper node ids; it is assumed they are issued
+    from 1 and sequentially assigned to env.safekeepers.
+    """
+    members_sks = [env.safekeepers[i - 1] for i in members_sks_ids]
+    new_members_sks = (
+        [env.safekeepers[i - 1] for i in new_members_sks_ids] if new_members_sks_ids else None
+    )
+    sks_to_stop = [env.safekeepers[i - 1] for i in sks_to_stop_ids]
+
+    mconf = MembershipConfiguration(
+        generation=1,
+        members=Safekeeper.sks_to_safekeeper_ids(members_sks),
+        new_members=Safekeeper.sks_to_safekeeper_ids(new_members_sks) if new_members_sks else None,
+    )
+    members_sks = Safekeeper.mconf_sks(env, mconf)
+
+    tenant_id = env.initial_tenant
+    compute_sks_ids_str = "-".join([str(sk_id) for sk_id in compute_sks_ids])
+    members_sks_ids_str = "-".join([str(sk.id) for sk in mconf.members])
+    new_members_sks_ids_str = "-".join(
+        [str(sk.id) for sk in mconf.new_members] if mconf.new_members is not None else []
+    )
+    sks_to_stop_ids_str = "-".join([str(sk.id) for sk in sks_to_stop])
+    log.info(
+        f"running quorum_sanity_single with compute_sks={compute_sks_ids_str}, members_sks={members_sks_ids_str}, new_members_sks={new_members_sks_ids_str}, sks_to_stop={sks_to_stop_ids_str}, should_work_when_stopped={should_work_when_stopped}"
+    )
+    branch_name = f"test_quorum_single_c{compute_sks_ids_str}_m{members_sks_ids_str}_{new_members_sks_ids_str}_s{sks_to_stop_ids_str}"
+    timeline_id = env.create_branch(branch_name)
+
+    # create timeline on `members_sks`
+    Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, members_sks)
+
+    config_lines = [
+        "neon.safekeeper_proto_version = 3",
+    ]
+    ep = env.endpoints.create(branch_name, config_lines=config_lines)
+    ep.start(safekeeper_generation=1, safekeepers=compute_sks_ids)
+    ep.safe_psql("create table t(key int, value text)")
+
+    # stop specified sks and check whether writes work
+    for sk in sks_to_stop:
+        sk.stop()
+    if should_work_when_stopped:
+        log.info("checking that writes still work")
+        ep.safe_psql("insert into t select generate_series(1, 100), 'Papaya'")
+        # restarting ep should also be fine
+        ep.stop()
+        ep.start()
+        ep.safe_psql("insert into t select generate_series(1, 100), 'plum'")
+        bg_query = None
+    else:
+        log.info("checking that writes hang")
+        bg_query = await assert_query_hangs(
+            ep, "insert into t select generate_series(1, 100), 'Papaya'"
+        )
+    # start again; now they should work
+    for sk in sks_to_stop:
+        sk.start()
+    if bg_query:
+        log.info("awaiting query")
+        await bg_query
+
+
+# It's a bit tempting to iterate over all possible combinations, but let's stick
+# with this for now.
+async def run_quorum_sanity(env: NeonEnv):
+    # 3 members, all up, should work
+    await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [], True)
+    # 3 members, 2/3 up, should work
+    await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [3], True)
+    # 3 members, 1/3 up, should not work
+    await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [2, 3], False)
+
+    # 3 members, all up, should work; wp redundantly talks to 4th.
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], None, [], True)
+    # 3 members, all up, should work with wp talking to 2 of these 3 + plus one redundant
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [], True)
+    # 3 members, 2/3 up, could work but wp talks to different 3s, so it shouldn't
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [3], False)
+
+    # joint conf of 1-2-3 and 4, all up, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [], True)
+    # joint conf of 1-2-3 and 4, 4 down, shouldn't work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [4], False)
+
+    # joint conf of 1-2-3 and 2-3-4, all up, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [], True)
+    # joint conf of 1-2-3 and 2-3-4, 1 and 4 down, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 4], True)
+    # joint conf of 1-2-3 and 2-3-4, 2 down, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2], True)
+    # joint conf of 1-2-3 and 2-3-4, 3 down, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [3], True)
+    # joint conf of 1-2-3 and 2-3-4, 1 and 2 down, shouldn't work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 2], False)
+    # joint conf of 1-2-3 and 2-3-4, 2 and 4 down, shouldn't work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2, 4], False)
+
+    # joint conf of 1-2-3 and 2-3-4 with wp talking to 2-3-4 only.
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [], True)
+    # with 1 down should still be ok
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [1], True)
+    # but with 2 down not ok
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [2], False)
+
+
+# Test various combinations of membership configurations / neon.safekeepers
+# (list of safekeepers endpoint connects to) values / up & down safekeepers and
+# check that endpont can start and write data when we have quorum and can't when
+# we don't.
+def test_quorum_sanity(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 4
+    env = neon_env_builder.init_start()
+
+    asyncio.run(run_quorum_sanity(env))
+
+
 async def run_segment_init_failure(env: NeonEnv):
     env.create_branch("test_segment_init_failure")
     ep = env.endpoints.create_start("test_segment_init_failure")

From 0122d97f95b7f1ca236947e55961a307085853c2 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Thu, 10 Apr 2025 12:07:16 +0200
Subject: [PATCH 100/140] test_runner: only use last gen in
 `test_location_conf_churn` (#11511)

## Problem

`test_location_conf_churn` performs random location updates on
Pageservers. While doing this, it could instruct the compute to connect
to a stale generation and execute queries. This is invalid, and will
fail if a newer generation has removed layer files used by the stale
generation.

Resolves #11348.

## Summary of changes

Only connect to the latest generation when executing queries.
---
 test_runner/regress/test_pageserver_secondary.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index d03d05d33d..d48e731394 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -242,7 +242,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
             pageserver.tenant_location_configure(tenant_id, location_conf)
             last_state[pageserver.id] = (mode, generation)
 
-            if mode.startswith("Attached"):
+            # It's only valid to connect to the last generation. Newer generations may yank layer
+            # files used in older generations.
+            last_generation = max(
+                [s[1] for s in last_state.values() if s[1] is not None], default=None
+            )
+
+            if mode.startswith("Attached") and generation == last_generation:
                 # This is a basic test: we are validating that he endpoint works properly _between_
                 # configuration changes.  A stronger test would be to validate that clients see
                 # no errors while we are making the changes.

From 50631512710d8c5fd9c4c681d7882e16f4df93f3 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 10 Apr 2025 14:04:18 +0100
Subject: [PATCH 101/140] compute: Add more neon ids to compute (#11366)

Pass more neon ids to compute_ctl.
Expose them to postgres as neon extension GUCs:
neon.project_id, neon.branch_id, neon.endpoint_id.


This is the compute side PR, not yet supported by cplane.
---
 compute_tools/src/compute.rs  |  5 ++++-
 compute_tools/src/config.rs   |  9 +++++++++
 control_plane/src/endpoint.rs |  3 +++
 libs/compute_api/src/spec.rs  |  6 ++++++
 pgxn/neon/libpagestore.c      | 28 ++++++++++++++++++++++++++++
 5 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index ad8925e7ab..457ace85d1 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -523,11 +523,14 @@ impl ComputeNode {
 
         let pspec = compute_state.pspec.as_ref().expect("spec must be set");
         info!(
-            "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
+            "starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}",
             pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
             pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
             pspec.tenant_id,
             pspec.timeline_id,
+            pspec.spec.project_id.as_deref().unwrap_or("None"),
+            pspec.spec.branch_id.as_deref().unwrap_or("None"),
+            pspec.spec.endpoint_id.as_deref().unwrap_or("None"),
             pspec.spec.features,
             pspec.spec.remote_extensions,
         );
diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index 92939f816c..0eb8912b45 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -89,6 +89,15 @@ pub fn write_postgres_conf(
             escape_conf_value(&s.to_string())
         )?;
     }
+    if let Some(s) = &spec.project_id {
+        writeln!(file, "neon.project_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.branch_id {
+        writeln!(file, "neon.branch_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.endpoint_id {
+        writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?;
+    }
 
     // tls
     if let Some(tls_config) = tls_config {
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 3137bde161..663c024953 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -658,6 +658,9 @@ impl Endpoint {
             delta_operations: None,
             tenant_id: Some(self.tenant_id),
             timeline_id: Some(self.timeline_id),
+            project_id: None,
+            branch_id: None,
+            endpoint_id: Some(self.endpoint_id.clone()),
             mode: self.mode,
             pageserver_connstring: Some(pageserver_connstring),
             safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 994a665a88..974159ac72 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -104,6 +104,12 @@ pub struct ComputeSpec {
     pub timeline_id: Option<TimelineId>,
     pub pageserver_connstring: Option<String>,
 
+    // More neon ids that we expose to the compute_ctl
+    // and to postgres as neon extension GUCs.
+    pub project_id: Option<String>,
+    pub branch_id: Option<String>,
+    pub endpoint_id: Option<String>,
+
     /// Safekeeper membership config generation. It is put in
     /// neon.safekeepers GUC and serves two purposes:
     /// 1) Non zero value forces walproposer to use membership configurations.
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 9ea708f29a..dfabb6919e 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -65,6 +65,9 @@ static const struct config_enum_entry neon_compute_modes[] = {
 /* GUCs */
 char	   *neon_timeline;
 char	   *neon_tenant;
+char	   *neon_project_id;
+char	   *neon_branch_id;
+char	   *neon_endpoint_id;
 int32		max_cluster_size;
 char	   *page_server_connstring;
 char	   *neon_auth_token;
@@ -1352,6 +1355,31 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);
 
+	DefineCustomStringVariable("neon.project_id",
+							   "Neon project_id the server is running on",
+							   NULL,
+							   &neon_project_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+	DefineCustomStringVariable("neon.branch_id",
+							   "Neon branch_id the server is running on",
+							   NULL,
+							   &neon_branch_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+	DefineCustomStringVariable("neon.endpoint_id",
+							   "Neon endpoint_id the server is running on",
+							   NULL,
+							   &neon_endpoint_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+
 	DefineCustomIntVariable("neon.stripe_size",
 							"sharding stripe size",
 							NULL,

From 2e35f23085263cfb4b5dc8158086621033637163 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 10 Apr 2025 16:24:30 +0200
Subject: [PATCH 102/140] tests: remove ignored `fair` field (#11521)

Pageserver has been ignoring field
`tenant_config.timeline_get_throttle.fair`
for many monhts, since we removed it from the config struct in
neondatabase/neon#8539.

Refs
- epic https://github.com/neondatabase/cloud/issues/27320
---
 .../test_pageserver_getpage_throttle.py       | 62 -------------------
 1 file changed, 62 deletions(-)

diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index 3d7204d883..bc186477e1 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import copy
 import json
 import uuid
 from typing import TYPE_CHECKING
@@ -44,7 +43,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
                 "refill_interval": "100ms",
                 "refill_amount": int(rate_limit_rps / 10),
                 "max": int(rate_limit_rps / 10),
-                "fair": True,
             },
         },
     )
@@ -139,63 +137,3 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
     assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, (
         "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
     )
-
-
-throttle_config_with_field_fair_set = {
-    "task_kinds": ["PageRequestHandler"],
-    "fair": True,
-    "initial": 27,
-    "refill_interval": "43s",
-    "refill_amount": 23,
-    "max": 42,
-}
-
-
-def assert_throttle_config_with_field_fair_set(conf):
-    """
-    Field `fair` is ignored, so, responses don't contain it
-    """
-    without_fair = copy.deepcopy(throttle_config_with_field_fair_set)
-    without_fair.pop("fair")
-
-    assert conf == without_fair
-
-
-def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder):
-    """
-    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
-    """
-    env = neon_env_builder.init_start()
-    vps_http = env.storage_controller.pageserver_api()
-    # with_fair config should still be settable
-    vps_http.set_tenant_config(
-        env.initial_tenant,
-        {"timeline_get_throttle": throttle_config_with_field_fair_set},
-    )
-    conf = vps_http.tenant_config(env.initial_tenant)
-    assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
-    assert_throttle_config_with_field_fair_set(
-        conf.tenant_specific_overrides["timeline_get_throttle"]
-    )
-
-
-def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
-    neon_env_builder: NeonEnvBuilder,
-):
-    """
-    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
-    """
-
-    def set_tenant_config(ps_cfg):
-        tenant_config = ps_cfg.setdefault("tenant_config", {})
-        tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set
-
-    neon_env_builder.pageserver_config_override = set_tenant_config
-    env = neon_env_builder.init_start()
-    ps_http = env.pageserver.http_client()
-    conf = ps_http.tenant_config(env.initial_tenant)
-    assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
-
-    env.pageserver.allowed_errors.append(
-        r'.*ignoring unknown configuration item path="tenant_config\.timeline_get_throttle\.fair"*'
-    )

From f06d721a989903d6e867629c0b2216a310419af2 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 10 Apr 2025 10:53:37 -0400
Subject: [PATCH 103/140] test(pageserver): ensure gc-compaction does not fire
 critical errors (#11513)

## Problem

Part of https://github.com/neondatabase/neon/issues/10395

## Summary of changes

Add a test case to ensure gc-compaction doesn't fire any critical errors
if the key history is invalid due to partial GC.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant.rs | 93 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index d3623fc3b9..ad4a0d804d 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -11571,6 +11571,99 @@ mod tests {
         Ok(())
     }
 
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![
+            (
+                get_key(1),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x24),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x24")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x28),
+                // This record will fail to redo
+                Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")),
+            ),
+        ];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![], // in-memory layers
+                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
+                    Lsn(0x20)..Lsn(0x30),
+                    delta1,
+                )], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+        {
+            tline
+                .applied_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x30),
+                    space: Lsn(0x30),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        let cancel = CancellationToken::new();
+
+        // Compaction will fail, but should not fire any critical error.
+        // Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction
+        // process. It will always try to redo the logs it reads and if it doesn't work, fail the entire
+        // compaction job. Tracked in <https://github.com/neondatabase/neon/issues/10395>.
+        let res = tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_key_range: None,
+                    compact_lsn_range: None,
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await;
+        assert!(res.is_err());
+
+        Ok(())
+    }
+
     #[cfg(feature = "testing")]
     #[tokio::test]
     async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {

From 5487a20b7224f81f50be62558a1037a254c4caf7 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 10 Apr 2025 16:28:28 +0100
Subject: [PATCH 104/140] compute: Set log_parameter=off for audit logging.
 (#11500)

Log -> Base,
pgaudit.log = 'ddl', pgaudit.log_parameter='off'

Hipaa -> Extended.
pgaudit.log = 'all, -misc', pgaudit.log_parameter='off'

add new level Full:
pgaudit.log='all', pgaudit.log_parameter='on'

Keep old parameter names for compatibility,
until cplane side changes are implemented and released.

closes https://github.com/neondatabase/cloud/issues/27202
---
 compute_tools/src/compute.rs    | 28 ++++++++++++++++------------
 compute_tools/src/config.rs     | 18 +++++++++++-------
 compute_tools/src/spec_apply.rs |  4 ++--
 libs/compute_api/src/spec.rs    | 23 ++++++++++++++---------
 4 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 457ace85d1..06d5bbb9c5 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -634,19 +634,23 @@ impl ComputeNode {
             });
         }
 
-        // Configure and start rsyslog for HIPAA if necessary
-        if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
-            let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
-            if remote_endpoint.is_empty() {
-                anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+        // Configure and start rsyslog for compliance audit logging
+        match pspec.spec.audit_log_level {
+            ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
+                let remote_endpoint =
+                    std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
+                if remote_endpoint.is_empty() {
+                    anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+                }
+
+                let log_directory_path = Path::new(&self.params.pgdata).join("log");
+                let log_directory_path = log_directory_path.to_string_lossy().to_string();
+                configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
+
+                // Launch a background task to clean up the audit logs
+                launch_pgaudit_gc(log_directory_path);
             }
-
-            let log_directory_path = Path::new(&self.params.pgdata).join("log");
-            let log_directory_path = log_directory_path.to_string_lossy().to_string();
-            configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
-
-            // Launch a background task to clean up the audit logs
-            launch_pgaudit_gc(log_directory_path);
+            _ => {}
         }
 
         // Configure and start rsyslog for Postgres logs export
diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index 0eb8912b45..71c6123c3b 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -178,7 +178,7 @@ pub fn write_postgres_conf(
     // and don't allow the user or the control plane admin to change them.
     match spec.audit_log_level {
         ComputeAudit::Disabled => {}
-        ComputeAudit::Log => {
+        ComputeAudit::Log | ComputeAudit::Base => {
             writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
             writeln!(file, "pgaudit.log='ddl,role'")?;
             // Disable logging of catalog queries to reduce the noise
@@ -202,16 +202,20 @@ pub fn write_postgres_conf(
             }
             writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
         }
-        ComputeAudit::Hipaa => {
+        ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
             writeln!(
                 file,
                 "# Managed by compute_ctl compliance audit settings: begin"
             )?;
-            // This log level is very verbose
-            // but this is necessary for HIPAA compliance.
-            // Exclude 'misc' category, because it doesn't contain anythig relevant.
-            writeln!(file, "pgaudit.log='all, -misc'")?;
-            writeln!(file, "pgaudit.log_parameter=on")?;
+            // Enable logging of parameters.
+            // This is very verbose and may contain sensitive data.
+            if spec.audit_log_level == ComputeAudit::Full {
+                writeln!(file, "pgaudit.log_parameter=on")?;
+                writeln!(file, "pgaudit.log='all'")?;
+            } else {
+                writeln!(file, "pgaudit.log_parameter=off")?;
+                writeln!(file, "pgaudit.log='all, -misc'")?;
+            }
             // Disable logging of catalog queries
             // The catalog doesn't contain sensitive data, so we don't need to audit it.
             writeln!(file, "pgaudit.log_catalog=off")?;
diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs
index e7d67f6ac5..0d1389dbad 100644
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -278,12 +278,12 @@ impl ComputeNode {
             // so that all config operations are audit logged.
             match spec.audit_log_level
             {
-                ComputeAudit::Hipaa => {
+                ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
                     phases.push(CreatePgauditExtension);
                     phases.push(CreatePgauditlogtofileExtension);
                     phases.push(DisablePostgresDBPgAudit);
                 }
-                ComputeAudit::Log => {
+                ComputeAudit::Log | ComputeAudit::Base => {
                     phases.push(CreatePgauditExtension);
                     phases.push(DisablePostgresDBPgAudit);
                 }
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 974159ac72..82950bcbaa 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -165,13 +165,7 @@ pub struct ComputeSpec {
     #[serde(default)] // Default false
     pub drop_subscriptions_before_start: bool,
 
-    /// Log level for audit logging:
-    ///
-    /// Disabled - no audit logging. This is the default.
-    /// log - log masked statements to the postgres log using pgaudit extension
-    /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
-    ///
-    /// Extensions should be present in shared_preload_libraries
+    /// Log level for compute audit logging
     #[serde(default)]
     pub audit_log_level: ComputeAudit,
 
@@ -295,14 +289,25 @@ impl ComputeMode {
 }
 
 /// Log level for audit logging
-/// Disabled, log, hipaa
-/// Default is Disabled
 #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeAudit {
     #[default]
     Disabled,
+    // Deprecated, use Base instead
     Log,
+    // (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
+    // logged to the standard postgresql log stream
+    Base,
+    // Deprecated, use Full or Extended instead
     Hipaa,
+    // (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
+    // logged to separate files collected by rsyslog
+    // into dedicated log storage with strict access
+    Extended,
+    // (pgaudit.log='all', pgaudit.log_parameter='on'),
+    // logged to separate files collected by rsyslog
+    // into dedicated log storage with strict access.
+    Full,
 }
 
 #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]

From 52dee408dce3676dceb4d01c475d5e693bec2c6f Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 10 Apr 2025 17:55:37 +0100
Subject: [PATCH 105/140] storage controller: improve safety of shard splits
 coinciding with controller restarts (#11412)

## Problem

The graceful leadership transfer process involves calling step_down on
the old controller, but this was not waiting for shard splits to
complete, and the new controller could therefore end up trying to abort
a shard split while it was still going on.

We mitigated this already in #11256 by avoiding the case where shard
split completion would update the database incorrectly, but this was a
fragile fix because it assumes that is the only problematic part of the
split running concurrently.

Precursors:
- #11290
- #11256

Closes: #11254

## Summary of changes

- Hold the reconciler gate from shard splits, so that step_down will
wait for them. Splits should always be fairly prompt, so it is okay to
wait here.
- Defense in depth: if step_down times out (hardcoded 10 second limit),
then fully terminate the controller process rather than letting it
continue running, potentially doing split-brainy things. This makes
sense because the new controller will always declare itself leader
unilaterally if step_down fails, so leaving an old controller running is
not beneficial.
- Tests: extend
`test_storage_controller_leadership_transfer_during_split` to separately
exercise the case of a split holding up step_down, and the case where
the overall timeout on step_down is hit and the controller terminates.
---
 storage_controller/src/http.rs                |  14 ++-
 storage_controller/src/service.rs             |  51 +++++++--
 .../regress/test_storage_controller.py        | 107 ++++++++++++------
 3 files changed, 125 insertions(+), 47 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 0d1dc8f8ee..4f3613b687 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1235,8 +1235,18 @@ async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError
         ForwardOutcome::NotForwarded(req) => req,
     };
 
-    let state = get_state(&req);
-    json_response(StatusCode::OK, state.service.step_down().await)
+    // Spawn a background task: once we start stepping down, we must finish: if the client drops
+    // their request we should avoid stopping in some part-stepped-down state.
+    let handle = tokio::spawn(async move {
+        let state = get_state(&req);
+        state.service.step_down().await
+    });
+
+    let result = handle
+        .await
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+
+    json_response(StatusCode::OK, result)
 }
 
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 2ef09cd2e3..4790f80162 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -61,7 +61,7 @@ use utils::completion::Barrier;
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
-use utils::sync::gate::Gate;
+use utils::sync::gate::{Gate, GateGuard};
 use utils::{failpoint_support, pausable_failpoint};
 
 use crate::background_node_operations::{
@@ -594,6 +594,8 @@ struct TenantShardSplitAbort {
     new_stripe_size: Option<ShardStripeSize>,
     /// Until this abort op is complete, no other operations may be done on the tenant
     _tenant_lock: TracingExclusiveGuard<TenantOperations>,
+    /// The reconciler gate for the duration of the split operation, and any included abort.
+    _gate: GateGuard,
 }
 
 #[derive(thiserror::Error, Debug)]
@@ -1460,7 +1462,7 @@ impl Service {
             // Retry until shutdown: we must keep this request object alive until it is properly
             // processed, as it holds a lock guard that prevents other operations trying to do things
             // to the tenant while it is in a weird part-split state.
-            while !self.cancel.is_cancelled() {
+            while !self.reconcilers_cancel.is_cancelled() {
                 match self.abort_tenant_shard_split(&op).await {
                     Ok(_) => break,
                     Err(e) => {
@@ -1473,9 +1475,12 @@ impl Service {
                         // when we retry, so that the abort op will succeed.  If the abort op is failing
                         // for some other reason, we will keep retrying forever, or until a human notices
                         // and does something about it (either fixing a pageserver or restarting the controller).
-                        tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled())
-                            .await
-                            .ok();
+                        tokio::time::timeout(
+                            Duration::from_secs(5),
+                            self.reconcilers_cancel.cancelled(),
+                        )
+                        .await
+                        .ok();
                     }
                 }
             }
@@ -4910,7 +4915,7 @@ impl Service {
                     1,
                     10,
                     Duration::from_secs(5),
-                    &self.cancel,
+                    &self.reconcilers_cancel,
                 )
                 .await
             {
@@ -5161,6 +5166,11 @@ impl Service {
         )
         .await;
 
+        let _gate = self
+            .reconcilers_gate
+            .enter()
+            .map_err(|_| ApiError::ShuttingDown)?;
+
         let new_shard_count = ShardCount::new(split_req.new_shard_count);
         let new_stripe_size = split_req.new_stripe_size;
 
@@ -5188,6 +5198,7 @@ impl Service {
                         new_shard_count,
                         new_stripe_size,
                         _tenant_lock,
+                        _gate,
                     })
                     // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it.
                     .ok();
@@ -5527,7 +5538,10 @@ impl Service {
                 "failpoint".to_string()
             )));
 
-            failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel);
+            failpoint_support::sleep_millis_async!(
+                "shard-split-post-remote-sleep",
+                &self.reconcilers_cancel
+            );
 
             tracing::info!(
                 "Split {} into {}",
@@ -5585,7 +5599,7 @@ impl Service {
                         stripe_size,
                         preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed),
                     },
-                    &self.cancel,
+                    &self.reconcilers_cancel,
                 )
                 .await
             {
@@ -8670,9 +8684,24 @@ impl Service {
         failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");
 
         self.inner.write().unwrap().step_down();
-        // TODO: would it make sense to have a time-out for this?
-        self.stop_reconciliations(StopReconciliationsReason::SteppingDown)
-            .await;
+
+        // Wait for reconciliations to stop, or terminate this process if they
+        // fail to stop in time (this indicates a bug in shutdown)
+        tokio::select! {
+            _ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => {
+                tracing::info!("Reconciliations stopped, proceeding with step down");
+            }
+            _ = async {
+                failpoint_support::sleep_millis_async!("step-down-delay-timeout");
+                tokio::time::sleep(Duration::from_secs(10)).await
+            } => {
+                tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process");
+
+                // The caller may proceed to act as leader when it sees this request fail: reduce the chance
+                // of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state.
+                std::process::exit(1);
+            }
+        }
 
         let mut global_observed = GlobalObservedState::default();
         let locked = self.inner.read().unwrap();
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index ce73c9a738..b2c8415e9a 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2892,10 +2892,12 @@ def test_storage_controller_leadership_transfer(
         )
 
 
+@pytest.mark.parametrize("step_down_times_out", [False, True])
 def test_storage_controller_leadership_transfer_during_split(
     neon_env_builder: NeonEnvBuilder,
     storage_controller_proxy: StorageControllerProxy,
     port_distributor: PortDistributor,
+    step_down_times_out: bool,
 ):
     """
     Exercise a race between shard splitting and graceful leadership transfer.  This is
@@ -2936,6 +2938,18 @@ def test_storage_controller_leadership_transfer_during_split(
         )
     env.storage_controller.reconcile_until_idle()
 
+    # We are testing scenarios where the step down API does not complete: either because it is stuck
+    # doing a shard split, or because it totally times out on some other failpoint.
+    env.storage_controller.allowed_errors.extend(
+        [
+            ".*step_down.*request was dropped before completing.*",
+            ".*step_down.*operation timed out.*",
+            ".*Send step down request failed, will retry.*",
+            ".*Send step down request still failed after.*retries.*",
+            ".*Leader .+ did not respond to step-down request.*",
+        ]
+    )
+
     with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
         # Start a shard split
         env.storage_controller.allowed_errors.extend(
@@ -2943,6 +2957,14 @@ def test_storage_controller_leadership_transfer_during_split(
         )
         pause_failpoint = "shard-split-pre-complete"
         env.storage_controller.configure_failpoints((pause_failpoint, "pause"))
+
+        if not step_down_times_out:
+            # Prevent the timeout self-terminate code from executing: we will block step down on the
+            # shard split itself
+            env.storage_controller.configure_failpoints(
+                ("step-down-delay-timeout", "return(3600000)")
+            )
+
         split_fut = executor.submit(
             env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2
         )
@@ -2961,12 +2983,20 @@ def test_storage_controller_leadership_transfer_during_split(
             timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
         )
 
+        if step_down_times_out:
+            # Step down will time out, original controller will terminate itself
+            env.storage_controller.allowed_errors.extend([".*terminating process.*"])
+        else:
+            # Step down does not time out: original controller hits its shard split completion
+            # code path and realises that it must not purge the parent shards from the database.
+            env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"])
+
         def passed_split_abort():
             try:
                 log.info("Checking log for pattern...")
-                assert env.storage_controller.log_contains(
-                    ".*Using observed state received from leader.*"
-                )
+                # This log is indicative of entering startup_reconcile, which happens
+                # after the point we would abort shard splits
+                assert env.storage_controller.log_contains(".*Populating tenant shards.*")
             except Exception:
                 log.exception("Failed to find pattern in log")
                 raise
@@ -2975,34 +3005,42 @@ def test_storage_controller_leadership_transfer_during_split(
         wait_until(passed_split_abort, interval=0.1, status_interval=1.0)
         assert env.storage_controller.log_contains(".*Aborting shard split.*")
 
-        # Proxy is still talking to original controller here: disable its pause failpoint so
-        # that its shard split can run to completion.
-        log.info("Disabling failpoint")
-        # Bypass the proxy: the python test HTTPServer is single threaded and still blocked
-        # on handling the shard split request.
-        env.storage_controller.request(
-            "PUT",
-            f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
-            json=[{"name": "shard-split-pre-complete", "actions": "off"}],
-            headers=env.storage_controller.headers(TokenScope.ADMIN),
-        )
+        if step_down_times_out:
+            # We will let the old controller hit a timeout path where it terminates itself, rather than
+            # completing step_down and trying to complete a shard split
+            def old_controller_terminated():
+                assert env.storage_controller.log_contains(".*terminating process.*")
 
-        def previous_stepped_down():
-            assert (
-                env.storage_controller.get_leadership_status()
-                == StorageControllerLeadershipStatus.STEPPED_DOWN
+            wait_until(old_controller_terminated)
+        else:
+            # Proxy is still talking to original controller here: disable its pause failpoint so
+            # that its shard split can run to completion.
+            log.info("Disabling failpoint")
+            # Bypass the proxy: the python test HTTPServer is single threaded and still blocked
+            # on handling the shard split request.
+            env.storage_controller.request(
+                "PUT",
+                f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
+                json=[{"name": "shard-split-pre-complete", "actions": "off"}],
+                headers=env.storage_controller.headers(TokenScope.ADMIN),
             )
 
-        log.info("Awaiting step down")
-        wait_until(previous_stepped_down)
+            def previous_stepped_down():
+                assert (
+                    env.storage_controller.get_leadership_status()
+                    == StorageControllerLeadershipStatus.STEPPED_DOWN
+                )
 
-        # Let the shard split complete: this may happen _after_ the replacement has come up
-        # and tried to clean up the databases
-        log.info("Unblocking & awaiting shard split")
-        with pytest.raises(Exception, match="Unexpected child shard count"):
-            # This split fails when it tries to persist results, because it encounters
-            # changes already made by the new controller's abort-on-startup
-            split_fut.result()
+            log.info("Awaiting step down")
+            wait_until(previous_stepped_down)
+
+            # Let the shard split complete: this may happen _after_ the replacement has come up
+            # and tried to clean up the databases
+            log.info("Unblocking & awaiting shard split")
+            with pytest.raises(Exception, match="Unexpected child shard count"):
+                # This split fails when it tries to persist results, because it encounters
+                # changes already made by the new controller's abort-on-startup
+                split_fut.result()
 
         log.info("Routing to new leader")
         storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
@@ -3020,13 +3058,14 @@ def test_storage_controller_leadership_transfer_during_split(
     env.storage_controller.wait_until_ready()
     env.storage_controller.consistency_check()
 
-    # Check that the stepped down instance forwards requests
-    # to the new leader while it's still running.
-    storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
-    env.storage_controller.tenant_shard_dump()
-    env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
-    status = env.storage_controller.node_status(env.pageservers[0].id)
-    assert status["scheduling"] == "Pause"
+    if not step_down_times_out:
+        # Check that the stepped down instance forwards requests
+        # to the new leader while it's still running.
+        storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
+        env.storage_controller.tenant_shard_dump()
+        env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
+        status = env.storage_controller.node_status(env.pageservers[0].id)
+        assert status["scheduling"] == "Pause"
 
 
 def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):

From 9c37bfc90abfb7aa653d8f3892d227ed16018492 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 10 Apr 2025 18:03:22 +0100
Subject: [PATCH 106/140] pageserver/tests: make image_layer_rewrite write less
 data (#11525)

## Problem

This test is slow to execute, particularly if you're on a slow
environment like vscode in a browser. Might have got much slower when we
switched to direct IO?

## Summary of changes

- Reduce the scale of the test by 10x, since there was nothing special
about the original size.
---
 .../src/tenant/storage_layer/image_layer.rs    | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 3243b73942..72992e5031 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -1192,7 +1192,7 @@ mod test {
 
         // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
         let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
+        let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap();
         let range = input_start..input_end;
 
         // Build an image layer to filter
@@ -1235,7 +1235,7 @@ mod test {
             let shard_identity = ShardIdentity::new(
                 ShardNumber(shard_number),
                 shard_count,
-                ShardStripeSize(0x8000),
+                ShardStripeSize(0x800),
             )
             .unwrap();
             let harness = TenantHarness::create_custom(
@@ -1287,12 +1287,12 @@ mod test {
 
             // This exact size and those below will need updating as/when the layer encoding changes, but
             // should be deterministic for a given version of the format, as we used no randomness generating the input.
-            assert_eq!(original_size, 1597440);
+            assert_eq!(original_size, 122880);
 
             match shard_number {
                 0 => {
                     // We should have written out just one stripe for our shard identity
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                     let replacement = replacement.unwrap();
 
                     // We should have dropped some of the data
@@ -1300,7 +1300,7 @@ mod test {
                     assert!(replacement.metadata().file_size > 0);
 
                     // Assert that we dropped ~3/4 of the data.
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                 }
                 1 => {
                     // Shard 1 has no keys in our input range
@@ -1309,19 +1309,19 @@ mod test {
                 }
                 2 => {
                     // Shard 2 has one stripes in the input range
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                     let replacement = replacement.unwrap();
                     assert!(replacement.metadata().file_size < original_size);
                     assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                 }
                 3 => {
                     // Shard 3 has two stripes in the input range
-                    assert_eq!(wrote_keys, 0x10000);
+                    assert_eq!(wrote_keys, 0x1000);
                     let replacement = replacement.unwrap();
                     assert!(replacement.metadata().file_size < original_size);
                     assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 811008);
+                    assert_eq!(replacement.metadata().file_size, 73728);
                 }
                 _ => unreachable!(),
             }

From 342607473ad4afcee7e199de8ee2c133c23b73b9 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Thu, 10 Apr 2025 14:55:51 -0500
Subject: [PATCH 107/140] Make Endpoint::respec_deep() infinitely deep (#11527)

Because it wasn't recursive, there was a limit to the depth of updates.
This work is necessary because as we teach neon_local and compute_ctl
that the content in --spec-path should match a similar structure we get
from the control plane, the spec object itself will no longer be
toplevel. It will be under the "spec" key.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 test_runner/fixtures/neon_fixtures.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ba8de1c01c..858d367abf 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -14,6 +14,7 @@ import threading
 import time
 import uuid
 from collections import defaultdict
+from collections.abc import Mapping
 from contextlib import closing, contextmanager
 from dataclasses import dataclass
 from datetime import datetime
@@ -4296,28 +4297,29 @@ class Endpoint(PgProtocol, LogUtils):
 
     def respec_deep(self, **kwargs: Any) -> None:
         """
-        Update the endpoint.json file taking into account nested keys.
-        It does one level deep update. Should enough for most cases.
-        Distinct method from respec() to do not break existing functionality.
+        Update the spec.json file taking into account nested keys.
+        Distinct method from respec() to not break existing functionality.
         NOTE: This method also updates the spec.json file, not endpoint.json.
         We need it because neon_local also writes to spec.json, so intended
         use-case is i) start endpoint with some config, ii) respec_deep(),
         iii) call reconfigure() to apply the changes.
         """
+
+        def update(curr, patch):
+            for k, v in patch.items():
+                if isinstance(v, Mapping):
+                    curr[k] = update(curr.get(k, {}), v)
+                else:
+                    curr[k] = v
+            return curr
+
         config_path = os.path.join(self.endpoint_path(), "spec.json")
         with open(config_path) as f:
             data_dict: dict[str, Any] = json.load(f)
 
         log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4))
 
-        for key, value in kwargs.items():
-            if isinstance(value, dict):
-                if key not in data_dict:
-                    data_dict[key] = value
-                else:
-                    data_dict[key] = {**data_dict[key], **value}
-            else:
-                data_dict[key] = value
+        update(data_dict, kwargs)
 
         with open(config_path, "w") as file:
             log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4))

From 4c4e33bc2e70cb56fa796c380f58631ec6083fe6 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Fri, 11 Apr 2025 10:11:35 +0400
Subject: [PATCH 108/140] storage: add http/https server and cert resover
 metrics (#11450)

## Problem
We need to export some metrics about certs/connections to configure
alerts and make sure that all HTTP requests are gone before turning
https-only mode on.
- Closes: https://github.com/neondatabase/cloud/issues/25526

## Summary of changes
- Add started connection and connection error metrics to http/https
Server.
- Add certificate expiration time and reload metrics to
ReloadingCertificateResolver.
---
 Cargo.lock                       |   1 +
 libs/http-utils/Cargo.toml       |   1 +
 libs/http-utils/src/server.rs    |  43 +++++++++++++
 libs/http-utils/src/tls_certs.rs | 106 ++++++++++++++++++++++++++++---
 pageserver/src/bin/pageserver.rs |   1 +
 safekeeper/src/http/mod.rs       |   1 +
 storage_controller/src/main.rs   |   1 +
 test_runner/regress/test_ssl.py  |  61 ++++++++++++++++++
 8 files changed, 207 insertions(+), 8 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index aea8924f4f..5d2cdcea27 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2837,6 +2837,7 @@ dependencies = [
  "utils",
  "uuid",
  "workspace_hack",
+ "x509-cert",
 ]
 
 [[package]]
diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml
index 6d24ee352a..5f6578f76e 100644
--- a/libs/http-utils/Cargo.toml
+++ b/libs/http-utils/Cargo.toml
@@ -30,6 +30,7 @@ tokio.workspace = true
 tracing.workspace = true
 url.workspace = true
 uuid.workspace = true
+x509-cert.workspace = true
 
 # to use tokio channels as streams, this is faster to compile than async_stream
 # why is it only here? no other crate should use it, streams are rarely needed.
diff --git a/libs/http-utils/src/server.rs b/libs/http-utils/src/server.rs
index 07fd56ac01..f93f71c962 100644
--- a/libs/http-utils/src/server.rs
+++ b/libs/http-utils/src/server.rs
@@ -4,6 +4,8 @@ use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use hyper0::Body;
 use hyper0::server::conn::Http;
+use metrics::{IntCounterVec, register_int_counter_vec};
+use once_cell::sync::Lazy;
 use routerify::{RequestService, RequestServiceBuilder};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
@@ -26,6 +28,24 @@ pub struct Server {
     tls_acceptor: Option<TlsAcceptor>,
 }
 
+static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_started_total",
+        "Number of established http/https connections",
+        &["scheme"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_errors_total",
+        "Number of occured connection errors by type",
+        &["type"]
+    )
+    .expect("failed to define a metric")
+});
+
 impl Server {
     pub fn new(
         request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
@@ -60,6 +80,15 @@ impl Server {
             false
         }
 
+        let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
+        let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
+        let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
+        let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
+        let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);
+
+        let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
+        let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);
+
         let mut connections = FuturesUnordered::new();
         loop {
             tokio::select! {
@@ -67,6 +96,7 @@ impl Server {
                     let (tcp_stream, remote_addr) = match stream {
                         Ok(stream) => stream,
                         Err(err) => {
+                            tcp_error_cnt.inc();
                             if !suppress_io_error(&err) {
                                 info!("Failed to accept TCP connection: {err:#}");
                             }
@@ -78,11 +108,18 @@ impl Server {
                     let tls_acceptor = self.tls_acceptor.clone();
                     let cancel = cancel.clone();
 
+                    let tls_error_cnt = tls_error_cnt.clone();
+                    let http_error_cnt = http_error_cnt.clone();
+                    let https_error_cnt = https_error_cnt.clone();
+                    let http_connection_cnt = http_connection_cnt.clone();
+                    let https_connection_cnt = https_connection_cnt.clone();
+
                     connections.push(tokio::spawn(
                         async move {
                             match tls_acceptor {
                                 Some(tls_acceptor) => {
                                     // Handle HTTPS connection.
+                                    https_connection_cnt.inc();
                                     let tls_stream = tokio::select! {
                                         tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
                                         _ = cancel.cancelled() => return,
@@ -90,6 +127,7 @@ impl Server {
                                     let tls_stream = match tls_stream {
                                         Ok(tls_stream) => tls_stream,
                                         Err(err) => {
+                                            tls_error_cnt.inc();
                                             if !suppress_io_error(&err) {
                                                 info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
                                             }
@@ -97,6 +135,7 @@ impl Server {
                                         }
                                     };
                                     if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
+                                        https_error_cnt.inc();
                                         if !suppress_hyper_error(&err) {
                                             info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
                                         }
@@ -104,7 +143,9 @@ impl Server {
                                 }
                                 None => {
                                     // Handle HTTP connection.
+                                    http_connection_cnt.inc();
                                     if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
+                                        http_error_cnt.inc();
                                         if !suppress_hyper_error(&err) {
                                             info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
                                         }
@@ -115,6 +156,7 @@ impl Server {
                  }
                 Some(conn) = connections.next() => {
                     if let Err(err) = conn {
+                        panic_error_cnt.inc();
                         error!("Connection panicked: {err:#}");
                     }
                 }
@@ -122,6 +164,7 @@ impl Server {
                     // Wait for graceful shutdown of all connections.
                     while let Some(conn) = connections.next().await {
                         if let Err(err) = conn {
+                            panic_error_cnt.inc();
                             error!("Connection panicked: {err:#}");
                         }
                     }
diff --git a/libs/http-utils/src/tls_certs.rs b/libs/http-utils/src/tls_certs.rs
index 0c18d84d98..2799db78a6 100644
--- a/libs/http-utils/src/tls_certs.rs
+++ b/libs/http-utils/src/tls_certs.rs
@@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration};
 use anyhow::Context;
 use arc_swap::ArcSwap;
 use camino::Utf8Path;
+use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
+use once_cell::sync::Lazy;
 use rustls::{
-    pki_types::{CertificateDer, PrivateKeyDer},
+    pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
     server::{ClientHello, ResolvesServerCert},
     sign::CertifiedKey,
 };
+use x509_cert::der::Reader;
 
 pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
     let cert_data = tokio::fs::read(filename)
@@ -53,6 +56,76 @@ pub async fn load_certified_key(
     Ok(certified_key)
 }
 
+/// rustls's CertifiedKey with extra parsed fields used for metrics.
+struct ParsedCertifiedKey {
+    certified_key: CertifiedKey,
+    expiration_time: UnixTime,
+}
+
+/// Parse expiration time from an X509 certificate.
+fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
+    let parsed_cert = x509_cert::der::SliceReader::new(cert)
+        .context("Failed to parse cerficiate")?
+        .decode::<x509_cert::Certificate>()
+        .context("Failed to parse cerficiate")?;
+
+    Ok(UnixTime::since_unix_epoch(
+        parsed_cert
+            .tbs_certificate
+            .validity
+            .not_after
+            .to_unix_duration(),
+    ))
+}
+
+async fn load_and_parse_certified_key(
+    key_filename: &Utf8Path,
+    cert_filename: &Utf8Path,
+) -> anyhow::Result<ParsedCertifiedKey> {
+    let certified_key = load_certified_key(key_filename, cert_filename).await?;
+    let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
+    Ok(ParsedCertifiedKey {
+        certified_key,
+        expiration_time,
+    })
+}
+
+static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "tls_certs_expiration_time_seconds",
+        "Expiration time of the loaded certificate since unix epoch in seconds",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_started_total",
+        "Number of certificate reload loop iterations started",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_updated_total",
+        "Number of times the certificate was updated to the new one",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_failed_total",
+        "Number of times the certificate reload failed",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
 /// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
 /// the disk periodically.
 #[derive(Debug)]
@@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver {
 impl ReloadingCertificateResolver {
     /// Creates a new Resolver by loading certificate and private key from FS and
     /// creating tokio::task to reload them with provided reload_period.
+    /// resolver_name is used as metric's label.
     pub async fn new(
+        resolver_name: &str,
         key_filename: &Utf8Path,
         cert_filename: &Utf8Path,
         reload_period: Duration,
     ) -> anyhow::Result<Arc<Self>> {
+        // Create metrics for current resolver.
+        let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
+        let cert_reload_started_counter =
+            CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_updated_counter =
+            CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_failed_counter =
+            CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);
+
+        let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;
+
         let this = Arc::new(Self {
-            certified_key: ArcSwap::from_pointee(
-                load_certified_key(key_filename, cert_filename).await?,
-            ),
+            certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
         });
+        cert_expiration_time.set(parsed_key.expiration_time.as_secs());
 
         tokio::spawn({
             let weak_this = Arc::downgrade(&this);
@@ -88,17 +173,22 @@ impl ReloadingCertificateResolver {
                         Some(this) => this,
                         None => break, // Resolver has been destroyed, exit.
                     };
-                    match load_certified_key(&key_filename, &cert_filename).await {
-                        Ok(new_certified_key) => {
-                            if new_certified_key.cert == this.certified_key.load().cert {
+                    cert_reload_started_counter.inc();
+
+                    match load_and_parse_certified_key(&key_filename, &cert_filename).await {
+                        Ok(parsed_key) => {
+                            if parsed_key.certified_key.cert == this.certified_key.load().cert {
                                 tracing::debug!("Certificate has not changed since last reloading");
                             } else {
                                 tracing::info!("Certificate has been reloaded");
-                                this.certified_key.store(Arc::new(new_certified_key));
+                                this.certified_key.store(Arc::new(parsed_key.certified_key));
+                                cert_expiration_time.set(parsed_key.expiration_time.as_secs());
+                                cert_reload_updated_counter.inc();
                             }
                             last_reload_failed = false;
                         }
                         Err(err) => {
+                            cert_reload_failed_counter.inc();
                             // Note: Reloading certs may fail if it conflicts with the script updating
                             // the files at the same time. Warn only if the error is persistent.
                             if last_reload_failed {
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 2740f81758..250d4180f5 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -455,6 +455,7 @@ fn start_pageserver(
     let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
     {
         let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
+            "main",
             &conf.ssl_key_file,
             &conf.ssl_cert_file,
             conf.ssl_cert_reload_period,
diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs
index 003a75faa6..6e7c5d971d 100644
--- a/safekeeper/src/http/mod.rs
+++ b/safekeeper/src/http/mod.rs
@@ -31,6 +31,7 @@ pub async fn task_main_https(
     global_timelines: Arc<GlobalTimelines>,
 ) -> anyhow::Result<()> {
     let cert_resolver = ReloadingCertificateResolver::new(
+        "main",
         &conf.ssl_key_file,
         &conf.ssl_cert_file,
         conf.ssl_cert_reload_period,
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 1aa9ae10ae..9358c9da4d 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -472,6 +472,7 @@ async fn async_main() -> anyhow::Result<()> {
             let https_listener = tcp_listener::bind(https_addr)?;
 
             let resolver = ReloadingCertificateResolver::new(
+                "main",
                 &args.ssl_key_file,
                 &args.ssl_cert_file,
                 *args.ssl_cert_reload_period,
diff --git a/test_runner/regress/test_ssl.py b/test_runner/regress/test_ssl.py
index 9a7204ca17..39c94c05a9 100644
--- a/test_runner/regress/test_ssl.py
+++ b/test_runner/regress/test_ssl.py
@@ -1,5 +1,6 @@
 import os
 import ssl
+from datetime import datetime, timedelta
 
 import pytest
 import requests
@@ -151,3 +152,63 @@ def test_certificate_rotation(neon_env_builder: NeonEnvBuilder):
     requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
     cur_cert = ssl.get_server_certificate(("localhost", port))
     assert cur_cert == sk_cert
+
+
+def test_server_and_cert_metrics(neon_env_builder: NeonEnvBuilder):
+    """
+    Test metrics exported from http/https server and tls cert reloader.
+    """
+    neon_env_builder.use_https_pageserver_api = True
+    neon_env_builder.pageserver_config_override = "ssl_cert_reload_period='100 ms'"
+    env = neon_env_builder.init_start()
+
+    env.pageserver.allowed_errors.append(".*Error reloading certificate.*")
+
+    ps_client = env.pageserver.http_client()
+
+    # 1. Test connection started metric.
+    filter_https = {"scheme": "https"}
+    old_https_conn_count = (
+        ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
+    )
+
+    addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status"
+    requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
+
+    new_https_conn_count = (
+        ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
+    )
+    # The counter should increase after the request,
+    # but it may increase by more than one because of storcon requests.
+    assert new_https_conn_count > old_https_conn_count
+
+    # 2. Test tls connection error.
+    # Request without specified CA cert file should fail.
+    with pytest.raises(requests.exceptions.SSLError):
+        requests.get(addr)
+
+    tls_error_cnt = (
+        ps_client.get_metric_value("http_server_connection_errors_total", {"type": "tls"}) or 0
+    )
+    assert tls_error_cnt == 1
+
+    # 3. Test expiration time metric.
+    expiration_time = datetime.fromtimestamp(
+        ps_client.get_metric_value("tls_certs_expiration_time_seconds") or 0
+    )
+    now = datetime.now()
+    # neon_local generates certs valid for 100 years.
+    # Compare with +-1 year to not care about leap years.
+    assert now + timedelta(days=365 * 99) < expiration_time < now + timedelta(days=365 * 101)
+
+    # 4. Test cert reload failed metric.
+    reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total")
+    assert reload_error_cnt == 0
+
+    os.remove(env.pageserver.workdir / "server.crt")
+
+    def reload_failed():
+        reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") or 0
+        assert reload_error_cnt > 0
+
+    wait_until(reload_failed)

From 8884865bca564033b3e1632d088f21651f1a90be Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 11 Apr 2025 11:38:05 +0200
Subject: [PATCH 109/140] tests: make `test_pageserver_getpage_throttle` less
 flaky  (#11482)

# Refs

- fixes https://github.com/neondatabase/neon/issues/11395

# Problem

Since 2025-03-10, we have observed increased flakiness of
`test_pageserver_getpage_throttle`.

The test is timing-dependent by nature, and was hitting the

```
 assert duration_secs >= 10 * actual_smgr_query_seconds, (
        "smgr metrics should not include throttle wait time"
    )
```

quite frequently.

# Analysis

These failures are not reproducible.

In this PR's history is a commit that reran the test 100 times without
requiring a single retry.

In https://github.com/neondatabase/neon/issues/11395 there is a link to
a query to the test results database.
It shows that the flakiness was not constant, but rather episodic:
2025-03-{10,11,12,13} 2025-03-{19,20,21} 2025-03-31 and 2025-04-01.

To me, this suggests variability in available CPU.

# Solution

The point of the offending assertion is to ensure that most of the
request latency is spent on throttling, because testing of the
throttling mechanism is the point of the test.
The `10` magic number means at most 10% of mean latency may be spent on
request processing.

Ideally we would control the passage of time (virtual clock source) to
make this test deterministic.

But I don't see that happening in our regression test setup.

So, this PR de-flakes the test as follows:
- allot up to 66% of mean latency for request processing
- increase duration from 10s to 20s, hoping to get better protection
from momentary CPU spikes in noisy neighbor tests or VMs on the runner
host

As a drive-by, switch to `pytest.approx` and remove one self-test
assertion I can't make sense of anymore.
---
 .../test_pageserver_getpage_throttle.py       | 24 ++++++++-----------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index bc186477e1..5ef63e2fe9 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -15,7 +15,6 @@ if TYPE_CHECKING:
     from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
 
 
-@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/11395")
 def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     env = neon_env_builder.init_start()
 
@@ -96,17 +95,12 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
     _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None))
 
     log.info("run pagebench")
-    duration_secs = 10
+    duration_secs = 20
     actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs)
 
     log.info("validate the client is capped at the configured rps limit")
     expect_ncompleted = duration_secs * rate_limit_rps
-    delta_abs = abs(expect_ncompleted - actual_ncompleted)
-    threshold = 0.05 * expect_ncompleted
-    assert threshold / rate_limit_rps < 0.1 * duration_secs, (
-        "test self-test: unrealistic expecations regarding precision in this test"
-    )
-    assert delta_abs < 0.05 * expect_ncompleted, (
+    assert pytest.approx(expect_ncompleted, 0.05) == actual_ncompleted, (
         "the throttling deviates more than 5percent from the expectation"
     )
 
@@ -120,6 +114,7 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
         timeout=compaction_period,
     )
 
+    log.info("validate the metrics")
     smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
     assert smgr_query_seconds_post is not None
     throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
@@ -128,12 +123,13 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
     actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
     actual_throttled_secs = actual_throttled_usecs / 1_000_000
 
-    log.info("validate that the metric doesn't include throttle wait time")
-    assert duration_secs >= 10 * actual_smgr_query_seconds, (
-        "smgr metrics should not include throttle wait time"
+    assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, (
+        "throttling and processing latency = total request time; this assert validates thi holds on average"
     )
 
-    log.info("validate that the throttling wait time metrics is correct")
-    assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, (
-        "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
+    # without this assertion, the test would pass even if the throttling was completely broken
+    # but the request processing is so slow that it makes up for the latency that a correct throttling
+    # implementation would add
+    assert actual_smgr_query_seconds < 0.66 * duration_secs, (
+        "test self-test: request processing is consuming most of the wall clock time; this risks that we're not actually testing throttling"
     )

From 979fa0682b09bcbac5d74d2f1082a12c173c676f Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 11 Apr 2025 11:55:49 +0200
Subject: [PATCH 110/140] tests: update batching perf test workload to include
 scattered LSNs (#11391)

The batching perf test workload is currently read-only sequential scans.
However, realistic workloads have concurrent writes (to other pages)
going on.

This PR simulates concurrent writes to other pages by emitting logical
replication messages.

These degrade the achieved batching factor, for the reason see
- https://github.com/neondatabase/neon/issues/10765

PR
- https://github.com/neondatabase/neon/pull/11494

will fix this problem and get batching factor back up.

---------

Co-authored-by: Vlad Lazar <vlad@neon.tech>
---
 test_runner/fixtures/neon_fixtures.py         |  3 +-
 .../pageserver/test_page_service_batching.py  | 88 +++++++++++++------
 2 files changed, 65 insertions(+), 26 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 858d367abf..75a0596f58 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2884,13 +2884,14 @@ class NeonPageserver(PgProtocol, LogUtils):
         self,
         immediate: bool = False,
         timeout_in_seconds: int | None = None,
+        extra_env_vars: dict[str, str] | None = None,
     ):
         """
         High level wrapper for restart: restarts the process, and waits for
         tenant state to stabilize.
         """
         self.stop(immediate=immediate)
-        self.start(timeout_in_seconds=timeout_in_seconds)
+        self.start(timeout_in_seconds=timeout_in_seconds, extra_env_vars=extra_env_vars)
         self.quiesce_tenants()
 
     def quiesce_tenants(self):
diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py
index 2c27368001..5169add6cb 100644
--- a/test_runner/performance/pageserver/test_page_service_batching.py
+++ b/test_runner/performance/pageserver/test_page_service_batching.py
@@ -1,5 +1,8 @@
+import concurrent.futures
 import dataclasses
 import json
+import re
+import threading
 import time
 from dataclasses import dataclass
 from pathlib import Path
@@ -31,15 +34,15 @@ class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig):
     mode: str = "pipelined"
 
 
-EXECUTION = ["concurrent-futures", "tasks"]
+EXECUTION = ["concurrent-futures"]
 
 NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
 for max_batch_size in [1, 32]:
     for execution in EXECUTION:
         NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
 
-BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
-for max_batch_size in [1, 2, 4, 8, 16, 32]:
+BATCHABLE: list[PageServicePipeliningConfig] = []
+for max_batch_size in [32]:
     for execution in EXECUTION:
         BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
 
@@ -47,19 +50,6 @@ for max_batch_size in [1, 2, 4, 8, 16, 32]:
 @pytest.mark.parametrize(
     "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
     [
-        # non-batchable workloads
-        # (A separate benchmark will consider latency).
-        *[
-            (
-                50,
-                config,
-                TARGET_RUNTIME,
-                1,
-                128,
-                f"not batchable {dataclasses.asdict(config)}",
-            )
-            for config in NON_BATCHABLE
-        ],
         # batchable workloads should show throughput and CPU efficiency improvements
         *[
             (
@@ -137,7 +127,14 @@ def test_throughput(
 
     env = neon_env_builder.init_start()
     ps_http = env.pageserver.http_client()
-    endpoint = env.endpoints.create_start("main")
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            # minimal lfc & small shared buffers to force requests to pageserver
+            "neon.max_file_cache_size=1MB",
+            "shared_buffers=10MB",
+        ],
+    )
     conn = endpoint.connect()
     cur = conn.cursor()
 
@@ -155,7 +152,6 @@ def test_throughput(
     tablesize = tablesize_mib * 1024 * 1024
     npages = tablesize // (8 * 1024)
     cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
-    # TODO: can we force postgres to do sequential scans?
 
     #
     # Run the workload, collect `Metrics` before and after, calculate difference, normalize.
@@ -211,31 +207,73 @@ def test_throughput(
                 ).value,
             )
 
-    def workload() -> Metrics:
+    def workload(disruptor_started: threading.Event) -> Metrics:
+        disruptor_started.wait()
         start = time.time()
         iters = 0
         while time.time() - start < target_runtime or iters < 2:
-            log.info("Seqscan %d", iters)
             if iters == 1:
                 # round zero for warming up
                 before = get_metrics()
-            cur.execute(
-                "select clear_buffer_cache()"
-            )  # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests
             cur.execute("select sum(data::bigint) from t")
             assert cur.fetchall()[0][0] == npages * (npages + 1) // 2
             iters += 1
         after = get_metrics()
         return (after - before).normalize(iters - 1)
 
+    def disruptor(disruptor_started: threading.Event, stop_disruptor: threading.Event):
+        conn = endpoint.connect()
+        cur = conn.cursor()
+        iters = 0
+        while True:
+            cur.execute("SELECT pg_logical_emit_message(true, 'test', 'advancelsn')")
+            if stop_disruptor.is_set():
+                break
+            disruptor_started.set()
+            iters += 1
+            time.sleep(0.001)
+        return iters
+
     env.pageserver.patch_config_toml_nonrecursive(
         {"page_service_pipelining": dataclasses.asdict(pipelining_config)}
     )
-    env.pageserver.restart()
-    metrics = workload()
+
+    # set trace for log analysis below
+    env.pageserver.restart(extra_env_vars={"RUST_LOG": "info,pageserver::page_service=trace"})
+
+    log.info("Starting workload")
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        disruptor_started = threading.Event()
+        stop_disruptor = threading.Event()
+        disruptor_fut = executor.submit(disruptor, disruptor_started, stop_disruptor)
+        workload_fut = executor.submit(workload, disruptor_started)
+        metrics = workload_fut.result()
+        stop_disruptor.set()
+        ndisruptions = disruptor_fut.result()
+        log.info("Disruptor issued %d disrupting requests", ndisruptions)
 
     log.info("Results: %s", metrics)
 
+    since_last_start: list[str] = []
+    for line in env.pageserver.logfile.read_text().splitlines():
+        if "git:" in line:
+            since_last_start = []
+        since_last_start.append(line)
+
+    stopping_batching_because_re = re.compile(
+        r"stopping batching because (LSN changed|of batch size|timeline object mismatch|batch key changed|same page was requested at different LSNs|.*)"
+    )
+    reasons_for_stopping_batching = {}
+    for line in since_last_start:
+        match = stopping_batching_because_re.search(line)
+        if match:
+            if match.group(1) not in reasons_for_stopping_batching:
+                reasons_for_stopping_batching[match.group(1)] = 0
+            reasons_for_stopping_batching[match.group(1)] += 1
+
+    log.info("Reasons for stopping batching: %s", reasons_for_stopping_batching)
+
     #
     # Sanity-checks on the collected data
     #

From 3c8565a1941b3ad4cfb0c1de068d7e4bec7baf0a Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Fri, 11 Apr 2025 13:31:12 +0200
Subject: [PATCH 111/140] test_runner: propagate config via `attach_hook` for
 test fix (#11529)

## Problem

The `pagebench` benchmarks set up an initial dataset by creating a
template tenant, copying the remote storage to a bunch of new tenants,
and attaching them to Pageservers.

In #11420, we found that
`test_pageserver_characterize_throughput_with_n_tenants` had degraded
performance because it set a custom tenant config in Pageservers that
was then replaced with the default tenant config by the storage
controller.

The initial fix was to register the tenants directly in the storage
controller, but this created the tenants with generation 1. This broke
`test_basebackup_with_high_slru_count`, where the template tenant was at
generation 2, leading to all layer files at generation 2 being ignored.

Resolves #11485.
Touches #11381.

## Summary of changes

This patch addresses both test issues by modifying `attach_hook` to also
take a custom tenant config. This allows attaching tenants to
Pageservers from pre-existing remote storage, specifying both the
generation and tenant config when registering them in the storage
controller.
---
 control_plane/src/storage_controller.rs       |  8 +++++--
 storage_controller/src/service.rs             | 24 ++++++++++---------
 test_runner/fixtures/neon_fixtures.py         | 12 ++++++----
 .../fixtures/pageserver/many_tenants.py       | 12 ++++------
 .../pagebench/test_large_slru_basebackup.py   | 12 ++++------
 5 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 8000576e87..a4b56ae5c0 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -13,7 +13,9 @@ use pageserver_api::controller_api::{
     NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
     TenantCreateResponse, TenantLocateResponse,
 };
-use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
+use pageserver_api::models::{
+    TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
+};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
@@ -82,7 +84,8 @@ impl NeonStorageControllerStopArgs {
 pub struct AttachHookRequest {
     pub tenant_shard_id: TenantShardId,
     pub node_id: Option<NodeId>,
-    pub generation_override: Option<i32>,
+    pub generation_override: Option<i32>, // only new tenants
+    pub config: Option<TenantConfig>,     // only new tenants
 }
 
 #[derive(Serialize, Deserialize)]
@@ -805,6 +808,7 @@ impl StorageController {
             tenant_shard_id,
             node_id: Some(pageserver_id),
             generation_override: None,
+            config: None,
         };
 
         let response = self
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 4790f80162..0982e56155 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1852,6 +1852,7 @@ impl Service {
         };
 
         if insert {
+            let config = attach_req.config.clone().unwrap_or_default();
             let tsp = TenantShardPersistence {
                 tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
                 shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
@@ -1860,7 +1861,7 @@ impl Service {
                 generation: attach_req.generation_override.or(Some(0)),
                 generation_pageserver: None,
                 placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
-                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
+                config: serde_json::to_string(&config).unwrap(),
                 splitting: SplitState::default(),
                 scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                     .unwrap(),
@@ -1883,16 +1884,16 @@ impl Service {
                 Ok(()) => {
                     tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
 
-                    let mut locked = self.inner.write().unwrap();
-                    locked.tenants.insert(
+                    let mut shard = TenantShard::new(
                         attach_req.tenant_shard_id,
-                        TenantShard::new(
-                            attach_req.tenant_shard_id,
-                            ShardIdentity::unsharded(),
-                            PlacementPolicy::Attached(0),
-                            None,
-                        ),
+                        ShardIdentity::unsharded(),
+                        PlacementPolicy::Attached(0),
+                        None,
                     );
+                    shard.config = config;
+
+                    let mut locked = self.inner.write().unwrap();
+                    locked.tenants.insert(attach_req.tenant_shard_id, shard);
                     tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
                 }
             }
@@ -1977,11 +1978,12 @@ impl Service {
             .set_attached(scheduler, attach_req.node_id);
 
         tracing::info!(
-            "attach_hook: tenant {} set generation {:?}, pageserver {}",
+            "attach_hook: tenant {} set generation {:?}, pageserver {}, config {:?}",
             attach_req.tenant_shard_id,
             tenant_shard.generation,
             // TODO: this is an odd number of 0xf's
-            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
+            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)),
+            attach_req.config,
         );
 
         // Trick the reconciler into not doing anything for this tenant: this helps
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 75a0596f58..9d4068b583 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1986,10 +1986,13 @@ class NeonStorageController(MetricsGetter, LogUtils):
         tenant_shard_id: TenantId | TenantShardId,
         pageserver_id: int,
         generation_override: int | None = None,
+        config: None | dict[str, Any] = None,
     ) -> int:
         body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}
         if generation_override is not None:
             body["generation_override"] = generation_override
+        if config is not None:
+            body["config"] = config
 
         response = self.request(
             "POST",
@@ -2980,11 +2983,12 @@ class NeonPageserver(PgProtocol, LogUtils):
         to call into the pageserver HTTP client.
         """
         client = self.http_client()
-        if generation is None:
-            generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
-        elif override_storage_controller_generation:
+        if generation is None or override_storage_controller_generation:
             generation = self.env.storage_controller.attach_hook_issue(
-                tenant_id, self.id, generation
+                tenant_id,
+                self.id,
+                generation_override=generation if override_storage_controller_generation else None,
+                config=config,
             )
         return client.tenant_attach(
             tenant_id,
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index eedb693e3d..71c750b9eb 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -65,13 +65,11 @@ def single_timeline(
     assert ps_http.tenant_list() == []
 
     def attach(tenant):
-        # NB: create the new tenant in the storage controller with the correct tenant config. This
-        # will pick up the existing tenant data from remote storage. If we just attach it to the
-        # Pageserver, the storage controller will reset the tenant config to the default.
-        env.create_tenant(
-            tenant_id=tenant,
-            timeline_id=template_timeline,
-            conf=template_config,
+        env.pageserver.tenant_attach(
+            tenant,
+            config=template_config,
+            generation=100,
+            override_storage_controller_generation=True,
         )
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
index efd423104d..8af52dcbd0 100644
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -66,11 +66,11 @@ def test_basebackup_with_high_slru_count(
 
     n_txns = 500000
 
-    def setup_wrapper(env: NeonEnv):
-        return setup_tenant_template(env, n_txns)
-
     env = setup_pageserver_with_tenants(
-        neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper
+        neon_env_builder,
+        f"large_slru_count-{n_tenants}-{n_txns}",
+        n_tenants,
+        lambda env: setup_tenant_template(env, n_txns),
     )
     run_benchmark(env, pg_bin, record, duration)
 
@@ -80,10 +80,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):
         "gc_period": "0s",  # disable periodic gc
         "checkpoint_timeout": "10 years",
         "compaction_period": "0s",  # disable periodic compaction
-        "compaction_threshold": 10,
-        "compaction_target_size": 134217728,
-        "checkpoint_distance": 268435456,
-        "image_creation_threshold": 3,
     }
 
     template_tenant, template_timeline = env.create_tenant(set_default=True)

From a6937a3281e7431c46e31720d1e3337f96b867c8 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Fri, 11 Apr 2025 14:14:08 +0200
Subject: [PATCH 112/140] pageserver: improve shard ancestor compaction logging
 (#11535)

## Problem

Shard ancestor compaction always logs "starting shard ancestor
compaction", even if there is no work to do. This is very spammy (every
20 seconds for every shard). It also has limited progress logging.

## Summary of changes

* Only log "starting shard ancestor compaction" when there's work to do.
* Include details about the amount of work.
* Log progress messages for each layer, and when waiting for uploads.
* Log when compaction is completed, with elapsed duration and whether
there is more work for a later iteration.
---
 pageserver/src/tenant/timeline/compaction.rs | 52 +++++++++++++++-----
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 7b1969f209..c6f0e32494 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1140,6 +1140,7 @@ impl Timeline {
     ) -> Result<(), CompactionError> {
         let mut drop_layers = Vec::new();
         let mut layers_to_rewrite: Vec<Layer> = Vec::new();
+        let mut rewrite_max_exceeded: bool = false;
 
         // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
         // layer is behind this Lsn, it indicates that the layer is being retained beyond the
@@ -1148,12 +1149,7 @@ impl Timeline {
         // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
         // are rewriting layers.
         let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
-
-        tracing::info!(
-            "starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}",
-            *latest_gc_cutoff,
-            self.gc_info.read().unwrap().cutoffs.time
-        );
+        let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;
 
         let layers = self.layers.read().await;
         for layer_desc in layers.layer_map()?.iter_historic_layers() {
@@ -1171,8 +1167,8 @@ impl Timeline {
                 // This ancestral layer only covers keys that belong to other shards.
                 // We include the full metadata in the log: if we had some critical bug that caused
                 // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
-                info!(%layer, old_metadata=?layer.metadata(),
-                    "dropping layer after shard split, contains no keys for this shard.",
+                debug!(%layer, old_metadata=?layer.metadata(),
+                    "dropping layer after shard split, contains no keys for this shard",
                 );
 
                 if cfg!(debug_assertions) {
@@ -1234,9 +1230,10 @@ impl Timeline {
             }
 
             if layers_to_rewrite.len() >= rewrite_max {
-                tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
+                debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
                     layers_to_rewrite.len()
                 );
+                rewrite_max_exceeded = true;
                 continue;
             }
 
@@ -1244,9 +1241,24 @@ impl Timeline {
             layers_to_rewrite.push(layer);
         }
 
-        // Drop read lock on layer map before we start doing time-consuming I/O
+        // Drop read lock on layer map before we start doing time-consuming I/O.
         drop(layers);
 
+        // Drop out early if there's nothing to do.
+        if layers_to_rewrite.is_empty() && drop_layers.is_empty() {
+            return Ok(());
+        }
+
+        info!(
+            "starting shard ancestor compaction, rewriting {} layers and dropping {} layers \
+                (latest_gc_cutoff={} pitr_cutoff={})",
+            layers_to_rewrite.len(),
+            drop_layers.len(),
+            *latest_gc_cutoff,
+            pitr_cutoff,
+        );
+        let started = Instant::now();
+
         let mut replace_image_layers = Vec::new();
 
         for layer in layers_to_rewrite {
@@ -1254,7 +1266,7 @@ impl Timeline {
                 return Err(CompactionError::ShuttingDown);
             }
 
-            tracing::info!(layer=%layer, "Rewriting layer after shard split...");
+            info!(layer=%layer, "rewriting layer after shard split");
             let mut image_layer_writer = ImageLayerWriter::new(
                 self.conf,
                 self.timeline_id,
@@ -1292,7 +1304,7 @@ impl Timeline {
                     .map_err(CompactionError::Other)?;
                 let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
                     .map_err(CompactionError::Other)?;
-                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
+                info!(layer=%new_layer, "rewrote layer, {} -> {} bytes",
                     layer.metadata().file_size,
                     new_layer.metadata().file_size);
 
@@ -1304,6 +1316,12 @@ impl Timeline {
             }
         }
 
+        for layer in &drop_layers {
+            info!(%layer, old_metadata=?layer.metadata(),
+                "dropping layer after shard split (no keys for this shard)",
+            );
+        }
+
         // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
         // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
         // to remote index) and be removed. This is inefficient but safe.
@@ -1319,6 +1337,7 @@ impl Timeline {
         // necessary for correctness, but it simplifies testing, and avoids proceeding with another
         // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
         // load.
+        info!("shard ancestor compaction waiting for uploads");
         match self.remote_client.wait_completion().await {
             Ok(()) => (),
             Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
@@ -1327,6 +1346,15 @@ impl Timeline {
             }
         }
 
+        info!(
+            "shard ancestor compaction done in {:.3}s{}",
+            started.elapsed().as_secs_f64(),
+            match rewrite_max_exceeded {
+                true => format!(", more work pending due to rewrite_max={rewrite_max}"),
+                false => String::new(),
+            }
+        );
+
         fail::fail_point!("compact-shard-ancestors-persistent");
 
         Ok(())

From 88f01c1ca1738cc0843f250150dc7e4071e43a2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 11 Apr 2025 16:08:46 +0200
Subject: [PATCH 113/140] Introduce WalIngestError (#11506)

Introduces a `WalIngestError` struct together with a
`WalIngestErrorKind` enum, to be used for walingest related failures and
errors.

* the enum captures backtraces, so we don't regress in comparison to
`anyhow::Error`s (backtraces might be a bit shorter if we use one of the
`anyhow::Error` wrappers)
* it explicitly lists most/all of the potential cases that can occur.

I've originally been inspired to do this in #11496, but it's a
longer-term TODO.
---
 libs/pageserver_api/src/key.rs                |  15 +-
 pageserver/src/http/routes.rs                 |   2 +-
 pageserver/src/import_datadir.rs              |   8 +-
 pageserver/src/pgdatadir_mapping.rs           | 170 ++++++++----------
 .../walreceiver/connection_manager.rs         |   1 +
 .../walreceiver/walreceiver_connection.rs     |  11 +-
 pageserver/src/walingest.rs                   | 154 ++++++++++++----
 7 files changed, 229 insertions(+), 132 deletions(-)

diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 8836e7ec87..0c4d7fd4cb 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -927,7 +927,7 @@ impl Key {
 
     /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
     #[inline(always)]
-    pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
+    pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
         Ok(match self.field1 {
             0x00 => (
                 RelTag {
@@ -938,7 +938,7 @@ impl Key {
                 },
                 self.field6,
             ),
-            _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
+            _ => return Err(ToRelBlockError(self.field1)),
         })
     }
 }
@@ -951,6 +951,17 @@ impl std::str::FromStr for Key {
     }
 }
 
+#[derive(Debug)]
+pub struct ToRelBlockError(u8);
+
+impl fmt::Display for ToRelBlockError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "unexpected value kind 0x{:02x}", self.0)
+    }
+}
+
+impl std::error::Error for ToRelBlockError {}
+
 #[cfg(test)]
 mod tests {
     use std::str::FromStr;
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 9bb761dc48..bbc4bfae1b 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3253,7 +3253,7 @@ async fn ingest_aux_files(
         modification
             .put_file(&fname, content.as_bytes(), &ctx)
             .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
     }
     modification
         .commit(&ctx)
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 6dd005de50..911449c7c5 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -27,7 +27,7 @@ use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
-use crate::walingest::WalIngest;
+use crate::walingest::{WalIngest, WalIngestErrorKind};
 
 // Returns checkpoint LSN from controlfile
 pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
@@ -157,9 +157,9 @@ async fn import_rel(
         .put_rel_creation(rel, nblocks as u32, ctx)
         .await
     {
-        match e {
-            RelationError::AlreadyExists => {
-                debug!("Relation {} already exist. We must be extending it.", rel)
+        match e.kind {
+            WalIngestErrorKind::RelationAlreadyExists(rel) => {
+                debug!("Relation {rel} already exists. We must be extending it.")
             }
             _ => return Err(e.into()),
         }
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 4c5a07ba57..f33a8baec1 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,8 +9,9 @@
 use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
 use std::ops::{ControlFlow, Range};
 
-use crate::PERF_TRACE_TARGET;
-use anyhow::{Context, ensure};
+use crate::walingest::{WalIngestError, WalIngestErrorKind};
+use crate::{PERF_TRACE_TARGET, ensure_walingest};
+use anyhow::Context;
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
 use itertools::Itertools;
@@ -136,12 +137,8 @@ impl From<PageReconstructError> for CalculateLogicalSizeError {
 
 #[derive(Debug, thiserror::Error)]
 pub enum RelationError {
-    #[error("Relation Already Exists")]
-    AlreadyExists,
     #[error("invalid relnode")]
     InvalidRelnode,
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
 }
 
 ///
@@ -1478,8 +1475,8 @@ impl DatadirModification<'_> {
     }
 
     /// Set the current lsn
-    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
-        ensure!(
+    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
+        ensure_walingest!(
             lsn >= self.lsn,
             "setting an older lsn {} than {} is not allowed",
             lsn,
@@ -1578,7 +1575,7 @@ impl DatadirModification<'_> {
         &mut self,
         rel: RelTag,
         ctx: &RequestContext,
-    ) -> Result<u32, PageReconstructError> {
+    ) -> Result<u32, WalIngestError> {
         // Get current size and put rel creation if rel doesn't exist
         //
         // NOTE: we check the cache first even though get_rel_exists and get_rel_size would
@@ -1593,14 +1590,13 @@ impl DatadirModification<'_> {
             .await?
         {
             // create it with 0 size initially, the logic below will extend it
-            self.put_rel_creation(rel, 0, ctx)
-                .await
-                .context("Relation Error")?;
+            self.put_rel_creation(rel, 0, ctx).await?;
             Ok(0)
         } else {
-            self.tline
+            Ok(self
+                .tline
                 .get_rel_size(rel, Version::Modified(self), ctx)
-                .await
+                .await?)
         }
     }
 
@@ -1637,11 +1633,14 @@ impl DatadirModification<'_> {
         // TODO(vlad): remove this argument and replace the shard check with is_key_local
         shard: &ShardIdentity,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let mut gaps_at_lsns = Vec::default();
 
         for meta in batch.metadata.iter() {
-            let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
+            let key = Key::from_compact(meta.key());
+            let (rel, blkno) = key
+                .to_rel_block()
+                .map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?;
             let new_nblocks = blkno + 1;
 
             let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
@@ -1683,8 +1682,8 @@ impl DatadirModification<'_> {
         rel: RelTag,
         blknum: BlockNumber,
         rec: NeonWalRecord,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
         self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
         Ok(())
     }
@@ -1696,7 +1695,7 @@ impl DatadirModification<'_> {
         segno: u32,
         blknum: BlockNumber,
         rec: NeonWalRecord,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         if !self.tline.tenant_shard_id.is_shard_zero() {
             return Ok(());
         }
@@ -1714,14 +1713,11 @@ impl DatadirModification<'_> {
         rel: RelTag,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
         let key = rel_block_to_key(rel, blknum);
         if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
         }
         self.put(rel_block_to_key(rel, blknum), Value::Image(img));
         Ok(())
@@ -1733,15 +1729,12 @@ impl DatadirModification<'_> {
         segno: u32,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         assert!(self.tline.tenant_shard_id.is_shard_zero());
 
         let key = slru_block_to_key(kind, segno, blknum);
         if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
         }
         self.put(key, Value::Image(img));
         Ok(())
@@ -1751,15 +1744,11 @@ impl DatadirModification<'_> {
         &mut self,
         rel: RelTag,
         blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
         let key = rel_block_to_key(rel, blknum);
         if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
         }
 
         let batch = self
@@ -1776,15 +1765,11 @@ impl DatadirModification<'_> {
         kind: SlruKind,
         segno: u32,
         blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         assert!(self.tline.tenant_shard_id.is_shard_zero());
         let key = slru_block_to_key(kind, segno, blknum);
         if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
         }
 
         let batch = self
@@ -1832,8 +1817,10 @@ impl DatadirModification<'_> {
         dbnode: Oid,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+    ) -> Result<(), WalIngestError> {
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
 
         // Add it to the directory (if it doesn't exist already)
         let buf = self.get(DBDIR_KEY, ctx).await?;
@@ -1874,13 +1861,13 @@ impl DatadirModification<'_> {
         xid: u64,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         // Add it to the directory entry
         let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
         let newdirbuf = if self.tline.pg_version >= 17 {
             let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
             if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
+                Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
             }
             self.pending_directory_entries.push((
                 DirectoryKind::TwoPhase,
@@ -1891,7 +1878,7 @@ impl DatadirModification<'_> {
             let xid = xid as u32;
             let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
             if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
+                Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?;
             }
             self.pending_directory_entries.push((
                 DirectoryKind::TwoPhase,
@@ -1909,22 +1896,22 @@ impl DatadirModification<'_> {
         &mut self,
         origin_id: RepOriginId,
         origin_lsn: Lsn,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let key = repl_origin_key(origin_id);
         self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
         Ok(())
     }
 
-    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
+    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> {
         self.set_replorigin(origin_id, Lsn::INVALID).await
     }
 
-    pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
+    pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> {
         self.put(CONTROLFILE_KEY, Value::Image(img));
         Ok(())
     }
 
-    pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
+    pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> {
         self.put(CHECKPOINT_KEY, Value::Image(img));
         Ok(())
     }
@@ -1934,7 +1921,7 @@ impl DatadirModification<'_> {
         spcnode: Oid,
         dbnode: Oid,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let total_blocks = self
             .tline
             .get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
@@ -1973,20 +1960,21 @@ impl DatadirModification<'_> {
         rel: RelTag,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> Result<(), RelationError> {
+    ) -> Result<(), WalIngestError> {
         if rel.relnode == 0 {
-            return Err(RelationError::InvalidRelnode);
+            Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
+                "invalid relnode"
+            )))?;
         }
         // It's possible that this is the first rel for this db in this
         // tablespace.  Create the reldir entry for it if so.
-        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
-            .context("deserialize db")?;
+        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
 
         let dbdir_exists =
             if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
                 // Didn't exist. Update dbdir
                 e.insert(false);
-                let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
+                let buf = DbDirectory::ser(&dbdir)?;
                 self.pending_directory_entries.push((
                     DirectoryKind::Db,
                     MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
@@ -2003,27 +1991,25 @@ impl DatadirModification<'_> {
             RelDirectory::default()
         } else {
             // reldir already exists, fetch it
-            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
-                .context("deserialize db")?
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
         };
 
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
 
         if v2_enabled {
             if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
             }
             let sparse_rel_dir_key =
                 rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
             // check if the rel_dir_key exists in v2
-            let val = self
-                .sparse_get(sparse_rel_dir_key, ctx)
-                .await
-                .map_err(|e| RelationError::Other(e.into()))?;
+            let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
             let val = RelDirExists::decode_option(val)
-                .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                .map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
             if val == RelDirExists::Exists {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
             }
             self.put(
                 sparse_rel_dir_key,
@@ -2039,9 +2025,7 @@ impl DatadirModification<'_> {
                 // will be key not found errors if we don't create an empty one for rel_size_v2.
                 self.put(
                     rel_dir_key,
-                    Value::Image(Bytes::from(
-                        RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
-                    )),
+                    Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)),
                 );
             }
             self.pending_directory_entries
@@ -2049,7 +2033,7 @@ impl DatadirModification<'_> {
         } else {
             // Add the new relation to the rel directory entry, and write it back
             if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
             }
             if !dbdir_exists {
                 self.pending_directory_entries
@@ -2059,9 +2043,7 @@ impl DatadirModification<'_> {
                 .push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
             self.put(
                 rel_dir_key,
-                Value::Image(Bytes::from(
-                    RelDirectory::ser(&rel_dir).context("serialize")?,
-                )),
+                Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
             );
         }
 
@@ -2086,8 +2068,8 @@ impl DatadirModification<'_> {
         rel: RelTag,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
         if self
             .tline
             .get_rel_exists(rel, Version::Modified(self), ctx)
@@ -2117,8 +2099,8 @@ impl DatadirModification<'_> {
         rel: RelTag,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
 
         // Put size
         let size_key = rel_size_to_key(rel);
@@ -2142,8 +2124,10 @@ impl DatadirModification<'_> {
         &mut self,
         drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+    ) -> Result<(), WalIngestError> {
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
         for ((spc_node, db_node), rel_tags) in drop_relations {
             let dir_key = rel_dir_to_key(spc_node, db_node);
             let buf = self.get(dir_key, ctx).await?;
@@ -2163,7 +2147,7 @@ impl DatadirModification<'_> {
                     let key =
                         rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
                     let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
-                        .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                        .map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?;
                     if val == RelDirExists::Exists {
                         self.pending_directory_entries
                             .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
@@ -2206,7 +2190,7 @@ impl DatadirModification<'_> {
         segno: u32,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         assert!(self.tline.tenant_shard_id.is_shard_zero());
 
         // Add it to the directory entry
@@ -2215,7 +2199,7 @@ impl DatadirModification<'_> {
         let mut dir = SlruSegmentDirectory::des(&buf)?;
 
         if !dir.segments.insert(segno) {
-            anyhow::bail!("slru segment {kind:?}/{segno} already exists");
+            Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?;
         }
         self.pending_directory_entries.push((
             DirectoryKind::SlruSegment(kind),
@@ -2242,7 +2226,7 @@ impl DatadirModification<'_> {
         kind: SlruKind,
         segno: u32,
         nblocks: BlockNumber,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         assert!(self.tline.tenant_shard_id.is_shard_zero());
 
         // Put size
@@ -2258,7 +2242,7 @@ impl DatadirModification<'_> {
         kind: SlruKind,
         segno: u32,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         // Remove it from the directory entry
         let dir_key = slru_dir_to_key(kind);
         let buf = self.get(dir_key, ctx).await?;
@@ -2283,7 +2267,7 @@ impl DatadirModification<'_> {
     }
 
     /// Drop a relmapper file (pg_filenode.map)
-    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
+    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> {
         // TODO
         Ok(())
     }
@@ -2293,7 +2277,7 @@ impl DatadirModification<'_> {
         &mut self,
         xid: u64,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         // Remove it from the directory entry
         let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
         let newdirbuf = if self.tline.pg_version >= 17 {
@@ -2308,7 +2292,8 @@ impl DatadirModification<'_> {
             ));
             Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
         } else {
-            let xid: u32 = u32::try_from(xid)?;
+            let xid: u32 = u32::try_from(xid)
+                .map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?;
             let mut dir = TwoPhaseDirectory::des(&buf)?;
 
             if !dir.xids.remove(&xid) {
@@ -2333,7 +2318,7 @@ impl DatadirModification<'_> {
         path: &str,
         content: &[u8],
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let key = aux_file::encode_aux_file_key(path);
         // retrieve the key from the engine
         let old_val = match self.get(key, ctx).await {
@@ -2342,7 +2327,7 @@ impl DatadirModification<'_> {
             Err(e) => return Err(e.into()),
         };
         let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
-            aux_file::decode_file_value(old_val)?
+            aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)?
         } else {
             Vec::new()
         };
@@ -2387,7 +2372,8 @@ impl DatadirModification<'_> {
             }
             (None, true) => warn!("removing non-existing aux file: {}", path),
         }
-        let new_val = aux_file::encode_file_value(&new_files)?;
+        let new_val = aux_file::encode_file_value(&new_files)
+            .map_err(WalIngestErrorKind::EncodeAuxFileError)?;
         self.put(key, Value::Image(new_val.into()));
 
         Ok(())
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index df2663f6bb..3c3608d1bd 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -580,6 +580,7 @@ impl ConnectionManagerState {
                                 );
                                 Ok(())
                             }
+                            WalReceiverError::Cancelled => Ok(()),
                             WalReceiverError::Other(e) => {
                                 // give out an error to have task_mgr give it a really verbose logging
                                 if cancellation.is_cancelled() {
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 6bf05a0f86..52259f205b 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -73,6 +73,7 @@ pub(super) enum WalReceiverError {
     /// Generic error
     Other(anyhow::Error),
     ClosedGate,
+    Cancelled,
 }
 
 impl From<tokio_postgres::Error> for WalReceiverError {
@@ -200,6 +201,9 @@ pub(super) async fn handle_walreceiver_connection(
                                 // with a similar error.
                             },
                             WalReceiverError::SuccessfulCompletion(_) => {}
+                            WalReceiverError::Cancelled => {
+                                debug!("Connection cancelled")
+                            }
                             WalReceiverError::ClosedGate => {
                                 // doesn't happen at runtime
                             }
@@ -273,7 +277,12 @@ pub(super) async fn handle_walreceiver_connection(
 
     let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
 
-    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
+    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
+        .await
+        .map_err(|e| match e.kind {
+            crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
+            _ => WalReceiverError::Other(e.into()),
+        })?;
 
     let shard = vec![*timeline.get_shard_identity()];
 
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 18df065f76..e60c590f87 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,13 +21,13 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.
 
+use std::backtrace::Backtrace;
 use std::collections::HashMap;
 use std::sync::{Arc, OnceLock};
 use std::time::{Duration, Instant, SystemTime};
 
-use anyhow::{Result, bail};
 use bytes::{Buf, Bytes};
-use pageserver_api::key::rel_block_to_key;
+use pageserver_api::key::{Key, rel_block_to_key};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
@@ -38,7 +38,7 @@ use postgres_ffi::{
     fsm_logical_to_physical, pg_constants,
 };
 use tracing::*;
-use utils::bin_ser::SerializeError;
+use utils::bin_ser::{DeserializeError, SerializeError};
 use utils::lsn::Lsn;
 use utils::rate_limit::RateLimit;
 use utils::{critical, failpoint_support};
@@ -104,12 +104,101 @@ struct WarnIngestLag {
     timestamp_invalid_msg_ratelimit: RateLimit,
 }
 
+pub struct WalIngestError {
+    pub backtrace: std::backtrace::Backtrace,
+    pub kind: WalIngestErrorKind,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum WalIngestErrorKind {
+    #[error(transparent)]
+    #[allow(private_interfaces)]
+    PageReconstructError(#[from] PageReconstructError),
+    #[error(transparent)]
+    DeserializationFailure(#[from] DeserializeError),
+    #[error(transparent)]
+    SerializationFailure(#[from] SerializeError),
+    #[error("the request contains data not supported by pageserver: {0} @ {1}")]
+    InvalidKey(Key, Lsn),
+    #[error("twophase file for xid {0} already exists")]
+    FileAlreadyExists(u64),
+    #[error("slru segment {0:?}/{1} already exists")]
+    SlruAlreadyExists(SlruKind, u32),
+    #[error("relation already exists")]
+    RelationAlreadyExists(RelTag),
+    #[error("invalid reldir key {0}")]
+    InvalidRelDirKey(Key),
+
+    #[error(transparent)]
+    LogicalError(anyhow::Error),
+    #[error(transparent)]
+    EncodeAuxFileError(anyhow::Error),
+    #[error(transparent)]
+    MaybeRelSizeV2Error(anyhow::Error),
+
+    #[error("timeline shutting down")]
+    Cancelled,
+}
+
+impl<T> From<T> for WalIngestError
+where
+    WalIngestErrorKind: From<T>,
+{
+    fn from(value: T) -> Self {
+        WalIngestError {
+            backtrace: Backtrace::capture(),
+            kind: WalIngestErrorKind::from(value),
+        }
+    }
+}
+
+impl std::error::Error for WalIngestError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        self.kind.source()
+    }
+}
+
+impl core::fmt::Display for WalIngestError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        self.kind.fmt(f)
+    }
+}
+
+impl core::fmt::Debug for WalIngestError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        if f.alternate() {
+            f.debug_map()
+                .key(&"backtrace")
+                .value(&self.backtrace)
+                .key(&"kind")
+                .value(&self.kind)
+                .finish()
+        } else {
+            writeln!(f, "Error: {:?}", self.kind)?;
+            if self.backtrace.status() == std::backtrace::BacktraceStatus::Captured {
+                writeln!(f, "Stack backtrace: {:?}", self.backtrace)?;
+            }
+            Ok(())
+        }
+    }
+}
+
+#[macro_export]
+macro_rules! ensure_walingest {
+    ($($t:tt)*) => {
+        _ = || -> Result<(), anyhow::Error> {
+            anyhow::ensure!($($t)*);
+            Ok(())
+        }().map_err(WalIngestErrorKind::LogicalError)?;
+    };
+}
+
 impl WalIngest {
     pub async fn new(
         timeline: &Timeline,
         startpoint: Lsn,
         ctx: &RequestContext,
-    ) -> anyhow::Result<WalIngest> {
+    ) -> Result<WalIngest, WalIngestError> {
         // Fetch the latest checkpoint into memory, so that we can compare with it
         // quickly in `ingest_record` and update it when it changes.
         let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
@@ -145,7 +234,7 @@ impl WalIngest {
         interpreted: InterpretedWalRecord,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<bool> {
+    ) -> Result<bool, WalIngestError> {
         WAL_INGEST.records_received.inc();
         let prev_len = modification.len();
 
@@ -288,7 +377,7 @@ impl WalIngest {
     }
 
     /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
-    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
+    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64, WalIngestError> {
         let next_full_xid =
             enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });
 
@@ -298,9 +387,9 @@ impl WalIngest {
         if xid > next_xid {
             // Wraparound occurred, must be from a prev epoch.
             if epoch == 0 {
-                bail!(
+                Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
                     "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"
-                );
+                )))?;
             }
             epoch -= 1;
         }
@@ -313,7 +402,7 @@ impl WalIngest {
         clear_vm_bits: ClearVmBits,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let ClearVmBits {
             new_heap_blkno,
             old_heap_blkno,
@@ -402,7 +491,7 @@ impl WalIngest {
         create: DbaseCreate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let DbaseCreate {
             db_id,
             tablespace_id,
@@ -505,7 +594,7 @@ impl WalIngest {
         dbase_drop: DbaseDrop,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let DbaseDrop {
             db_id,
             tablespace_ids,
@@ -523,7 +612,7 @@ impl WalIngest {
         create: SmgrCreate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let SmgrCreate { rel } = create;
         self.put_rel_creation(modification, rel, ctx).await?;
         Ok(())
@@ -537,7 +626,7 @@ impl WalIngest {
         truncate: XlSmgrTruncate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let XlSmgrTruncate {
             blkno,
             rnode,
@@ -689,7 +778,7 @@ impl WalIngest {
         record: XactRecord,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let (xact_common, is_commit, is_prepared) = match record {
             XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
                 let xid: u64 = if modification.tline.pg_version >= 17 {
@@ -813,7 +902,7 @@ impl WalIngest {
         truncate: ClogTruncate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let ClogTruncate {
             pageno,
             oldest_xid,
@@ -889,7 +978,7 @@ impl WalIngest {
         zero_page: ClogZeroPage,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let ClogZeroPage { segno, rpageno } = zero_page;
 
         self.put_slru_page_image(
@@ -907,7 +996,7 @@ impl WalIngest {
         &mut self,
         modification: &mut DatadirModification,
         xlrec: &XlMultiXactCreate,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         // Create WAL record for updating the multixact-offsets page
         let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
         let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -1010,7 +1099,7 @@ impl WalIngest {
         modification: &mut DatadirModification<'_>,
         xlrec: &XlMultiXactTruncate,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let (maxsegment, startsegment, endsegment) =
             enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
                 cp.oldestMulti = xlrec.end_trunc_off;
@@ -1058,7 +1147,7 @@ impl WalIngest {
         zero_page: MultiXactZeroPage,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let MultiXactZeroPage {
             slru_kind,
             segno,
@@ -1080,7 +1169,7 @@ impl WalIngest {
         update: RelmapUpdate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let RelmapUpdate { update, buf } = update;
 
         modification
@@ -1093,7 +1182,7 @@ impl WalIngest {
         raw_record: RawXlogRecord,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let RawXlogRecord { info, lsn, mut buf } = raw_record;
         let pg_version = modification.tline.pg_version;
 
@@ -1235,12 +1324,12 @@ impl WalIngest {
         put: PutLogicalMessage,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let PutLogicalMessage { path, buf } = put;
         modification.put_file(path.as_str(), &buf, ctx).await
     }
 
-    fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> {
+    fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> {
         match record {
             StandbyRecord::RunningXacts(running_xacts) => {
                 enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
@@ -1258,7 +1347,7 @@ impl WalIngest {
         &mut self,
         record: ReploriginRecord,
         modification: &mut DatadirModification<'_>,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         match record {
             ReploriginRecord::Set(set) => {
                 modification
@@ -1278,7 +1367,7 @@ impl WalIngest {
         modification: &mut DatadirModification<'_>,
         rel: RelTag,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         modification.put_rel_creation(rel, 0, ctx).await?;
         Ok(())
     }
@@ -1291,7 +1380,7 @@ impl WalIngest {
         blknum: BlockNumber,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<(), WalIngestError> {
         self.handle_rel_extend(modification, rel, blknum, ctx)
             .await?;
         modification.put_rel_page_image(rel, blknum, img)?;
@@ -1305,7 +1394,7 @@ impl WalIngest {
         blknum: BlockNumber,
         rec: NeonWalRecord,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         self.handle_rel_extend(modification, rel, blknum, ctx)
             .await?;
         modification.put_rel_wal_record(rel, blknum, rec)?;
@@ -1318,7 +1407,7 @@ impl WalIngest {
         rel: RelTag,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         modification.put_rel_truncation(rel, nblocks, ctx).await?;
         Ok(())
     }
@@ -1329,7 +1418,7 @@ impl WalIngest {
         rel: RelTag,
         blknum: BlockNumber,
         ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<(), WalIngestError> {
         let new_nblocks = blknum + 1;
         // Check if the relation exists. We implicitly create relations on first
         // record.
@@ -1423,7 +1512,7 @@ impl WalIngest {
         blknum: BlockNumber,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         if !self.shard.is_shard_zero() {
             return Ok(());
         }
@@ -1441,7 +1530,7 @@ impl WalIngest {
         segno: u32,
         blknum: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         // we don't use a cache for this like we do for relations. SLRUS are explcitly
         // extended with ZEROPAGE records, not with commit records, so it happens
         // a lot less frequently.
@@ -1509,6 +1598,7 @@ async fn get_relsize(
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
+    use anyhow::Result;
     use postgres_ffi::RELSEG_SIZE;
 
     use super::*;
@@ -1530,7 +1620,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
+    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
         for i in 14..=16 {
             dispatch_pgversion!(i, {
                 pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;

From c66444ea1538349d13ab5e87bca880394434004b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 11 Apr 2025 16:10:27 +0200
Subject: [PATCH 114/140] Add timeline_import http endpoint (#11484)

The added `timleine_import` endpoint allows us to migrate safekeeper
timelines from control plane managed to storcon managed.

Part of #9011
---
 libs/pageserver_api/src/controller_api.rs     | 12 +++++-
 storage_controller/src/http.rs                | 42 +++++++++++++++++++
 .../src/service/safekeeper_service.rs         | 30 ++++++++++++-
 3 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 3cb62f9d18..91f9c03ba4 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -7,7 +7,8 @@ use std::time::{Duration, Instant};
 /// API (`/control/v1` prefix).  Implemented by the server
 /// in [`storage_controller::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::{NodeId, TenantId, TimelineId};
+use utils::lsn::Lsn;
 
 use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
 use crate::shard::{ShardStripeSize, TenantShardId};
@@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest {
     pub scheduling_policy: SkSchedulingPolicy,
 }
 
+/// Import request for safekeeper timelines.
+#[derive(Serialize, Deserialize, Clone)]
+pub struct TimelineImportRequest {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub start_lsn: Lsn,
+    pub sk_set: Vec<NodeId>,
+}
+
 #[cfg(test)]
 mod test {
     use serde_json;
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 4f3613b687..fb4530d0d2 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -22,6 +22,7 @@ use pageserver_api::controller_api::{
     MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
     NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest,
     ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest,
+    TimelineImportRequest,
 };
 use pageserver_api::models::{
     DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest,
@@ -1286,6 +1287,37 @@ async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiE
     )
 }
 
+async fn handle_timeline_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
+
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let import_req = json_request::<TimelineImportRequest>(&mut req).await?;
+
+    let state = get_state(&req);
+
+    if import_req.tenant_id != tenant_id || import_req.timeline_id != timeline_id {
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "tenant id or timeline id mismatch: url={tenant_id}/{timeline_id}, body={}/{}",
+            import_req.tenant_id,
+            import_req.timeline_id
+        )));
+    }
+
+    json_response(
+        StatusCode::OK,
+        state.service.timeline_import(import_req).await?,
+    )
+}
+
 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
@@ -1959,6 +1991,16 @@ pub fn make_router(
                 RequestName("debug_v1_tenant_locate"),
             )
         })
+        .post(
+            "/debug/v1/tenant/:tenant_id/timeline/:timeline_id/import",
+            |r| {
+                named_request_span(
+                    r,
+                    handle_timeline_import,
+                    RequestName("debug_v1_timeline_import"),
+                )
+            },
+        )
         .get("/debug/v1/scheduler", |r| {
             named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
         })
diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index 099d0305ba..a23b9a4a02 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -12,13 +12,16 @@ use crate::persistence::{
 use crate::safekeeper::Safekeeper;
 use anyhow::Context;
 use http_utils::error::ApiError;
-use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy};
+use pageserver_api::controller_api::{
+    SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest,
+};
 use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo};
 use safekeeper_api::membership::{MemberSet, SafekeeperId};
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::logging::SecretString;
+use utils::lsn::Lsn;
 
 use super::Service;
 
@@ -298,6 +301,31 @@ impl Service {
             timeline_id,
         })
     }
+
+    /// Directly insert the timeline into the database without reconciling it with safekeepers.
+    ///
+    /// Useful if the timeline already exists on the specified safekeepers,
+    /// but we want to make it storage controller managed.
+    pub(crate) async fn timeline_import(&self, req: TimelineImportRequest) -> Result<(), ApiError> {
+        let persistence = TimelinePersistence {
+            tenant_id: req.tenant_id.to_string(),
+            timeline_id: req.timeline_id.to_string(),
+            start_lsn: Lsn::INVALID.into(),
+            generation: 1,
+            sk_set: req.sk_set.iter().map(|sk_id| sk_id.0 as i64).collect(),
+            new_sk_set: None,
+            cplane_notified_generation: 1,
+            deleted_at: None,
+        };
+        let inserted = self.persistence.insert_timeline(persistence).await?;
+        if inserted {
+            tracing::info!("imported timeline into db");
+        } else {
+            tracing::info!("didn't import timeline into db, as it is already present in db");
+        }
+        Ok(())
+    }
+
     /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler.
     pub(super) async fn tenant_timeline_delete_safekeepers(
         self: &Arc<Self>,

From ff5a52716736e3fe8ee808cc7f686f961f7cd34a Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Fri, 11 Apr 2025 10:06:29 -0500
Subject: [PATCH 115/140] Consolidate compute_ctl configuration structures
 (#11514)

Previously, the structure of the spec file was just the compute spec.
However, the response from the control plane get spec request included
the compute spec and the compute_ctl config. This divergence was
hindering other work such as adding regression tests for compute_ctl
HTTP authorization.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute_tools/src/bin/compute_ctl.rs          |  52 ++---
 compute_tools/src/metrics.rs                  |   4 +-
 compute_tools/src/spec.rs                     |  50 ++---
 control_plane/src/endpoint.rs                 | 196 ++++++++++--------
 .../compute_wrapper/shell/compute.sh          |  28 ++-
 .../var/db/postgres/configs/config.json       | 148 +++++++++++++
 .../var/db/postgres/specs/spec.json           | 141 -------------
 docker-compose/docker-compose.yml             |   2 +-
 libs/compute_api/src/responses.rs             |  28 ++-
 libs/compute_api/src/spec.rs                  |  10 +-
 test_runner/fixtures/neon_fixtures.py         |  22 +-
 test_runner/regress/test_compute_catalog.py   | 190 +++++++++--------
 .../regress/test_compute_reconfigure.py       |  20 +-
 .../regress/test_subscriber_branching.py      |  12 +-
 14 files changed, 491 insertions(+), 412 deletions(-)
 create mode 100644 docker-compose/compute_wrapper/var/db/postgres/configs/config.json
 delete mode 100644 docker-compose/compute_wrapper/var/db/postgres/specs/spec.json

diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 4796a07d92..ea8350e2f5 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -29,13 +29,12 @@
 //! ```sh
 //! compute_ctl -D /var/db/postgres/compute \
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
-//!             -S /var/db/postgres/specs/current.json \
+//!             -c /var/db/postgres/configs/config.json \
 //!             -b /usr/local/bin/postgres \
 //!             -r http://pg-ext-s3-gateway \
 //! ```
 use std::ffi::OsString;
 use std::fs::File;
-use std::path::Path;
 use std::process::exit;
 use std::sync::mpsc;
 use std::thread;
@@ -43,8 +42,7 @@ use std::time::Duration;
 
 use anyhow::{Context, Result};
 use clap::Parser;
-use compute_api::responses::ComputeCtlConfig;
-use compute_api::spec::ComputeSpec;
+use compute_api::responses::ComputeConfig;
 use compute_tools::compute::{
     BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal,
 };
@@ -118,8 +116,10 @@ struct Cli {
     #[arg(long)]
     pub set_disk_quota_for_fs: Option<String>,
 
-    #[arg(short = 'S', long, group = "spec-path")]
-    pub spec_path: Option<OsString>,
+    // TODO(tristan957): remove alias after compatibility tests are no longer
+    // an issue
+    #[arg(short = 'c', long, alias = "spec-path")]
+    pub config: Option<OsString>,
 
     #[arg(short = 'i', long, group = "compute-id")]
     pub compute_id: String,
@@ -127,8 +127,9 @@ struct Cli {
     #[arg(
         short = 'p',
         long,
-        conflicts_with = "spec-path",
-        value_name = "CONTROL_PLANE_API_BASE_URL"
+        conflicts_with = "config",
+        value_name = "CONTROL_PLANE_API_BASE_URL",
+        requires = "compute-id"
     )]
     pub control_plane_uri: Option<String>,
 }
@@ -154,7 +155,7 @@ fn main() -> Result<()> {
 
     let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
 
-    let cli_spec = try_spec_from_cli(&cli)?;
+    let cli_spec = get_config(&cli)?;
 
     let compute_node = ComputeNode::new(
         ComputeNodeParams {
@@ -201,27 +202,17 @@ async fn init() -> Result<()> {
     Ok(())
 }
 
-fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
-    // First, read spec from the path if provided
-    if let Some(ref spec_path) = cli.spec_path {
-        let file = File::open(Path::new(spec_path))?;
-        return Ok(CliSpecParams {
-            spec: Some(serde_json::from_reader(file)?),
-            compute_ctl_config: ComputeCtlConfig::default(),
-        });
+fn get_config(cli: &Cli) -> Result<ComputeConfig> {
+    // First, read the config from the path if provided
+    if let Some(ref config) = cli.config {
+        let file = File::open(config)?;
+        return Ok(serde_json::from_reader(&file)?);
     }
 
-    if cli.control_plane_uri.is_none() {
-        panic!("must specify --control-plane-uri");
-    };
-
-    // If the spec wasn't provided in the CLI arguments, then retrieve it from
+    // If the config wasn't provided in the CLI arguments, then retrieve it from
     // the control plane
-    match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
-        Ok(resp) => Ok(CliSpecParams {
-            spec: resp.0,
-            compute_ctl_config: resp.1,
-        }),
+    match get_config_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
+        Ok(config) => Ok(config),
         Err(e) => {
             error!(
                 "cannot get response from control plane: {}\n\
@@ -233,13 +224,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
     }
 }
 
-struct CliSpecParams {
-    /// If a spec was provided via CLI or file, the [`ComputeSpec`]
-    spec: Option<ComputeSpec>,
-    #[allow(dead_code)]
-    compute_ctl_config: ComputeCtlConfig,
-}
-
 fn deinit_and_exit(exit_code: Option<i32>) -> ! {
     // Shutdown trace pipeline gracefully, so that it has a chance to send any
     // pending traces before we exit. Shutting down OTEL tracing provider may
diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs
index 52f1795703..fa00476fd2 100644
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -19,13 +19,13 @@ pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
 // but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec.
 // And it's fair to call it a 'RPC' (Remote Procedure Call).
 pub enum CPlaneRequestRPC {
-    GetSpec,
+    GetConfig,
 }
 
 impl CPlaneRequestRPC {
     pub fn as_str(&self) -> &str {
         match self {
-            CPlaneRequestRPC::GetSpec => "GetSpec",
+            CPlaneRequestRPC::GetConfig => "GetConfig",
         }
     }
 }
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index a76af21e9f..4b38e6e29c 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -3,9 +3,8 @@ use std::path::Path;
 
 use anyhow::{Result, anyhow, bail};
 use compute_api::responses::{
-    ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
+    ComputeConfig, ControlPlaneComputeStatus, ControlPlaneConfigResponse,
 };
-use compute_api::spec::ComputeSpec;
 use reqwest::StatusCode;
 use tokio_postgres::Client;
 use tracing::{error, info, instrument};
@@ -21,7 +20,7 @@ use crate::params::PG_HBA_ALL_MD5;
 fn do_control_plane_request(
     uri: &str,
     jwt: &str,
-) -> Result<ControlPlaneSpecResponse, (bool, String, String)> {
+) -> Result<ControlPlaneConfigResponse, (bool, String, String)> {
     let resp = reqwest::blocking::Client::new()
         .get(uri)
         .header("Authorization", format!("Bearer {}", jwt))
@@ -29,14 +28,14 @@ fn do_control_plane_request(
         .map_err(|e| {
             (
                 true,
-                format!("could not perform spec request to control plane: {:?}", e),
+                format!("could not perform request to control plane: {:?}", e),
                 UNKNOWN_HTTP_STATUS.to_string(),
             )
         })?;
 
     let status = resp.status();
     match status {
-        StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
+        StatusCode::OK => match resp.json::<ControlPlaneConfigResponse>() {
             Ok(spec_resp) => Ok(spec_resp),
             Err(e) => Err((
                 true,
@@ -69,40 +68,35 @@ fn do_control_plane_request(
     }
 }
 
-/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
-/// env variable is set, it will be used for authorization.
-pub fn get_spec_from_control_plane(
-    base_uri: &str,
-    compute_id: &str,
-) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
+/// Request config from the control-plane by compute_id. If
+/// `NEON_CONTROL_PLANE_TOKEN` env variable is set, it will be used for
+/// authorization.
+pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeConfig> {
     let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
-    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
-        Ok(v) => v,
-        Err(_) => "".to_string(),
-    };
+    let jwt: String = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default();
     let mut attempt = 1;
 
-    info!("getting spec from control plane: {}", cp_uri);
+    info!("getting config from control plane: {}", cp_uri);
 
     // Do 3 attempts to get spec from the control plane using the following logic:
     // - network error -> then retry
     // - compute id is unknown or any other error -> bail out
     // - no spec for compute yet (Empty state) -> return Ok(None)
-    // - got spec -> return Ok(Some(spec))
+    // - got config -> return Ok(Some(config))
     while attempt < 4 {
         let result = match do_control_plane_request(&cp_uri, &jwt) {
-            Ok(spec_resp) => {
+            Ok(config_resp) => {
                 CPLANE_REQUESTS_TOTAL
                     .with_label_values(&[
-                        CPlaneRequestRPC::GetSpec.as_str(),
+                        CPlaneRequestRPC::GetConfig.as_str(),
                         &StatusCode::OK.to_string(),
                     ])
                     .inc();
-                match spec_resp.status {
-                    ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
+                match config_resp.status {
+                    ControlPlaneComputeStatus::Empty => Ok(config_resp.into()),
                     ControlPlaneComputeStatus::Attached => {
-                        if let Some(spec) = spec_resp.spec {
-                            Ok((Some(spec), spec_resp.compute_ctl_config))
+                        if config_resp.spec.is_some() {
+                            Ok(config_resp.into())
                         } else {
                             bail!("compute is attached, but spec is empty")
                         }
@@ -111,7 +105,7 @@ pub fn get_spec_from_control_plane(
             }
             Err((retry, msg, status)) => {
                 CPLANE_REQUESTS_TOTAL
-                    .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status])
+                    .with_label_values(&[CPlaneRequestRPC::GetConfig.as_str(), &status])
                     .inc();
                 if retry {
                     Err(anyhow!(msg))
@@ -122,7 +116,7 @@ pub fn get_spec_from_control_plane(
         };
 
         if let Err(e) = &result {
-            error!("attempt {} to get spec failed with: {}", attempt, e);
+            error!("attempt {} to get config failed with: {}", attempt, e);
         } else {
             return result;
         }
@@ -133,13 +127,13 @@ pub fn get_spec_from_control_plane(
 
     // All attempts failed, return error.
     Err(anyhow::anyhow!(
-        "Exhausted all attempts to retrieve the spec from the control plane"
+        "Exhausted all attempts to retrieve the config from the control plane"
     ))
 }
 
 /// Check `pg_hba.conf` and update if needed to allow external connections.
 pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
-    // XXX: consider making it a part of spec.json
+    // XXX: consider making it a part of config.json
     let pghba_path = pgdata_path.join("pg_hba.conf");
 
     if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? {
@@ -153,7 +147,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
 
 /// Create a standby.signal file
 pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
-    // XXX: consider making it a part of spec.json
+    // XXX: consider making it a part of config.json
     let signalfile = pgdata_path.join("standby.signal");
 
     if !signalfile.exists() {
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 663c024953..2fa7a62f8f 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -29,7 +29,7 @@
 //!     compute.log               - log output of `compute_ctl` and `postgres`
 //!     endpoint.json             - serialized `EndpointConf` struct
 //!     postgresql.conf           - postgresql settings
-//!     spec.json                 - passed to `compute_ctl`
+//!     config.json                 - passed to `compute_ctl`
 //!     pgdata/
 //!         postgresql.conf       - copy of postgresql.conf created by `compute_ctl`
 //!         zenith.signal
@@ -46,7 +46,9 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 
 use anyhow::{Context, Result, anyhow, bail};
 use compute_api::requests::ConfigurationRequest;
-use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
+use compute_api::responses::{
+    ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse,
+};
 use compute_api::spec::{
     Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
     RemoteExtSpec, Role,
@@ -619,90 +621,101 @@ impl Endpoint {
             remote_extensions = None;
         };
 
-        // Create spec file
-        let mut spec = ComputeSpec {
-            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
-            format_version: 1.0,
-            operation_uuid: None,
-            features: self.features.clone(),
-            swap_size_bytes: None,
-            disk_quota_bytes: None,
-            disable_lfc_resizing: None,
-            cluster: Cluster {
-                cluster_id: None, // project ID: not used
-                name: None,       // project name: not used
-                state: None,
-                roles: if create_test_user {
-                    vec![Role {
+        // Create config file
+        let config = {
+            let mut spec = ComputeSpec {
+                skip_pg_catalog_updates: self.skip_pg_catalog_updates,
+                format_version: 1.0,
+                operation_uuid: None,
+                features: self.features.clone(),
+                swap_size_bytes: None,
+                disk_quota_bytes: None,
+                disable_lfc_resizing: None,
+                cluster: Cluster {
+                    cluster_id: None, // project ID: not used
+                    name: None,       // project name: not used
+                    state: None,
+                    roles: if create_test_user {
+                        vec![Role {
+                            name: PgIdent::from_str("test").unwrap(),
+                            encrypted_password: None,
+                            options: None,
+                        }]
+                    } else {
+                        Vec::new()
+                    },
+                    databases: if create_test_user {
+                        vec![Database {
+                            name: PgIdent::from_str("neondb").unwrap(),
+                            owner: PgIdent::from_str("test").unwrap(),
+                            options: None,
+                            restrict_conn: false,
+                            invalid: false,
+                        }]
+                    } else {
+                        Vec::new()
+                    },
+                    settings: None,
+                    postgresql_conf: Some(postgresql_conf.clone()),
+                },
+                delta_operations: None,
+                tenant_id: Some(self.tenant_id),
+                timeline_id: Some(self.timeline_id),
+                project_id: None,
+                branch_id: None,
+                endpoint_id: Some(self.endpoint_id.clone()),
+                mode: self.mode,
+                pageserver_connstring: Some(pageserver_connstring),
+                safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
+                safekeeper_connstrings,
+                storage_auth_token: auth_token.clone(),
+                remote_extensions,
+                pgbouncer_settings: None,
+                shard_stripe_size: Some(shard_stripe_size),
+                local_proxy_config: None,
+                reconfigure_concurrency: self.reconfigure_concurrency,
+                drop_subscriptions_before_start: self.drop_subscriptions_before_start,
+                audit_log_level: ComputeAudit::Disabled,
+                logs_export_host: None::<String>,
+            };
+
+            // this strange code is needed to support respec() in tests
+            if self.cluster.is_some() {
+                debug!("Cluster is already set in the endpoint spec, using it");
+                spec.cluster = self.cluster.clone().unwrap();
+
+                debug!("spec.cluster {:?}", spec.cluster);
+
+                // fill missing fields again
+                if create_test_user {
+                    spec.cluster.roles.push(Role {
                         name: PgIdent::from_str("test").unwrap(),
                         encrypted_password: None,
                         options: None,
-                    }]
-                } else {
-                    Vec::new()
-                },
-                databases: if create_test_user {
-                    vec![Database {
+                    });
+                    spec.cluster.databases.push(Database {
                         name: PgIdent::from_str("neondb").unwrap(),
                         owner: PgIdent::from_str("test").unwrap(),
                         options: None,
                         restrict_conn: false,
                         invalid: false,
-                    }]
-                } else {
-                    Vec::new()
-                },
-                settings: None,
-                postgresql_conf: Some(postgresql_conf.clone()),
-            },
-            delta_operations: None,
-            tenant_id: Some(self.tenant_id),
-            timeline_id: Some(self.timeline_id),
-            project_id: None,
-            branch_id: None,
-            endpoint_id: Some(self.endpoint_id.clone()),
-            mode: self.mode,
-            pageserver_connstring: Some(pageserver_connstring),
-            safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
-            safekeeper_connstrings,
-            storage_auth_token: auth_token.clone(),
-            remote_extensions,
-            pgbouncer_settings: None,
-            shard_stripe_size: Some(shard_stripe_size),
-            local_proxy_config: None,
-            reconfigure_concurrency: self.reconfigure_concurrency,
-            drop_subscriptions_before_start: self.drop_subscriptions_before_start,
-            audit_log_level: ComputeAudit::Disabled,
-            logs_export_host: None::<String>,
+                    });
+                }
+                spec.cluster.postgresql_conf = Some(postgresql_conf);
+            }
+
+            ComputeConfig {
+                spec: Some(spec),
+                compute_ctl_config: ComputeCtlConfig::default(),
+            }
         };
 
-        // this strange code is needed to support respec() in tests
-        if self.cluster.is_some() {
-            debug!("Cluster is already set in the endpoint spec, using it");
-            spec.cluster = self.cluster.clone().unwrap();
-
-            debug!("spec.cluster {:?}", spec.cluster);
-
-            // fill missing fields again
-            if create_test_user {
-                spec.cluster.roles.push(Role {
-                    name: PgIdent::from_str("test").unwrap(),
-                    encrypted_password: None,
-                    options: None,
-                });
-                spec.cluster.databases.push(Database {
-                    name: PgIdent::from_str("neondb").unwrap(),
-                    owner: PgIdent::from_str("test").unwrap(),
-                    options: None,
-                    restrict_conn: false,
-                    invalid: false,
-                });
-            }
-            spec.cluster.postgresql_conf = Some(postgresql_conf);
-        }
-
+        // TODO(tristan957): Remove the write to spec.json after compatibility
+        // tests work themselves out
         let spec_path = self.endpoint_path().join("spec.json");
-        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
+        std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?;
+        let config_path = self.endpoint_path().join("config.json");
+        std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;
 
         // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
         let logfile = std::fs::OpenOptions::new()
@@ -710,6 +723,16 @@ impl Endpoint {
             .append(true)
             .open(self.endpoint_path().join("compute.log"))?;
 
+        // TODO(tristan957): Remove when compatibility tests are no longer an
+        // issue
+        let old_compute_ctl = {
+            let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
+            let help_output = cmd.arg("--help").output()?;
+            let help_output = String::from_utf8_lossy(&help_output.stdout);
+
+            !help_output.contains("--config")
+        };
+
         // Launch compute_ctl
         let conn_str = self.connstr("cloud_admin", "postgres");
         println!("Starting postgres node at '{}'", conn_str);
@@ -728,9 +751,18 @@ impl Endpoint {
         ])
         .args(["--pgdata", self.pgdata().to_str().unwrap()])
         .args(["--connstr", &conn_str])
+        // TODO(tristan957): Change this to --config when compatibility tests
+        // are no longer an issue
         .args([
             "--spec-path",
-            self.endpoint_path().join("spec.json").to_str().unwrap(),
+            self.endpoint_path()
+                .join(if old_compute_ctl {
+                    "spec.json"
+                } else {
+                    "config.json"
+                })
+                .to_str()
+                .unwrap(),
         ])
         .args([
             "--pgbin",
@@ -873,10 +905,12 @@ impl Endpoint {
         stripe_size: Option<ShardStripeSize>,
         safekeepers: Option<Vec<NodeId>>,
     ) -> Result<()> {
-        let mut spec: ComputeSpec = {
-            let spec_path = self.endpoint_path().join("spec.json");
-            let file = std::fs::File::open(spec_path)?;
-            serde_json::from_reader(file)?
+        let (mut spec, compute_ctl_config) = {
+            let config_path = self.endpoint_path().join("config.json");
+            let file = std::fs::File::open(config_path)?;
+            let config: ComputeConfig = serde_json::from_reader(file)?;
+
+            (config.spec.unwrap(), config.compute_ctl_config)
         };
 
         let postgresql_conf = self.read_postgresql_conf()?;
@@ -926,7 +960,7 @@ impl Endpoint {
             .body(
                 serde_json::to_string(&ConfigurationRequest {
                     spec,
-                    compute_ctl_config: ComputeCtlConfig::default(),
+                    compute_ctl_config,
                 })
                 .unwrap(),
             )
diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh
index 418aaf876d..9409e9d055 100755
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -11,8 +11,8 @@ generate_id() {
 
 PG_VERSION=${PG_VERSION:-14}
 
-SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
-SPEC_FILE=/tmp/spec.json
+CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
+CONFIG_FILE=/tmp/config.json
 
 echo "Waiting pageserver become ready."
 while ! nc -z pageserver 6400; do
@@ -20,7 +20,7 @@ while ! nc -z pageserver 6400; do
 done
 echo "Page server is ready."
 
-cp ${SPEC_FILE_ORG} ${SPEC_FILE}
+cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}
 
  if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
    tenant_id=${TENANT_ID}
@@ -73,17 +73,27 @@ else
   ulid_extension=ulid
 fi
 echo "Adding pgx_ulid"
-shared_libraries=$(jq -r '.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${SPEC_FILE})
-sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${SPEC_FILE}
+shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
+sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
 echo "Overwrite tenant id and timeline id in spec file"
-sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE}
-sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
+sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
+sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}
 
-cat ${SPEC_FILE}
+cat ${CONFIG_FILE}
+
+# TODO(tristan957): Remove these workarounds for backwards compatibility after
+# the next compute release. That includes these next few lines and the
+# --spec-path in the compute_ctl invocation.
+if compute_ctl --help | grep --quiet -- '--config'; then
+  SPEC_PATH="$CONFIG_FILE"
+else
+  jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json
+  SPEC_PATH=/tmp/spec.json
+fi
 
 echo "Start compute node"
 /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
      -C "postgresql://cloud_admin@localhost:55433/postgres"  \
      -b /usr/local/bin/postgres                              \
      --compute-id "compute-$RANDOM"                          \
-     -S ${SPEC_FILE}
+     --spec-path "$SPEC_PATH"
diff --git a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
new file mode 100644
index 0000000000..3ddf96512a
--- /dev/null
+++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
@@ -0,0 +1,148 @@
+{
+    "spec": {
+        "format_version": 1.0,
+
+        "timestamp": "2022-10-12T18:00:00.000Z",
+        "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
+
+        "cluster": {
+            "cluster_id": "docker_compose",
+            "name": "docker_compose_test",
+            "state": "restarted",
+            "roles": [
+                {
+                    "name": "cloud_admin",
+                    "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
+                    "options": null
+                }
+            ],
+            "databases": [
+            ],
+            "settings": [
+                {
+                    "name": "fsync",
+                    "value": "off",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "wal_level",
+                    "value": "logical",
+                    "vartype": "enum"
+                },
+                {
+                    "name": "wal_log_hints",
+                    "value": "on",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "log_connections",
+                    "value": "on",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "port",
+                    "value": "55433",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "shared_buffers",
+                    "value": "1MB",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_connections",
+                    "value": "100",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "listen_addresses",
+                    "value": "0.0.0.0",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_wal_senders",
+                    "value": "10",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "max_replication_slots",
+                    "value": "10",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "wal_sender_timeout",
+                    "value": "5s",
+                    "vartype": "string"
+                },
+                {
+                    "name": "wal_keep_size",
+                    "value": "0",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "password_encryption",
+                    "value": "md5",
+                    "vartype": "enum"
+                },
+                {
+                    "name": "restart_after_crash",
+                    "value": "off",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "synchronous_standby_names",
+                    "value": "walproposer",
+                    "vartype": "string"
+                },
+                {
+                    "name": "shared_preload_libraries",
+                    "value": "neon,pg_cron,timescaledb,pg_stat_statements",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.safekeepers",
+                    "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.timeline_id",
+                    "value": "TIMELINE_ID",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.tenant_id",
+                    "value": "TENANT_ID",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.pageserver_connstring",
+                    "value": "host=pageserver port=6400",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_replication_write_lag",
+                    "value": "500MB",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_replication_flush_lag",
+                    "value": "10GB",
+                    "vartype": "string"
+                },
+                {
+                    "name": "cron.database",
+                    "value": "postgres",
+                    "vartype": "string"
+                }
+            ]
+        },
+
+        "delta_operations": [
+        ]
+    },
+    "compute_ctl_config": {
+        "jwks": {
+            "keys": []
+        }
+    }
+}
diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
deleted file mode 100644
index 0308cab451..0000000000
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ /dev/null
@@ -1,141 +0,0 @@
-{
-    "format_version": 1.0,
-
-    "timestamp": "2022-10-12T18:00:00.000Z",
-    "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
-
-    "cluster": {
-        "cluster_id": "docker_compose",
-        "name": "docker_compose_test",
-        "state": "restarted",
-        "roles": [
-            {
-                "name": "cloud_admin",
-                "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
-                "options": null
-            }
-        ],
-        "databases": [
-        ],
-        "settings": [
-            {
-                "name": "fsync",
-                "value": "off",
-                "vartype": "bool"
-            },
-            {
-                "name": "wal_level",
-                "value": "logical",
-                "vartype": "enum"
-            },
-            {
-                "name": "wal_log_hints",
-                "value": "on",
-                "vartype": "bool"
-            },
-            {
-                "name": "log_connections",
-                "value": "on",
-                "vartype": "bool"
-            },
-            {
-                "name": "port",
-                "value": "55433",
-                "vartype": "integer"
-            },
-            {
-                "name": "shared_buffers",
-                "value": "1MB",
-                "vartype": "string"
-            },
-            {
-                "name": "max_connections",
-                "value": "100",
-                "vartype": "integer"
-            },
-            {
-                "name": "listen_addresses",
-                "value": "0.0.0.0",
-                "vartype": "string"
-            },
-            {
-                "name": "max_wal_senders",
-                "value": "10",
-                "vartype": "integer"
-            },
-            {
-                "name": "max_replication_slots",
-                "value": "10",
-                "vartype": "integer"
-            },
-            {
-                "name": "wal_sender_timeout",
-                "value": "5s",
-                "vartype": "string"
-            },
-            {
-                "name": "wal_keep_size",
-                "value": "0",
-                "vartype": "integer"
-            },
-            {
-                "name": "password_encryption",
-                "value": "md5",
-                "vartype": "enum"
-            },
-            {
-                "name": "restart_after_crash",
-                "value": "off",
-                "vartype": "bool"
-            },
-            {
-                "name": "synchronous_standby_names",
-                "value": "walproposer",
-                "vartype": "string"
-            },
-            {
-                "name": "shared_preload_libraries",
-                "value": "neon,pg_cron,timescaledb,pg_stat_statements",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.safekeepers",
-                "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.timeline_id",
-                "value": "TIMELINE_ID",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.tenant_id",
-                "value": "TENANT_ID",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.pageserver_connstring",
-                "value": "host=pageserver port=6400",
-                "vartype": "string"
-            },
-            {
-                "name": "max_replication_write_lag",
-                "value": "500MB",
-                "vartype": "string"
-            },
-            {
-                "name": "max_replication_flush_lag",
-                "value": "10GB",
-                "vartype": "string"
-            },
-            {
-                "name": "cron.database",
-                "value": "postgres",
-                "vartype": "string"
-            }
-        ]
-    },
-
-    "delta_operations": [
-    ]
-}
diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml
index 493a0a5523..fd3ad1fffc 100644
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -159,7 +159,7 @@ services:
       #- RUST_BACKTRACE=1
     # Mount the test files directly, for faster editing cycle.
     volumes:
-      - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
+      - ./compute_wrapper/var/db/postgres/configs/:/var/db/postgres/configs/
       - ./compute_wrapper/shell/:/shell/
     ports:
       - 55433:55433 # pg protocol handler
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index c8f6019c5c..353949736b 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -14,6 +14,32 @@ pub struct GenericAPIError {
     pub error: String,
 }
 
+/// All configuration parameters necessary for a compute. When
+/// [`ComputeConfig::spec`] is provided, it means that the compute is attached
+/// to a tenant. [`ComputeConfig::compute_ctl_config`] will always be provided
+/// and contains parameters necessary for operating `compute_ctl` independently
+/// of whether a tenant is attached to the compute or not.
+///
+/// This also happens to be the body of `compute_ctl`'s /configure request.
+#[derive(Debug, Deserialize, Serialize)]
+pub struct ComputeConfig {
+    /// The compute spec
+    pub spec: Option<ComputeSpec>,
+
+    /// The compute_ctl configuration
+    #[allow(dead_code)]
+    pub compute_ctl_config: ComputeCtlConfig,
+}
+
+impl From<ControlPlaneConfigResponse> for ComputeConfig {
+    fn from(value: ControlPlaneConfigResponse) -> Self {
+        Self {
+            spec: value.spec,
+            compute_ctl_config: value.compute_ctl_config,
+        }
+    }
+}
+
 #[derive(Debug, Clone, Serialize)]
 pub struct ExtensionInstallResponse {
     pub extension: PgIdent,
@@ -161,7 +187,7 @@ pub struct TlsConfig {
 
 /// Response of the `/computes/{compute_id}/spec` control-plane API.
 #[derive(Deserialize, Debug)]
-pub struct ControlPlaneSpecResponse {
+pub struct ControlPlaneConfigResponse {
     pub spec: Option<ComputeSpec>,
     pub status: ControlPlaneComputeStatus,
     pub compute_ctl_config: ComputeCtlConfig,
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 82950bcbaa..5e67ccce00 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -1,8 +1,8 @@
-//! `ComputeSpec` represents the contents of the spec.json file.
-//!
-//! The spec.json file is used to pass information to 'compute_ctl'. It contains
-//! all the information needed to start up the right version of PostgreSQL,
-//! and connect it to the storage nodes.
+//! The ComputeSpec contains all the information needed to start up
+//! the right version of PostgreSQL, and connect it to the storage nodes.
+//! It can be passed as part of the `config.json`, or the control plane can
+//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
+//! compute_ctl can fetch it by calling the control plane's API.
 use std::collections::HashMap;
 
 use indexmap::IndexMap;
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 9d4068b583..ba2101e427 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4302,10 +4302,10 @@ class Endpoint(PgProtocol, LogUtils):
 
     def respec_deep(self, **kwargs: Any) -> None:
         """
-        Update the spec.json file taking into account nested keys.
-        Distinct method from respec() to not break existing functionality.
-        NOTE: This method also updates the spec.json file, not endpoint.json.
-        We need it because neon_local also writes to spec.json, so intended
+        Update the endpoint.json file taking into account nested keys.
+        Distinct method from respec() to do not break existing functionality.
+        NOTE: This method also updates the config.json file, not endpoint.json.
+        We need it because neon_local also writes to config.json, so intended
         use-case is i) start endpoint with some config, ii) respec_deep(),
         iii) call reconfigure() to apply the changes.
         """
@@ -4318,17 +4318,17 @@ class Endpoint(PgProtocol, LogUtils):
                     curr[k] = v
             return curr
 
-        config_path = os.path.join(self.endpoint_path(), "spec.json")
+        config_path = os.path.join(self.endpoint_path(), "config.json")
         with open(config_path) as f:
-            data_dict: dict[str, Any] = json.load(f)
+            config: dict[str, Any] = json.load(f)
 
-        log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4))
+        log.debug("Current compute config: %s", json.dumps(config, indent=4))
 
-        update(data_dict, kwargs)
+        update(config, kwargs)
 
         with open(config_path, "w") as file:
-            log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4))
-            json.dump(data_dict, file, indent=4)
+            log.debug("Updating compute config to: %s", json.dumps(config, indent=4))
+            json.dump(config, file, indent=4)
 
     def wait_for_migrations(self, wait_for: int = NUM_COMPUTE_MIGRATIONS) -> None:
         """
@@ -4345,7 +4345,7 @@ class Endpoint(PgProtocol, LogUtils):
             wait_until(check_migrations_done)
 
     # Mock the extension part of spec passed from control plane for local testing
-    # endpooint.rs adds content of this file as a part of the spec.json
+    # endpooint.rs adds content of this file as a part of the config.json
     def create_remote_extension_spec(self, spec: dict[str, Any]):
         """Create a remote extension spec file for the endpoint."""
         remote_extensions_spec_path = os.path.join(
diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py
index c1f05830b7..37208c9fff 100644
--- a/test_runner/regress/test_compute_catalog.py
+++ b/test_runner/regress/test_compute_catalog.py
@@ -90,10 +90,12 @@ def test_compute_catalog(neon_simple_env: NeonEnv):
     # and reconfigure the endpoint to create some test databases.
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "roles": TEST_ROLE_NAMES,
-                "databases": TEST_DB_NAMES,
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "roles": TEST_ROLE_NAMES,
+                    "databases": TEST_DB_NAMES,
+                },
             },
         }
     )
@@ -155,10 +157,12 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
     # and reconfigure the endpoint to apply the changes.
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "roles": TEST_ROLE_NAMES,
-                "databases": TEST_DB_NAMES,
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "roles": TEST_ROLE_NAMES,
+                    "databases": TEST_DB_NAMES,
+                },
             },
         }
     )
@@ -196,12 +200,14 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
 
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "roles": [],
-                "databases": [],
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "roles": [],
+                    "databases": [],
+                },
+                "delta_operations": delta_operations,
             },
-            "delta_operations": delta_operations,
         }
     )
     endpoint.reconfigure()
@@ -250,9 +256,11 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
     # and reconfigure the endpoint to apply the changes.
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "databases": TEST_DB_NAMES,
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "databases": TEST_DB_NAMES,
+                },
             },
         }
     )
@@ -306,17 +314,19 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
     # and reconfigure the endpoint to apply the changes.
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "databases": TEST_DB_NAMES_NEW,
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "databases": TEST_DB_NAMES_NEW,
+                },
+                "delta_operations": [
+                    {"action": "delete_db", "name": SUB_DB_NAME},
+                    # also test the case when we try to delete a non-existent database
+                    # shouldn't happen in normal operation,
+                    # but can occur when failed operations are retried
+                    {"action": "delete_db", "name": "nonexistent_db"},
+                ],
             },
-            "delta_operations": [
-                {"action": "delete_db", "name": SUB_DB_NAME},
-                # also test the case when we try to delete a non-existent database
-                # shouldn't happen in normal operation,
-                # but can occur when failed operations are retried
-                {"action": "delete_db", "name": "nonexistent_db"},
-            ],
         }
     )
 
@@ -354,25 +364,27 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne
 
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "roles": [
-                    {
-                        # We need to create role via compute_ctl, because in this case it will receive
-                        # additional grants equivalent to our real environment, so we can repro some
-                        # issues.
-                        "name": "neon",
-                        # Some autocomplete-suggested hash, no specific meaning.
-                        "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=",
-                        "options": [],
-                    },
-                ],
-                "databases": [
-                    {
-                        "name": TEST_DB_NAME,
-                        "owner": "neon",
-                    },
-                ],
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "roles": [
+                        {
+                            # We need to create role via compute_ctl, because in this case it will receive
+                            # additional grants equivalent to our real environment, so we can repro some
+                            # issues.
+                            "name": "neon",
+                            # Some autocomplete-suggested hash, no specific meaning.
+                            "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=",
+                            "options": [],
+                        },
+                    ],
+                    "databases": [
+                        {
+                            "name": TEST_DB_NAME,
+                            "owner": "neon",
+                        },
+                    ],
+                },
             },
         }
     )
@@ -415,13 +427,15 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne
     # Drop role via compute_ctl
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "delta_operations": [
-                {
-                    "action": "delete_role",
-                    "name": TEST_GRANTEE,
-                },
-            ],
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "delta_operations": [
+                    {
+                        "action": "delete_role",
+                        "name": TEST_GRANTEE,
+                    },
+                ],
+            },
         }
     )
     endpoint.reconfigure()
@@ -444,13 +458,15 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne
 
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "delta_operations": [
-                {
-                    "action": "delete_role",
-                    "name": "readonly2",
-                },
-            ],
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "delta_operations": [
+                    {
+                        "action": "delete_role",
+                        "name": "readonly2",
+                    },
+                ],
+            },
         }
     )
     endpoint.reconfigure()
@@ -475,25 +491,27 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env
     endpoint = env.endpoints.create_start("main")
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "roles": [
-                    {
-                        # We need to create role via compute_ctl, because in this case it will receive
-                        # additional grants equivalent to our real environment, so we can repro some
-                        # issues.
-                        "name": TEST_GRANTOR,
-                        # Some autocomplete-suggested hash, no specific meaning.
-                        "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=",
-                        "options": [],
-                    },
-                ],
-                "databases": [
-                    {
-                        "name": TEST_DB_NAME,
-                        "owner": TEST_GRANTOR,
-                    },
-                ],
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "roles": [
+                        {
+                            # We need to create role via compute_ctl, because in this case it will receive
+                            # additional grants equivalent to our real environment, so we can repro some
+                            # issues.
+                            "name": TEST_GRANTOR,
+                            # Some autocomplete-suggested hash, no specific meaning.
+                            "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=",
+                            "options": [],
+                        },
+                    ],
+                    "databases": [
+                        {
+                            "name": TEST_DB_NAME,
+                            "owner": TEST_GRANTOR,
+                        },
+                    ],
+                },
             },
         }
     )
@@ -507,13 +525,15 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env
 
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "delta_operations": [
-                {
-                    "action": "delete_role",
-                    "name": TEST_GRANTEE,
-                },
-            ],
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "delta_operations": [
+                    {
+                        "action": "delete_role",
+                        "name": TEST_GRANTEE,
+                    },
+                ],
+            },
         }
     )
     endpoint.reconfigure()
diff --git a/test_runner/regress/test_compute_reconfigure.py b/test_runner/regress/test_compute_reconfigure.py
index 6396ba67a1..b533d45b1e 100644
--- a/test_runner/regress/test_compute_reconfigure.py
+++ b/test_runner/regress/test_compute_reconfigure.py
@@ -31,15 +31,17 @@ def test_compute_reconfigure(neon_simple_env: NeonEnv):
 
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": True,
-            "cluster": {
-                "settings": [
-                    {
-                        "name": "log_line_prefix",
-                        "vartype": "string",
-                        "value": TEST_LOG_LINE_PREFIX,
-                    }
-                ]
+            "spec": {
+                "skip_pg_catalog_updates": True,
+                "cluster": {
+                    "settings": [
+                        {
+                            "name": "log_line_prefix",
+                            "vartype": "string",
+                            "value": TEST_LOG_LINE_PREFIX,
+                        }
+                    ]
+                },
             },
         }
     )
diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py
index 6175643389..83bebc19be 100644
--- a/test_runner/regress/test_subscriber_branching.py
+++ b/test_runner/regress/test_subscriber_branching.py
@@ -251,7 +251,7 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv):
     NUMBER_OF_DBS = 5
 
     # Create and start endpoint so that neon_local put all the generated
-    # stuff into the spec.json file.
+    # stuff into the config.json file.
     endpoint = env.endpoints.create_start(
         "main",
         config_lines=[
@@ -280,13 +280,15 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv):
             }
         )
 
-    # Update the spec.json file to create the databases
+    # Update the config.json file to create the databases
     # and reconfigure the endpoint to apply the changes.
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "databases": TEST_DB_NAMES,
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "databases": TEST_DB_NAMES,
+                },
             },
         }
     )

From fd16caa7d08be31962cb29fb8eeefb65cb114464 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Fri, 11 Apr 2025 17:09:28 +0200
Subject: [PATCH 116/140] pageserver: yield for L0 during ancestor compaction
 (#11536)

## Problem

Shard ancestor compaction does not yield for L0 compaction, potentially
starving it.

close https://github.com/neondatabase/neon/issues/11125

## Summary of changes

* Yield for L0 during shard ancestor compaction.
* Return `CompactionOutcome::Pending` when limited by `rewrite_max`, for
eager rescheduling.
---
 pageserver/src/tenant/timeline/compaction.rs | 66 +++++++++++++++-----
 1 file changed, 50 insertions(+), 16 deletions(-)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index c6f0e32494..a559c7fdec 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1119,7 +1119,17 @@ impl Timeline {
             // being potentially much longer.
             let rewrite_max = partition_count;
 
-            self.compact_shard_ancestors(rewrite_max, ctx).await?;
+            let outcome = self
+                .compact_shard_ancestors(
+                    rewrite_max,
+                    options.flags.contains(CompactFlags::YieldForL0),
+                    ctx,
+                )
+                .await?;
+            match outcome {
+                CompactionOutcome::Pending | CompactionOutcome::YieldForL0 => return Ok(outcome),
+                CompactionOutcome::Done | CompactionOutcome::Skipped => {}
+            }
         }
 
         Ok(CompactionOutcome::Done)
@@ -1136,11 +1146,12 @@ impl Timeline {
     async fn compact_shard_ancestors(
         self: &Arc<Self>,
         rewrite_max: usize,
+        yield_for_l0: bool,
         ctx: &RequestContext,
-    ) -> Result<(), CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
+        let mut outcome = CompactionOutcome::Done;
         let mut drop_layers = Vec::new();
         let mut layers_to_rewrite: Vec<Layer> = Vec::new();
-        let mut rewrite_max_exceeded: bool = false;
 
         // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
         // layer is behind this Lsn, it indicates that the layer is being retained beyond the
@@ -1233,8 +1244,8 @@ impl Timeline {
                 debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
                     layers_to_rewrite.len()
                 );
-                rewrite_max_exceeded = true;
-                continue;
+                outcome = CompactionOutcome::Pending;
+                break;
             }
 
             // Fall through: all our conditions for doing a rewrite passed.
@@ -1246,7 +1257,7 @@ impl Timeline {
 
         // Drop out early if there's nothing to do.
         if layers_to_rewrite.is_empty() && drop_layers.is_empty() {
-            return Ok(());
+            return Ok(CompactionOutcome::Done);
         }
 
         info!(
@@ -1314,6 +1325,20 @@ impl Timeline {
                 // the layer has no data for us with the ShardedRange check above, but
                 drop_layers.push(layer);
             }
+
+            // Yield for L0 compaction if necessary, but make sure we update the layer map below
+            // with the work we've already done.
+            if yield_for_l0
+                && self
+                    .l0_compaction_trigger
+                    .notified()
+                    .now_or_never()
+                    .is_some()
+            {
+                info!("shard ancestor compaction yielding for L0 compaction");
+                outcome = CompactionOutcome::YieldForL0;
+                break;
+            }
         }
 
         for layer in &drop_layers {
@@ -1337,27 +1362,36 @@ impl Timeline {
         // necessary for correctness, but it simplifies testing, and avoids proceeding with another
         // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
         // load.
-        info!("shard ancestor compaction waiting for uploads");
-        match self.remote_client.wait_completion().await {
-            Ok(()) => (),
-            Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
-            Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
-                return Err(CompactionError::ShuttingDown);
+        if outcome != CompactionOutcome::YieldForL0 {
+            info!("shard ancestor compaction waiting for uploads");
+            tokio::select! {
+                result = self.remote_client.wait_completion() => match result {
+                    Ok(()) => {},
+                    Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
+                    Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
+                        return Err(CompactionError::ShuttingDown);
+                    }
+                },
+                // Don't wait if there's L0 compaction to do. We don't need to update the outcome
+                // here, because we've already done the actual work.
+                _ = self.l0_compaction_trigger.notified(), if yield_for_l0 => {},
             }
         }
 
         info!(
             "shard ancestor compaction done in {:.3}s{}",
             started.elapsed().as_secs_f64(),
-            match rewrite_max_exceeded {
-                true => format!(", more work pending due to rewrite_max={rewrite_max}"),
-                false => String::new(),
+            match outcome {
+                CompactionOutcome::Pending =>
+                    format!(", with pending work (rewrite_max={rewrite_max})"),
+                CompactionOutcome::YieldForL0 => String::from(", yielding for L0 compaction"),
+                CompactionOutcome::Skipped | CompactionOutcome::Done => String::new(),
             }
         );
 
         fail::fail_point!("compact-shard-ancestors-persistent");
 
-        Ok(())
+        Ok(outcome)
     }
 
     /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is

From 66f56ddaec25dae216b2c5ebb13822d201ab61a7 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 11 Apr 2025 11:20:51 -0400
Subject: [PATCH 117/140] fix(pageserver): allow shutdown errors for gc
 compaction tests (#11530)

## Problem

`test_pageserver_compaction_preempt` is flaky.

## Summary of changes

Allow the shutdown errors.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/regress/test_compaction.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 087fafb327..84d37de9f1 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -162,6 +162,8 @@ def test_pageserver_compaction_preempt(
     conf = PREEMPT_COMPACTION_TENANT_CONF.copy()
     env = neon_env_builder.init_start(initial_tenant_conf=conf)
 
+    env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*")
+
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 

From 4f7b2cdd4f543b381a125ef70a3a07d20d3c34c7 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 11 Apr 2025 11:50:29 -0400
Subject: [PATCH 118/140] feat(pageserver): gc-compaction result verification
 (#11515)

## Problem

Part of #9114

There was a debug-mode verification mode that verifies at every
retain_lsn. However, the code was tangled within the actual history
generation itself and it's hard to reason about correctness. This patch
adds a separate post-verification of the gc-compaction result that redos
logs at every retain_lsn and every record above the GC horizon. This
ensures that all key history we produce with gc-compaction is readable,
and if there're read errors after gc-compaction, it can only be
read-path errors instead of gc-compaction bugs.

## Summary of changes

* Add gc_compaction_verification flag, default to true.
* Implement a post-verification process.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 control_plane/src/pageserver.rs               |   5 +
 libs/pageserver_api/src/config.rs             |   4 +
 libs/pageserver_api/src/models.rs             |  13 ++
 pageserver/src/tenant.rs                      |  13 +-
 pageserver/src/tenant/timeline.rs             |   5 +
 pageserver/src/tenant/timeline/compaction.rs  | 143 ++++++++++++++++--
 .../regress/test_attach_tenant_config.py      |   1 +
 7 files changed, 170 insertions(+), 14 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 591eb3728b..5c985e6dc8 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -535,6 +535,11 @@ impl PageServerNode {
                 .map(|x| x.parse::<bool>())
                 .transpose()
                 .context("Failed to parse 'gc_compaction_enabled' as bool")?,
+            gc_compaction_verification: settings
+                .remove("gc_compaction_verification")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'gc_compaction_verification' as bool")?,
             gc_compaction_initial_threshold_kb: settings
                 .remove("gc_compaction_initial_threshold_kb")
                 .map(|x| x.parse::<u64>())
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index bd9f7efb7f..3820022011 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -452,6 +452,8 @@ pub struct TenantConfigToml {
     // gc-compaction related configs
     /// Enable automatic gc-compaction trigger on this tenant.
     pub gc_compaction_enabled: bool,
+    /// Enable verification of gc-compaction results.
+    pub gc_compaction_verification: bool,
     /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
     /// gc-compaction will be triggered.
     pub gc_compaction_initial_threshold_kb: u64,
@@ -692,6 +694,7 @@ pub mod tenant_conf_defaults {
     // image layers should be created.
     pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
     pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
+    pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
     pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
     pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
 }
@@ -746,6 +749,7 @@ impl Default for TenantConfigToml {
             wal_receiver_protocol_override: None,
             rel_size_v2_enabled: false,
             gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
+            gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
             gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
             gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
             sampling_ratio: None,
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 34a419f2cf..f491ed10e1 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -576,6 +576,8 @@ pub struct TenantConfigPatch {
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub gc_compaction_enabled: FieldPatch<bool>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_compaction_verification: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub gc_compaction_ratio_percent: FieldPatch<u64>,
@@ -696,6 +698,9 @@ pub struct TenantConfig {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub gc_compaction_enabled: Option<bool>,
 
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub gc_compaction_verification: Option<bool>,
+
     #[serde(skip_serializing_if = "Option::is_none")]
     pub gc_compaction_initial_threshold_kb: Option<u64>,
 
@@ -744,6 +749,7 @@ impl TenantConfig {
             mut wal_receiver_protocol_override,
             mut rel_size_v2_enabled,
             mut gc_compaction_enabled,
+            mut gc_compaction_verification,
             mut gc_compaction_initial_threshold_kb,
             mut gc_compaction_ratio_percent,
             mut sampling_ratio,
@@ -835,6 +841,9 @@ impl TenantConfig {
         patch
             .gc_compaction_enabled
             .apply(&mut gc_compaction_enabled);
+        patch
+            .gc_compaction_verification
+            .apply(&mut gc_compaction_verification);
         patch
             .gc_compaction_initial_threshold_kb
             .apply(&mut gc_compaction_initial_threshold_kb);
@@ -876,6 +885,7 @@ impl TenantConfig {
             wal_receiver_protocol_override,
             rel_size_v2_enabled,
             gc_compaction_enabled,
+            gc_compaction_verification,
             gc_compaction_initial_threshold_kb,
             gc_compaction_ratio_percent,
             sampling_ratio,
@@ -974,6 +984,9 @@ impl TenantConfig {
             gc_compaction_enabled: self
                 .gc_compaction_enabled
                 .unwrap_or(global_conf.gc_compaction_enabled),
+            gc_compaction_verification: self
+                .gc_compaction_verification
+                .unwrap_or(global_conf.gc_compaction_verification),
             gc_compaction_initial_threshold_kb: self
                 .gc_compaction_initial_threshold_kb
                 .unwrap_or(global_conf.gc_compaction_initial_threshold_kb),
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ad4a0d804d..4e3d849032 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -9257,6 +9257,7 @@ mod tests {
                 &[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
                 3,
                 None,
+                true,
             )
             .await
             .unwrap();
@@ -9381,7 +9382,15 @@ mod tests {
             ),
         ];
         let res = tline
-            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
+            .generate_key_retention(
+                key,
+                &history,
+                Lsn(0x60),
+                &[Lsn(0x40), Lsn(0x50)],
+                3,
+                None,
+                true,
+            )
             .await
             .unwrap();
         let expected_res = KeyHistoryRetention {
@@ -9460,6 +9469,7 @@ mod tests {
                 &[],
                 3,
                 Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
+                true,
             )
             .await
             .unwrap();
@@ -9508,6 +9518,7 @@ mod tests {
                 &[Lsn(0x30)],
                 3,
                 Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
+                true,
             )
             .await
             .unwrap();
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 8a4a6f4b40..67a16db040 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2723,6 +2723,10 @@ impl Timeline {
             .tenant_conf
             .gc_compaction_enabled
             .unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled);
+        let gc_compaction_verification = tenant_conf
+            .tenant_conf
+            .gc_compaction_verification
+            .unwrap_or(self.conf.default_tenant_conf.gc_compaction_verification);
         let gc_compaction_initial_threshold_kb = tenant_conf
             .tenant_conf
             .gc_compaction_initial_threshold_kb
@@ -2737,6 +2741,7 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent);
         GcCompactionCombinedSettings {
             gc_compaction_enabled,
+            gc_compaction_verification,
             gc_compaction_initial_threshold_kb,
             gc_compaction_ratio_percent,
         }
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index a559c7fdec..e3aa5045bb 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -80,6 +80,7 @@ impl std::fmt::Display for GcCompactionJobId {
 
 pub struct GcCompactionCombinedSettings {
     pub gc_compaction_enabled: bool,
+    pub gc_compaction_verification: bool,
     pub gc_compaction_initial_threshold_kb: u64,
     pub gc_compaction_ratio_percent: u64,
 }
@@ -225,6 +226,7 @@ impl GcCompactionQueue {
             gc_compaction_enabled,
             gc_compaction_initial_threshold_kb,
             gc_compaction_ratio_percent,
+            ..
         } = timeline.get_gc_compaction_settings();
         if !gc_compaction_enabled {
             return Ok(());
@@ -788,6 +790,114 @@ impl KeyHistoryRetention {
         }
         Ok(())
     }
+
+    /// Verify if every key in the retention is readable by replaying the logs.
+    async fn verify(
+        &self,
+        key: Key,
+        base_img_from_ancestor: &Option<(Key, Lsn, Bytes)>,
+        full_history: &[(Key, Lsn, Value)],
+        tline: &Arc<Timeline>,
+    ) -> anyhow::Result<()> {
+        // Usually the min_lsn should be the first record but we do a full iteration to be safe.
+        let Some(min_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).min() else {
+            // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
+            return Ok(());
+        };
+        let Some(max_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).max() else {
+            // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
+            return Ok(());
+        };
+        let mut base_img = base_img_from_ancestor
+            .as_ref()
+            .map(|(_, lsn, img)| (*lsn, img));
+        let mut history = Vec::new();
+
+        async fn collect_and_verify(
+            key: Key,
+            lsn: Lsn,
+            base_img: &Option<(Lsn, &Bytes)>,
+            history: &[(Lsn, &NeonWalRecord)],
+            tline: &Arc<Timeline>,
+        ) -> anyhow::Result<()> {
+            let mut records = history
+                .iter()
+                .map(|(lsn, val)| (*lsn, (*val).clone()))
+                .collect::<Vec<_>>();
+
+            // WAL redo requires records in the reverse LSN order
+            records.reverse();
+            let data = ValueReconstructState {
+                img: base_img.as_ref().map(|(lsn, img)| (*lsn, (*img).clone())),
+                records,
+            };
+
+            tline
+                .reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction)
+                .await
+                .with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?;
+
+            Ok(())
+        }
+
+        for (retain_lsn, KeyLogAtLsn(logs)) in &self.below_horizon {
+            for (lsn, val) in logs {
+                match val {
+                    Value::Image(img) => {
+                        base_img = Some((*lsn, img));
+                        history.clear();
+                    }
+                    Value::WalRecord(rec) if val.will_init() => {
+                        base_img = None;
+                        history.clear();
+                        history.push((*lsn, rec));
+                    }
+                    Value::WalRecord(rec) => {
+                        history.push((*lsn, rec));
+                    }
+                }
+            }
+            if *retain_lsn >= min_lsn {
+                // Only verify after the key appears in the full history for the first time.
+
+                if base_img.is_none() && history.is_empty() {
+                    anyhow::bail!(
+                        "verificatoin failed: key {} has no history at {}",
+                        key,
+                        retain_lsn
+                    );
+                };
+                // We don't modify history: in theory, we could replace the history with a single
+                // image as in `generate_key_retention` to make redos at later LSNs faster. But we
+                // want to verify everything as if they are read from the real layer map.
+                collect_and_verify(key, *retain_lsn, &base_img, &history, tline).await?;
+            }
+        }
+
+        for (lsn, val) in &self.above_horizon.0 {
+            match val {
+                Value::Image(img) => {
+                    // Above the GC horizon, we verify every time we see an image.
+                    collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
+                    base_img = Some((*lsn, img));
+                    history.clear();
+                }
+                Value::WalRecord(rec) if val.will_init() => {
+                    // Above the GC horizon, we verify every time we see an init record.
+                    collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
+                    base_img = None;
+                    history.clear();
+                    history.push((*lsn, rec));
+                }
+                Value::WalRecord(rec) => {
+                    history.push((*lsn, rec));
+                }
+            }
+        }
+        // Ensure the latest record is readable.
+        collect_and_verify(key, max_lsn, &base_img, &history, tline).await?;
+        Ok(())
+    }
 }
 
 #[derive(Debug, Serialize, Default)]
@@ -2210,6 +2320,7 @@ impl Timeline {
     /// ```
     ///
     /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
+    #[allow(clippy::too_many_arguments)]
     pub(crate) async fn generate_key_retention(
         self: &Arc<Timeline>,
         key: Key,
@@ -2218,6 +2329,7 @@ impl Timeline {
         retain_lsn_below_horizon: &[Lsn],
         delta_threshold_cnt: usize,
         base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
+        verification: bool,
     ) -> anyhow::Result<KeyHistoryRetention> {
         // Pre-checks for the invariants
 
@@ -2304,8 +2416,8 @@ impl Timeline {
             "should have at least below + above horizon batches"
         );
         let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
-        if let Some((key, lsn, img)) = base_img_from_ancestor {
-            replay_history.push((key, lsn, Value::Image(img)));
+        if let Some((key, lsn, ref img)) = base_img_from_ancestor {
+            replay_history.push((key, lsn, Value::Image(img.clone())));
         }
 
         /// Generate debug information for the replay history
@@ -2419,22 +2531,15 @@ impl Timeline {
             // Whether to reconstruct the image. In debug mode, we will generate an image
             // at every retain_lsn to ensure data is not corrupted, but we won't put the
             // image into the final layer.
-            let generate_image = produce_image || debug_mode;
-            if produce_image {
+            let img_and_lsn = if produce_image {
                 records_since_last_image = 0;
-            }
-            let img_and_lsn = if generate_image {
                 let replay_history_for_debug = if debug_mode {
                     Some(replay_history.clone())
                 } else {
                     None
                 };
                 let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
-                let history = if produce_image {
-                    std::mem::take(&mut replay_history)
-                } else {
-                    replay_history.clone()
-                };
+                let history = std::mem::take(&mut replay_history);
                 let mut img = None;
                 let mut records = Vec::with_capacity(history.len());
                 if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
@@ -2469,6 +2574,7 @@ impl Timeline {
                         records.push((lsn, rec));
                     }
                 }
+                // WAL redo requires records in the reverse LSN order
                 records.reverse();
                 let state = ValueReconstructState { img, records };
                 // last batch does not generate image so i is always in range, unless we force generate
@@ -2501,10 +2607,16 @@ impl Timeline {
         assert_eq!(retention.len(), lsn_split_points.len() + 1);
         for (idx, logs) in retention.into_iter().enumerate() {
             if idx == lsn_split_points.len() {
-                return Ok(KeyHistoryRetention {
+                let retention = KeyHistoryRetention {
                     below_horizon: result,
                     above_horizon: KeyLogAtLsn(logs),
-                });
+                };
+                if verification {
+                    retention
+                        .verify(key, &base_img_from_ancestor, full_history, self)
+                        .await?;
+                }
+                return Ok(retention);
             } else {
                 result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
             }
@@ -2971,6 +3083,9 @@ impl Timeline {
             }
             (false, res)
         };
+
+        let verification = self.get_gc_compaction_settings().gc_compaction_verification;
+
         info!(
             "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}",
             job_desc.selected_layers.len(),
@@ -3287,6 +3402,7 @@ impl Timeline {
                             .await
                             .context("failed to get ancestor image")
                             .map_err(CompactionError::Other)?,
+                        verification,
                     )
                     .await
                     .context("failed to generate key retention")
@@ -3327,6 +3443,7 @@ impl Timeline {
                     .await
                     .context("failed to get ancestor image")
                     .map_err(CompactionError::Other)?,
+                verification,
             )
             .await
             .context("failed to generate key retention")
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 5021cc4b17..9b6930695c 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -187,6 +187,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         },
         "rel_size_v2_enabled": False,  # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it
         "gc_compaction_enabled": True,
+        "gc_compaction_verification": False,
         "gc_compaction_initial_threshold_kb": 1024000,
         "gc_compaction_ratio_percent": 200,
         "image_creation_preempt_threshold": 5,

From d109bf8c1d4a0837639837957917ab79106271af Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Fri, 11 Apr 2025 21:49:15 +0400
Subject: [PATCH 119/140] neon_local: use ed25519 to gen local ssl certs
 (#11542)

## Problem
neon_local uses rsa to generate local SSL certs, which is slow
Follow-up on:
- https://github.com/neondatabase/neon/pull/11025#discussion_r1989453785
- https://github.com/neondatabase/neon/pull/11538

## Summary of changes
- Change key from rsa to ed25519 in neon_local
---
 control_plane/src/local_env.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 8e2a110366..fa10abe91a 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -980,7 +980,7 @@ fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()>
     // -out rootCA.crt -keyout rootCA.key
     let keygen_output = Command::new("openssl")
         .args([
-            "req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
+            "req", "-x509", "-newkey", "ed25519", "-nodes", "-days", "36500",
         ])
         .args(["-subj", "/CN=Neon Local CA"])
         .args(["-out", cert_path.to_str().unwrap()])
@@ -1010,7 +1010,7 @@ fn generate_ssl_cert(
     // -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
     let keygen_output = Command::new("openssl")
         .args(["req", "-new", "-nodes"])
-        .args(["-newkey", "rsa:2048"])
+        .args(["-newkey", "ed25519"])
         .args(["-subj", "/CN=localhost"])
         .args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
         .args(["-keyout", key_path.to_str().unwrap()])

From 946e971df8550ffd56c31584bc1cd372ab88599c Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Sat, 12 Apr 2025 10:16:22 +0100
Subject: [PATCH 120/140] feat(proxy): add batching to cancellation queue
 processing (#10607)

Add batching to the redis queue, which allows us to clear it out quicker
should it slow down temporarily.
---
 proxy/src/binary/proxy.rs |   9 +-
 proxy/src/cancellation.rs | 220 +++++++++++++++++++++++++++-----------
 proxy/src/redis/kv_ops.rs | 172 +++++------------------------
 3 files changed, 192 insertions(+), 209 deletions(-)

diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs
index 62fdc18207..e03f2f33d9 100644
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -509,7 +509,14 @@ pub async fn run() -> anyhow::Result<()> {
             if let Some(mut redis_kv_client) = redis_kv_client {
                 maintenance_tasks.spawn(async move {
                     redis_kv_client.try_connect().await?;
-                    handle_cancel_messages(&mut redis_kv_client, rx_cancel).await
+                    handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?;
+
+                    drop(redis_kv_client);
+
+                    // `handle_cancel_messages` was terminated due to the tx_cancel
+                    // being dropped. this is not worthy of an error, and this task can only return `Err`,
+                    // so let's wait forever instead.
+                    std::future::pending().await
                 });
             }
 
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index d6a7406f67..c5ba04eb8c 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,16 +1,17 @@
-use std::convert::Infallible;
 use std::net::{IpAddr, SocketAddr};
 use std::sync::Arc;
 
+use anyhow::{Context, anyhow};
 use ipnet::{IpNet, Ipv4Net, Ipv6Net};
 use postgres_client::CancelToken;
 use postgres_client::tls::MakeTlsConnect;
 use pq_proto::CancelKeyData;
+use redis::{FromRedisValue, Pipeline, Value, pipe};
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio::sync::{mpsc, oneshot};
-use tracing::{debug, info};
+use tracing::{debug, info, warn};
 
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::{AuthError, check_peer_addr_is_in_list};
@@ -30,6 +31,7 @@ type IpSubnetKey = IpNet;
 
 const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time
 const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10);
+const BATCH_SIZE: usize = 8;
 
 // Message types for sending through mpsc channel
 pub enum CancelKeyOp {
@@ -54,78 +56,168 @@ pub enum CancelKeyOp {
     },
 }
 
+impl CancelKeyOp {
+    fn register(self, pipe: &mut Pipeline) -> Option<CancelReplyOp> {
+        #[allow(clippy::used_underscore_binding)]
+        match self {
+            CancelKeyOp::StoreCancelKey {
+                key,
+                field,
+                value,
+                resp_tx,
+                _guard,
+                expire,
+            } => {
+                pipe.hset(&key, field, value);
+                pipe.expire(key, expire);
+                let resp_tx = resp_tx?;
+                Some(CancelReplyOp::StoreCancelKey { resp_tx, _guard })
+            }
+            CancelKeyOp::GetCancelData {
+                key,
+                resp_tx,
+                _guard,
+            } => {
+                pipe.hgetall(key);
+                Some(CancelReplyOp::GetCancelData { resp_tx, _guard })
+            }
+            CancelKeyOp::RemoveCancelKey {
+                key,
+                field,
+                resp_tx,
+                _guard,
+            } => {
+                pipe.hdel(key, field);
+                let resp_tx = resp_tx?;
+                Some(CancelReplyOp::RemoveCancelKey { resp_tx, _guard })
+            }
+        }
+    }
+}
+
+// Message types for sending through mpsc channel
+pub enum CancelReplyOp {
+    StoreCancelKey {
+        resp_tx: oneshot::Sender<anyhow::Result<()>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+    GetCancelData {
+        resp_tx: oneshot::Sender<anyhow::Result<Vec<(String, String)>>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+    RemoveCancelKey {
+        resp_tx: oneshot::Sender<anyhow::Result<()>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+}
+
+impl CancelReplyOp {
+    fn send_err(self, e: anyhow::Error) {
+        match self {
+            CancelReplyOp::StoreCancelKey { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::GetCancelData { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+        }
+    }
+
+    fn send_value(self, v: redis::Value) {
+        match self {
+            CancelReplyOp::StoreCancelKey { resp_tx, _guard } => {
+                let send =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(send)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::GetCancelData { resp_tx, _guard } => {
+                let send =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(send)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => {
+                let send =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(send)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+        }
+    }
+}
+
 // Running as a separate task to accept messages through the rx channel
-// In case of problems with RTT: switch to recv_many() + redis pipeline
 pub async fn handle_cancel_messages(
     client: &mut RedisKVClient,
     mut rx: mpsc::Receiver<CancelKeyOp>,
-) -> anyhow::Result<Infallible> {
+) -> anyhow::Result<()> {
+    let mut batch = Vec::new();
+    let mut replies = vec![];
+
     loop {
-        if let Some(msg) = rx.recv().await {
-            match msg {
-                CancelKeyOp::StoreCancelKey {
-                    key,
-                    field,
-                    value,
-                    resp_tx,
-                    _guard,
-                    expire,
-                } => {
-                    let res = client.hset(&key, field, value).await;
-                    if let Some(resp_tx) = resp_tx {
-                        if res.is_ok() {
-                            resp_tx
-                                .send(client.expire(key, expire).await)
-                                .inspect_err(|e| {
-                                    tracing::debug!(
-                                        "failed to send StoreCancelKey response: {:?}",
-                                        e
-                                    );
-                                })
-                                .ok();
-                        } else {
-                            resp_tx
-                                .send(res)
-                                .inspect_err(|e| {
-                                    tracing::debug!(
-                                        "failed to send StoreCancelKey response: {:?}",
-                                        e
-                                    );
-                                })
-                                .ok();
-                        }
-                    } else if res.is_ok() {
-                        drop(client.expire(key, expire).await);
-                    } else {
-                        tracing::warn!("failed to store cancel key: {:?}", res);
-                    }
+        if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 {
+            warn!("shutting down cancellation queue");
+            break Ok(());
+        }
+
+        let batch_size = batch.len();
+        debug!(batch_size, "running cancellation jobs");
+
+        let mut pipe = pipe();
+        for msg in batch.drain(..) {
+            if let Some(reply) = msg.register(&mut pipe) {
+                replies.push(reply);
+            } else {
+                pipe.ignore();
+            }
+        }
+
+        let responses = replies.len();
+
+        match client.query(pipe).await {
+            // for each reply, we expect that many values.
+            Ok(Value::Array(values)) if values.len() == responses => {
+                debug!(
+                    batch_size,
+                    responses, "successfully completed cancellation jobs",
+                );
+                for (value, reply) in std::iter::zip(values, replies.drain(..)) {
+                    reply.send_value(value);
                 }
-                CancelKeyOp::GetCancelData {
-                    key,
-                    resp_tx,
-                    _guard,
-                } => {
-                    drop(resp_tx.send(client.hget_all(key).await));
+            }
+            Ok(value) => {
+                debug!(?value, "unexpected redis return value");
+                for reply in replies.drain(..) {
+                    reply.send_err(anyhow!("incorrect response type from redis"));
                 }
-                CancelKeyOp::RemoveCancelKey {
-                    key,
-                    field,
-                    resp_tx,
-                    _guard,
-                } => {
-                    if let Some(resp_tx) = resp_tx {
-                        resp_tx
-                            .send(client.hdel(key, field).await)
-                            .inspect_err(|e| {
-                                tracing::debug!("failed to send StoreCancelKey response: {:?}", e);
-                            })
-                            .ok();
-                    } else {
-                        drop(client.hdel(key, field).await);
-                    }
+            }
+            Err(err) => {
+                for reply in replies.drain(..) {
+                    reply.send_err(anyhow!("could not send cmd to redis: {err}"));
                 }
             }
         }
+
+        replies.clear();
     }
 }
 
diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs
index 3689bf7ae2..aa627b29a6 100644
--- a/proxy/src/redis/kv_ops.rs
+++ b/proxy/src/redis/kv_ops.rs
@@ -1,4 +1,5 @@
-use redis::{AsyncCommands, ToRedisArgs};
+use redis::aio::ConnectionLike;
+use redis::{Cmd, FromRedisValue, Pipeline, RedisResult};
 
 use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo};
@@ -8,6 +9,23 @@ pub struct RedisKVClient {
     limiter: GlobalRateLimiter,
 }
 
+#[allow(async_fn_in_trait)]
+pub trait Queryable {
+    async fn query<T: FromRedisValue>(&self, conn: &mut impl ConnectionLike) -> RedisResult<T>;
+}
+
+impl Queryable for Pipeline {
+    async fn query<T: FromRedisValue>(&self, conn: &mut impl ConnectionLike) -> RedisResult<T> {
+        self.query_async(conn).await
+    }
+}
+
+impl Queryable for Cmd {
+    async fn query<T: FromRedisValue>(&self, conn: &mut impl ConnectionLike) -> RedisResult<T> {
+        self.query_async(conn).await
+    }
+}
+
 impl RedisKVClient {
     pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self {
         Self {
@@ -27,158 +45,24 @@ impl RedisKVClient {
         Ok(())
     }
 
-    pub(crate) async fn hset<K, F, V>(&mut self, key: K, field: F, value: V) -> anyhow::Result<()>
-    where
-        K: ToRedisArgs + Send + Sync,
-        F: ToRedisArgs + Send + Sync,
-        V: ToRedisArgs + Send + Sync,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hset");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.hset(&key, &field, &value).await {
-            Ok(()) => return Ok(()),
-            Err(e) => {
-                tracing::error!("failed to set a key-value pair: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client
-            .hset(key, field, value)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
-    #[allow(dead_code)]
-    pub(crate) async fn hset_multiple<K, V>(
+    pub(crate) async fn query<T: FromRedisValue>(
         &mut self,
-        key: &str,
-        items: &[(K, V)],
-    ) -> anyhow::Result<()>
-    where
-        K: ToRedisArgs + Send + Sync,
-        V: ToRedisArgs + Send + Sync,
-    {
+        q: impl Queryable,
+    ) -> anyhow::Result<T> {
         if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hset_multiple");
+            tracing::info!("Rate limit exceeded. Skipping query");
             return Err(anyhow::anyhow!("Rate limit exceeded"));
         }
 
-        match self.client.hset_multiple(key, items).await {
-            Ok(()) => return Ok(()),
+        match q.query(&mut self.client).await {
+            Ok(t) => return Ok(t),
             Err(e) => {
-                tracing::error!("failed to set a key-value pair: {e}");
+                tracing::error!("failed to run query: {e}");
             }
         }
 
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
+        tracing::info!("Redis client is disconnected. Reconnecting...");
         self.try_connect().await?;
-        self.client
-            .hset_multiple(key, items)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
-    #[allow(dead_code)]
-    pub(crate) async fn expire<K>(&mut self, key: K, seconds: i64) -> anyhow::Result<()>
-    where
-        K: ToRedisArgs + Send + Sync,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping expire");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.expire(&key, seconds).await {
-            Ok(()) => return Ok(()),
-            Err(e) => {
-                tracing::error!("failed to set a key-value pair: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client
-            .expire(key, seconds)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
-    #[allow(dead_code)]
-    pub(crate) async fn hget<K, F, V>(&mut self, key: K, field: F) -> anyhow::Result<V>
-    where
-        K: ToRedisArgs + Send + Sync,
-        F: ToRedisArgs + Send + Sync,
-        V: redis::FromRedisValue,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hget");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.hget(&key, &field).await {
-            Ok(value) => return Ok(value),
-            Err(e) => {
-                tracing::error!("failed to get a value: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client
-            .hget(key, field)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
-    pub(crate) async fn hget_all<K, V>(&mut self, key: K) -> anyhow::Result<V>
-    where
-        K: ToRedisArgs + Send + Sync,
-        V: redis::FromRedisValue,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hgetall");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.hgetall(&key).await {
-            Ok(value) => return Ok(value),
-            Err(e) => {
-                tracing::error!("failed to get a value: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client.hgetall(key).await.map_err(anyhow::Error::new)
-    }
-
-    pub(crate) async fn hdel<K, F>(&mut self, key: K, field: F) -> anyhow::Result<()>
-    where
-        K: ToRedisArgs + Send + Sync,
-        F: ToRedisArgs + Send + Sync,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hdel");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.hdel(&key, &field).await {
-            Ok(()) => return Ok(()),
-            Err(e) => {
-                tracing::error!("failed to delete a key-value pair: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client
-            .hdel(key, field)
-            .await
-            .map_err(anyhow::Error::new)
+        Ok(q.query(&mut self.client).await?)
     }
 }

From 8936a7abd85a28786f8f9b85f778768ceddc2bf2 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Sat, 12 Apr 2025 21:09:12 +0300
Subject: [PATCH 121/140] Increase limit for worker processes for isolation
 test (#11504)

## Problem

See https://github.com/neondatabase/neon/issues/10652

Neon extension launches 2 BGW which reduce limit for parallel workers
and so affecting parallel_deadlock isolation test.

## Summary of changes

Increase `max_worker_processes` from default 8 to 16 for isolation test.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 test_runner/regress/test_pg_regress.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index a3fae97327..0fea706888 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -239,6 +239,8 @@ def test_isolation(
             "neon.regress_test_mode = true",
             # Stack size should be increased for tests to pass with asan.
             "max_stack_depth = 4MB",
+            # Neon extensiosn starts 2 BGW so decreasing number of parallel workers which can affect deadlock-parallel test if it hits max_worker_processes.
+            "max_worker_processes = 16",
         ],
     )
     endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

From a338984dc77491c90a93076c0757524e99d644c0 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 14 Apr 2025 10:05:29 +0100
Subject: [PATCH 122/140] pageserver: support keys at different LSNs in one get
 page batch  (#11494)

## Problem

Get page batching stops when we encounter requests at different LSNs.
We are leaving batching factor on the table.

## Summary of changes

The goal is to support keys with different LSNs in a single batch and
still serve them with a single vectored get.
Important restriction: the same key at different LSNs is not supported
in one batch. Returning different key
versions is a much more intrusive change.

Firstly, the read path is changed to support "scattered" queries. This
is a conceptually simple step from
https://github.com/neondatabase/neon/pull/11463. Instead of initializing
the fringe for one keyspace,
we do it for multiple at different LSNs and let the logic already
present into the fringe handle selection.

Secondly, page service code is updated to support batching at different
LSNs. Eeach request parsed from the wire determines its effective
request LSN and keeps it in mem for the batcher toinspect. The batcher
allows keys at
different LSNs in one batch as long one key is not requested at
different LSNs.

I'd suggest doing the first pass commit by commit to get a feel for the
changes.

## Results

I used the batching test from [Christian's
PR](https://github.com/neondatabase/neon/pull/11391) which increases the
change of batch breaks. Looking at the logs I think the new code is at
the max batching factor for the workload (we
only break batches due to them being oversized or because the executor
is idle).

```
Main:
Reasons for stopping batching: {'LSN changed': 22843, 'of batch size': 33417}
test_throughput[release-pg16-50-pipelining_config0-30-100-128-batchable {'max_batch_size': 32, 'execution': 'concurrent-futures', 'mode': 'pipelined'}].perfmetric.batching_factor: 14.6662

My branch:
Reasons for stopping batching: {'of batch size': 37024}
test_throughput[release-pg16-50-pipelining_config0-30-100-128-batchable {'max_batch_size': 32, 'execution': 'concurrent-futures', 'mode': 'pipelined'}].perfmetric.batching_factor: 19.8333
```

Related: https://github.com/neondatabase/neon/issues/10765
---
 libs/pageserver_api/src/config.rs             |  20 +
 pageserver/src/basebackup.rs                  |   5 +-
 pageserver/src/metrics.rs                     |  14 +-
 pageserver/src/page_service.rs                | 206 +++++++---
 pageserver/src/pgdatadir_mapping.rs           |  98 ++---
 pageserver/src/tenant.rs                      |  85 ++--
 pageserver/src/tenant/storage_layer.rs        |  28 +-
 pageserver/src/tenant/timeline.rs             | 368 +++++++++++++-----
 .../src/tenant/timeline/detach_ancestor.rs    |   9 +-
 test_runner/fixtures/neon_fixtures.py         |   5 +
 .../pageserver/test_page_service_batching.py  |  17 +-
 .../test_page_service_batching_regressions.py |   1 +
 12 files changed, 591 insertions(+), 265 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 3820022011..53b68afb0f 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -207,6 +207,10 @@ pub struct PageServicePipeliningConfigPipelined {
     /// Causes runtime errors if larger than max get_vectored batch size.
     pub max_batch_size: NonZeroUsize,
     pub execution: PageServiceProtocolPipelinedExecutionStrategy,
+    // The default below is such that new versions of the software can start
+    // with the old configuration.
+    #[serde(default)]
+    pub batching: PageServiceProtocolPipelinedBatchingStrategy,
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -216,6 +220,19 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {
     Tasks,
 }
 
+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "kebab-case")]
+pub enum PageServiceProtocolPipelinedBatchingStrategy {
+    /// All get page requests in a batch will be at the same LSN
+    #[default]
+    UniformLsn,
+    /// Get page requests in a batch may be at different LSN
+    ///
+    /// One key cannot be present more than once at different LSNs in
+    /// the same batch.
+    ScatteredLsn,
+}
+
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case")]
 pub enum GetVectoredConcurrentIo {
@@ -615,9 +632,12 @@ impl Default for ConfigToml {
             page_service_pipelining: if !cfg!(test) {
                 PageServicePipeliningConfig::Serial
             } else {
+                // Do not turn this into the default until scattered reads have been
+                // validated and rolled-out fully.
                 PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
                     max_batch_size: NonZeroUsize::new(32).unwrap(),
                     execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
+                    batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
                 })
             },
             get_vectored_concurrent_io: if !cfg!(test) {
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index de527e307b..3510ccb529 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -34,7 +34,7 @@ use utils::lsn::Lsn;
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::Version;
 use crate::tenant::storage_layer::IoConcurrency;
-use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
 use crate::tenant::{PageReconstructError, Timeline};
 
 #[derive(Debug, thiserror::Error)]
@@ -353,9 +353,10 @@ where
             let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
 
             for part in slru_partitions.parts {
+                let query = VersionedKeySpaceQuery::uniform(part, self.lsn);
                 let blocks = self
                     .timeline
-                    .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
+                    .get_vectored(query, self.io_concurrency.clone(), self.ctx)
                     .await?;
 
                 for (key, block) in blocks {
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 1fe51021fd..59bb3410f9 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -17,7 +17,7 @@ use metrics::{
 use once_cell::sync::Lazy;
 use pageserver_api::config::{
     PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
-    PageServiceProtocolPipelinedExecutionStrategy,
+    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
@@ -1863,7 +1863,7 @@ pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::
         "pageserver_page_service_config_max_batch_size",
         "Configured maximum batch size for the server-side batching functionality of page_service. \
          Labels expose more of the configuration parameters.",
-        &["mode", "execution"]
+        &["mode", "execution", "batching"]
     )
     .expect("failed to define a metric")
 });
@@ -1871,10 +1871,11 @@ pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::
 fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
     PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
     let (label_values, value) = match conf {
-        PageServicePipeliningConfig::Serial => (["serial", "-"], 1),
+        PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1),
         PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
             max_batch_size,
             execution,
+            batching,
         }) => {
             let mode = "pipelined";
             let execution = match execution {
@@ -1883,7 +1884,12 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
                 }
                 PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks",
             };
-            ([mode, execution], max_batch_size.get())
+            let batching = match batching {
+                PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => "uniform-lsn",
+                PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => "scattered-lsn",
+            };
+
+            ([mode, execution, batching], max_batch_size.get())
         }
     };
     PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 61f524fc29..26eea5183b 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -18,7 +18,7 @@ use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
     PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
-    PageServiceProtocolPipelinedExecutionStrategy,
+    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::models::{
@@ -641,6 +641,7 @@ impl std::fmt::Display for BatchedPageStreamError {
 struct BatchedGetPageRequest {
     req: PagestreamGetPageRequest,
     timer: SmgrOpTimer,
+    effective_request_lsn: Lsn,
     ctx: RequestContext,
 }
 
@@ -670,7 +671,6 @@ enum BatchedFeMessage {
     GetPage {
         span: Span,
         shard: timeline::handle::WeakHandle<TenantManagerTypes>,
-        effective_request_lsn: Lsn,
         pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
     },
     DbSize {
@@ -1025,34 +1025,28 @@ impl PageServerHandler {
                 .await?;
 
                 // We're holding the Handle
-                // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
-                let res = Self::wait_or_get_last_lsn(
+                let effective_request_lsn = match Self::effective_request_lsn(
                     &shard,
+                    shard.get_last_record_lsn(),
                     req.hdr.request_lsn,
                     req.hdr.not_modified_since,
                     &shard.get_applied_gc_cutoff_lsn(),
-                    &ctx,
-                )
-                .maybe_perf_instrument(&ctx, |current_perf_span| {
-                    info_span!(
-                        target: PERF_TRACE_TARGET,
-                        parent: current_perf_span,
-                        "WAIT_LSN",
-                    )
-                })
-                .await;
-
-                let effective_request_lsn = match res {
+                ) {
                     Ok(lsn) => lsn,
                     Err(e) => {
                         return respond_error!(span, e);
                     }
                 };
+
                 BatchedFeMessage::GetPage {
                     span,
                     shard: shard.downgrade(),
-                    effective_request_lsn,
-                    pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, ctx }],
+                    pages: smallvec::smallvec![BatchedGetPageRequest {
+                        req,
+                        timer,
+                        effective_request_lsn,
+                        ctx,
+                    }],
                 }
             }
             #[cfg(feature = "testing")]
@@ -1078,6 +1072,7 @@ impl PageServerHandler {
     #[instrument(skip_all, level = tracing::Level::TRACE)]
     #[allow(clippy::boxed_local)]
     fn pagestream_do_batch(
+        batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
         max_batch_size: NonZeroUsize,
         batch: &mut Result<BatchedFeMessage, QueryError>,
         this_msg: Result<BatchedFeMessage, QueryError>,
@@ -1096,33 +1091,61 @@ impl PageServerHandler {
                     span: _,
                     shard: accum_shard,
                     pages: accum_pages,
-                    effective_request_lsn: accum_lsn,
                 }),
                 BatchedFeMessage::GetPage {
                     span: _,
                     shard: this_shard,
                     pages: this_pages,
-                    effective_request_lsn: this_lsn,
                 },
             ) if (|| {
                 assert_eq!(this_pages.len(), 1);
                 if accum_pages.len() >= max_batch_size.get() {
-                    trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size");
+                    trace!(%max_batch_size, "stopping batching because of batch size");
                     assert_eq!(accum_pages.len(), max_batch_size.get());
                     return false;
                 }
                 if !accum_shard.is_same_handle_as(&this_shard) {
-                    trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
+                    trace!("stopping batching because timeline object mismatch");
                     // TODO: we _could_ batch & execute each shard seperately (and in parallel).
                     // But the current logic for keeping responses in order does not support that.
                     return false;
                 }
-                // the vectored get currently only supports a single LSN, so, bounce as soon
-                // as the effective request_lsn changes
-                if *accum_lsn != this_lsn {
-                    trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed");
-                    return false;
+
+                match batching_strategy {
+                    PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
+                        if let Some(last_in_batch) = accum_pages.last() {
+                            if last_in_batch.effective_request_lsn
+                                != this_pages[0].effective_request_lsn
+                            {
+                                return false;
+                            }
+                        }
+                    }
+                    PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => {
+                        // The read path doesn't curently support serving the same page at different LSNs.
+                        // While technically possible, it's uncertain if the complexity is worth it.
+                        // Break the batch if such a case is encountered.
+                        //
+                        // TODO(vlad): Include a metric for batch breaks with a reason label.
+                        let same_page_different_lsn = accum_pages.iter().any(|batched| {
+                            batched.req.rel == this_pages[0].req.rel
+                                && batched.req.blkno == this_pages[0].req.blkno
+                                && batched.effective_request_lsn
+                                    != this_pages[0].effective_request_lsn
+                        });
+
+                        if same_page_different_lsn {
+                            trace!(
+                                rel=%this_pages[0].req.rel,
+                                blkno=%this_pages[0].req.blkno,
+                                lsn=%this_pages[0].effective_request_lsn,
+                                "stopping batching because same page was requested at different LSNs"
+                            );
+                            return false;
+                        }
+                    }
                 }
+
                 true
             })() =>
             {
@@ -1390,12 +1413,7 @@ impl PageServerHandler {
                     span,
                 )
             }
-            BatchedFeMessage::GetPage {
-                span,
-                shard,
-                effective_request_lsn,
-                pages,
-            } => {
+            BatchedFeMessage::GetPage { span, shard, pages } => {
                 fail::fail_point!("ps::handle-pagerequest-message::getpage");
                 let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                 (
@@ -1405,7 +1423,6 @@ impl PageServerHandler {
                         let res = self
                             .handle_get_page_at_lsn_request_batched(
                                 &shard,
-                                effective_request_lsn,
                                 pages,
                                 io_concurrency,
                                 &ctx,
@@ -1724,6 +1741,7 @@ impl PageServerHandler {
         let PageServicePipeliningConfigPipelined {
             max_batch_size,
             execution,
+            batching: batching_strategy,
         } = pipelining_config;
 
         // Macro to _define_ a pipeline stage.
@@ -1775,7 +1793,7 @@ impl PageServerHandler {
                     exit |= read_res.is_err();
                     let could_send = batch_tx
                         .send(read_res, |batch, res| {
-                            Self::pagestream_do_batch(max_batch_size, batch, res)
+                            Self::pagestream_do_batch(batching_strategy, max_batch_size, batch, res)
                         })
                         .await;
                     exit |= could_send.is_err();
@@ -1871,7 +1889,39 @@ impl PageServerHandler {
         ctx: &RequestContext,
     ) -> Result<Lsn, PageStreamError> {
         let last_record_lsn = timeline.get_last_record_lsn();
+        let effective_request_lsn = Self::effective_request_lsn(
+            timeline,
+            last_record_lsn,
+            request_lsn,
+            not_modified_since,
+            latest_gc_cutoff_lsn,
+        )?;
 
+        if effective_request_lsn > last_record_lsn {
+            timeline
+                .wait_lsn(
+                    not_modified_since,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    timeline::WaitLsnTimeout::Default,
+                    ctx,
+                )
+                .await?;
+
+            // Since we waited for 'effective_request_lsn' to arrive, that is now the last
+            // record LSN. (Or close enough for our purposes; the last-record LSN can
+            // advance immediately after we return anyway)
+        }
+
+        Ok(effective_request_lsn)
+    }
+
+    fn effective_request_lsn(
+        timeline: &Timeline,
+        last_record_lsn: Lsn,
+        request_lsn: Lsn,
+        not_modified_since: Lsn,
+        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
+    ) -> Result<Lsn, PageStreamError> {
         // Sanity check the request
         if request_lsn < not_modified_since {
             return Err(PageStreamError::BadRequest(
@@ -1906,19 +1956,7 @@ impl PageServerHandler {
             }
         }
 
-        // Wait for WAL up to 'not_modified_since' to arrive, if necessary
         if not_modified_since > last_record_lsn {
-            timeline
-                .wait_lsn(
-                    not_modified_since,
-                    crate::tenant::timeline::WaitLsnWaiter::PageService,
-                    timeline::WaitLsnTimeout::Default,
-                    ctx,
-                )
-                .await?;
-            // Since we waited for 'not_modified_since' to arrive, that is now the last
-            // record LSN. (Or close enough for our purposes; the last-record LSN can
-            // advance immediately after we return anyway)
             Ok(not_modified_since)
         } else {
             // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
@@ -2073,7 +2111,6 @@ impl PageServerHandler {
     async fn handle_get_page_at_lsn_request_batched(
         &mut self,
         timeline: &Timeline,
-        effective_lsn: Lsn,
         requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
         io_concurrency: IoConcurrency,
         ctx: &RequestContext,
@@ -2092,20 +2129,81 @@ impl PageServerHandler {
                 // Ignore error (trace buffer may be full or tracer may have disconnected).
                 _ = page_trace.try_send(PageTraceEvent {
                     key,
-                    effective_lsn,
+                    effective_lsn: batch.effective_request_lsn,
                     time,
                 });
             }
         }
 
+        // If any request in the batch needs to wait for LSN, then do so now.
+        let mut perf_instrument = false;
+        let max_effective_lsn = requests
+            .iter()
+            .map(|req| {
+                if req.ctx.has_perf_span() {
+                    perf_instrument = true;
+                }
+
+                req.effective_request_lsn
+            })
+            .max()
+            .expect("batch is never empty");
+
+        let ctx = match perf_instrument {
+            true => RequestContextBuilder::from(ctx)
+                .root_perf_span(|| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        "GET_VECTORED",
+                        tenant_id = %timeline.tenant_shard_id.tenant_id,
+                        timeline_id = %timeline.timeline_id,
+                        shard = %timeline.tenant_shard_id.shard_slug(),
+                        %max_effective_lsn
+                    )
+                })
+                .attached_child(),
+            false => ctx.attached_child(),
+        };
+
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if max_effective_lsn > last_record_lsn {
+            if let Err(e) = timeline
+                .wait_lsn(
+                    max_effective_lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    timeline::WaitLsnTimeout::Default,
+                    &ctx,
+                )
+                .maybe_perf_instrument(&ctx, |current_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: current_perf_span,
+                        "WAIT_LSN",
+                    )
+                })
+                .await
+            {
+                return Vec::from_iter(requests.into_iter().map(|req| {
+                    Err(BatchedPageStreamError {
+                        err: PageStreamError::from(e.clone()),
+                        req: req.req.hdr,
+                    })
+                }));
+            }
+        }
+
         let results = timeline
             .get_rel_page_at_lsn_batched(
-                requests
-                    .iter()
-                    .map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())),
-                effective_lsn,
+                requests.iter().map(|p| {
+                    (
+                        &p.req.rel,
+                        &p.req.blkno,
+                        p.effective_request_lsn,
+                        p.ctx.attached_child(),
+                    )
+                }),
                 io_concurrency,
-                ctx,
+                &ctx,
             )
             .await;
         assert_eq!(results.len(), requests.len());
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index f33a8baec1..81e548a095 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -6,7 +6,7 @@
 //! walingest.rs handles a few things like implicit relation creation and extension.
 //! Clarify that)
 //!
-use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
+use std::collections::{HashMap, HashSet, hash_map};
 use std::ops::{ControlFlow, Range};
 
 use crate::walingest::{WalIngestError, WalIngestErrorKind};
@@ -14,7 +14,6 @@ use crate::{PERF_TRACE_TARGET, ensure_walingest};
 use anyhow::Context;
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
     AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists,
     TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range,
@@ -22,7 +21,7 @@ use pageserver_api::key::{
     repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
     slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
 };
-use pageserver_api::keyspace::SparseKeySpace;
+use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
 use pageserver_api::models::RelSizeMigration;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
@@ -41,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
 
 use super::tenant::{PageReconstructError, Timeline};
 use crate::aux_file;
-use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
+use crate::context::{PerfInstrumentFutureExt, RequestContext};
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::metrics::{
     RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
@@ -51,7 +50,7 @@ use crate::span::{
     debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
 };
 use crate::tenant::storage_layer::IoConcurrency;
-use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
 
 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
 pub const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -207,10 +206,9 @@ impl Timeline {
                 let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
                 let res = self
                     .get_rel_page_at_lsn_batched(
-                        pages
-                            .iter()
-                            .map(|(tag, blknum)| (tag, blknum, ctx.attached_child())),
-                        effective_lsn,
+                        pages.iter().map(|(tag, blknum)| {
+                            (tag, blknum, effective_lsn, ctx.attached_child())
+                        }),
                         io_concurrency.clone(),
                         ctx,
                     )
@@ -248,8 +246,7 @@ impl Timeline {
     /// The ordering of the returned vec corresponds to the ordering of `pages`.
     pub(crate) async fn get_rel_page_at_lsn_batched(
         &self,
-        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, RequestContext)>,
-        effective_lsn: Lsn,
+        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
         io_concurrency: IoConcurrency,
         ctx: &RequestContext,
     ) -> Vec<Result<Bytes, PageReconstructError>> {
@@ -262,11 +259,13 @@ impl Timeline {
         let mut result = Vec::with_capacity(pages.len());
         let result_slots = result.spare_capacity_mut();
 
-        let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
-            BTreeMap::default();
+        let mut keys_slots: HashMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
+            HashMap::with_capacity(pages.len());
 
-        let mut perf_instrument = false;
-        for (response_slot_idx, (tag, blknum, ctx)) in pages.enumerate() {
+        let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
+            HashMap::with_capacity(pages.len());
+
+        for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
             if tag.relnode == 0 {
                 result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
                     RelationError::InvalidRelnode.into(),
@@ -277,14 +276,14 @@ impl Timeline {
             }
 
             let nblocks = match self
-                .get_rel_size(*tag, Version::Lsn(effective_lsn), &ctx)
+                .get_rel_size(*tag, Version::Lsn(lsn), &ctx)
                 .maybe_perf_instrument(&ctx, |crnt_perf_span| {
                     info_span!(
                         target: PERF_TRACE_TARGET,
                         parent: crnt_perf_span,
                         "GET_REL_SIZE",
                         reltag=%tag,
-                        lsn=%effective_lsn,
+                        lsn=%lsn,
                     )
                 })
                 .await
@@ -300,7 +299,7 @@ impl Timeline {
             if *blknum >= nblocks {
                 debug!(
                     "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
-                    tag, blknum, effective_lsn, nblocks
+                    tag, blknum, lsn, nblocks
                 );
                 result_slots[response_slot_idx].write(Ok(ZERO_PAGE.clone()));
                 slots_filled += 1;
@@ -309,46 +308,29 @@ impl Timeline {
 
             let key = rel_block_to_key(*tag, *blknum);
 
-            if ctx.has_perf_span() {
-                perf_instrument = true;
-            }
-
             let key_slots = keys_slots.entry(key).or_default();
             key_slots.push((response_slot_idx, ctx));
+
+            let acc = req_keyspaces.entry(lsn).or_default();
+            acc.add_key(key);
         }
 
-        let keyspace = {
-            // add_key requires monotonicity
-            let mut acc = KeySpaceAccum::new();
-            for key in keys_slots
-                .keys()
-                // in fact it requires strong monotonicity
-                .dedup()
-            {
-                acc.add_key(*key);
-            }
-            acc.to_keyspace()
-        };
-
-        let ctx = match perf_instrument {
-            true => RequestContextBuilder::from(ctx)
-                .root_perf_span(|| {
-                    info_span!(
-                        target: PERF_TRACE_TARGET,
-                        "GET_VECTORED",
-                        tenant_id = %self.tenant_shard_id.tenant_id,
-                        timeline_id = %self.timeline_id,
-                        lsn = %effective_lsn,
-                        shard = %self.tenant_shard_id.shard_slug(),
-                    )
-                })
-                .attached_child(),
-            false => ctx.attached_child(),
-        };
+        let query: Vec<(Lsn, KeySpace)> = req_keyspaces
+            .into_iter()
+            .map(|(lsn, acc)| (lsn, acc.to_keyspace()))
+            .collect();
 
+        let query = VersionedKeySpaceQuery::scattered(query);
         let res = self
-            .get_vectored(keyspace, effective_lsn, io_concurrency, &ctx)
-            .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone())
+            .get_vectored(query, io_concurrency, ctx)
+            .maybe_perf_instrument(ctx, |current_perf_span| {
+                info_span!(
+                    target: PERF_TRACE_TARGET,
+                    parent: current_perf_span,
+                    "GET_BATCH",
+                    batch_size = %page_count,
+                )
+            })
             .await;
 
         match res {
@@ -378,12 +360,12 @@ impl Timeline {
                         // There is no standardized way to express that the batched span followed from N request spans.
                         // So, abuse the system and mark the request contexts as follows_from the batch span, so we get
                         // some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for.
-                        req_ctx.perf_follows_from(&ctx);
+                        req_ctx.perf_follows_from(ctx);
                         slots_filled += 1;
                     }
 
                     result_slots[first_slot].write(res);
-                    first_req_ctx.perf_follows_from(&ctx);
+                    first_req_ctx.perf_follows_from(ctx);
                     slots_filled += 1;
                 }
             }
@@ -422,7 +404,7 @@ impl Timeline {
                         }
                     };
 
-                    req_ctx.perf_follows_from(&ctx);
+                    req_ctx.perf_follows_from(ctx);
                     result_slots[*slot].write(err);
                 }
 
@@ -661,8 +643,9 @@ impl Timeline {
 
         let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
         for batch in batches.parts {
+            let query = VersionedKeySpaceQuery::uniform(batch, lsn);
             let blocks = self
-                .get_vectored(batch, lsn, io_concurrency.clone(), ctx)
+                .get_vectored(query, io_concurrency.clone(), ctx)
                 .await?;
 
             for (_key, block) in blocks {
@@ -899,8 +882,9 @@ impl Timeline {
             );
 
             for batch in batches.parts.into_iter().rev() {
+                let query = VersionedKeySpaceQuery::uniform(batch, probe_lsn);
                 let blocks = self
-                    .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
+                    .get_vectored(query, io_concurrency.clone(), ctx)
                     .await?;
 
                 for (_key, clog_page) in blocks.into_iter().rev() {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4e3d849032..f14c7608fd 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5948,7 +5948,7 @@ mod tests {
     use timeline::InMemoryLayerTestDesc;
     #[cfg(feature = "testing")]
     use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
-    use timeline::{CompactOptions, DeltaLayerTestDesc};
+    use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery};
     use utils::id::TenantId;
 
     use super::*;
@@ -6786,10 +6786,11 @@ mod tests {
         for read in reads {
             info!("Doing vectored read on {:?}", read);
 
+            let query = VersionedKeySpaceQuery::uniform(read.clone(), reads_lsn);
+
             let vectored_res = tline
                 .get_vectored_impl(
-                    read.clone(),
-                    reads_lsn,
+                    query,
                     &mut ValuesReconstructState::new(io_concurrency.clone()),
                     &ctx,
                 )
@@ -6868,10 +6869,11 @@ mod tests {
         };
         let read_lsn = child_timeline.get_last_record_lsn();
 
+        let query = VersionedKeySpaceQuery::uniform(aux_keyspace.clone(), read_lsn);
+
         let vectored_res = child_timeline
             .get_vectored_impl(
-                aux_keyspace.clone(),
-                read_lsn,
+                query,
                 &mut ValuesReconstructState::new(io_concurrency.clone()),
                 &ctx,
             )
@@ -7017,10 +7019,12 @@ mod tests {
         let read = KeySpace {
             ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key],
         };
+
+        let query = VersionedKeySpaceQuery::uniform(read.clone(), current_lsn);
+
         let results = child_timeline
             .get_vectored_impl(
-                read.clone(),
-                current_lsn,
+                query,
                 &mut ValuesReconstructState::new(io_concurrency.clone()),
                 &ctx,
             )
@@ -7151,12 +7155,16 @@ mod tests {
         }
 
         for query_lsn in query_lsns {
+            let query = VersionedKeySpaceQuery::uniform(
+                KeySpace {
+                    ranges: vec![child_gap_at_key..child_gap_at_key.next()],
+                },
+                query_lsn,
+            );
+
             let results = child_timeline
                 .get_vectored_impl(
-                    KeySpace {
-                        ranges: vec![child_gap_at_key..child_gap_at_key.next()],
-                    },
-                    query_lsn,
+                    query,
                     &mut ValuesReconstructState::new(io_concurrency.clone()),
                     &ctx,
                 )
@@ -7655,10 +7663,11 @@ mod tests {
             }
 
             let mut cnt = 0;
+            let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn);
+
             for (key, value) in tline
                 .get_vectored_impl(
-                    keyspace.clone(),
-                    lsn,
+                    query,
                     &mut ValuesReconstructState::new(io_concurrency.clone()),
                     &ctx,
                 )
@@ -7865,8 +7874,9 @@ mod tests {
             io_concurrency: IoConcurrency,
         ) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
             let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
+            let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn);
             let res = tline
-                .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
+                .get_vectored_impl(query, &mut reconstruct_state, ctx)
                 .await?;
             Ok((res, reconstruct_state.get_delta_layers_visited() as usize))
         }
@@ -8163,13 +8173,10 @@ mod tests {
 
         // test vectored scan on parent timeline
         let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
+        let query =
+            VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn);
         let res = tline
-            .get_vectored_impl(
-                KeySpace::single(Key::metadata_key_range()),
-                lsn,
-                &mut reconstruct_state,
-                &ctx,
-            )
+            .get_vectored_impl(query, &mut reconstruct_state, &ctx)
             .await?;
 
         assert_eq!(
@@ -8189,13 +8196,10 @@ mod tests {
 
         // test vectored scan on child timeline
         let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
+        let query =
+            VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn);
         let res = child
-            .get_vectored_impl(
-                KeySpace::single(Key::metadata_key_range()),
-                lsn,
-                &mut reconstruct_state,
-                &ctx,
-            )
+            .get_vectored_impl(query, &mut reconstruct_state, &ctx)
             .await?;
 
         assert_eq!(
@@ -8229,13 +8233,9 @@ mod tests {
         let io_concurrency =
             IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
         let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
+        let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
         let mut res = tline
-            .get_vectored_impl(
-                KeySpace::single(key..key.next()),
-                lsn,
-                &mut reconstruct_state,
-                ctx,
-            )
+            .get_vectored_impl(query, &mut reconstruct_state, ctx)
             .await?;
         Ok(res.pop_last().map(|(k, v)| {
             assert_eq!(k, key);
@@ -10369,14 +10369,13 @@ mod tests {
             )
             .await?;
 
-        let keyspace = KeySpace::single(get_key(0)..get_key(10));
+        let query = VersionedKeySpaceQuery::uniform(
+            KeySpace::single(get_key(0)..get_key(10)),
+            delta_layer_end_lsn,
+        );
+
         let results = tline
-            .get_vectored(
-                keyspace,
-                delta_layer_end_lsn,
-                IoConcurrency::sequential(),
-                &ctx,
-            )
+            .get_vectored(query, IoConcurrency::sequential(), &ctx)
             .await
             .expect("No vectored errors");
         for (key, res) in results {
@@ -10524,9 +10523,13 @@ mod tests {
             )
             .await?;
 
-        let keyspace = KeySpace::single(get_key(0)..get_key(10));
+        let query = VersionedKeySpaceQuery::uniform(
+            KeySpace::single(get_key(0)..get_key(10)),
+            last_record_lsn,
+        );
+
         let results = tline
-            .get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx)
+            .get_vectored(query, IoConcurrency::sequential(), &ctx)
             .await
             .expect("No vectored errors");
         for (key, res) in results {
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 2ea0c1b979..796ad01e54 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -715,13 +715,34 @@ pub(crate) enum LayerId {
 }
 
 /// Uniquely identify a layer visit by the layer
-/// and LSN floor (or start LSN) of the reads.
-/// The layer itself is not enough since we may
-/// have different LSN lower bounds for delta layer reads.
+/// and LSN range of the reads. Note that the end of the range is exclusive.
+///
+/// The layer itself is not enough since we may have different LSN lower
+/// bounds for delta layer reads. Scenarios where this can happen are:
+///
+/// 1. Layer overlaps: imagine an image layer inside and in-memory layer
+///    and a query that only partially hits the image layer. Part of the query
+///    needs to read the whole in-memory layer and the other part needs to read
+///    only up to the image layer. Hence, they'll have different LSN floor values
+///    for the read.
+///
+/// 2. Scattered reads: the read path supports starting at different LSNs. Imagine
+///    The start LSN for one range is inside a layer and the start LSN for another range
+///    Is above the layer (includes all of it). Both ranges need to read the layer all the
+///    Way to the end but starting at different points. Hence, they'll have different LSN
+///    Ceil values.
+///
+/// The implication is that we might visit the same layer multiple times
+/// in order to read different LSN ranges from it. In practice, this isn't very concerning
+/// because:
+/// 1. Layer overlaps are rare and generally not intended
+/// 2. Scattered reads will stabilise after the first few layers provided their starting LSNs
+///    are grouped tightly enough (likely the case).
 #[derive(Debug, PartialEq, Eq, Clone, Hash)]
 struct LayerToVisitId {
     layer_id: LayerId,
     lsn_floor: Lsn,
+    lsn_ceil: Lsn,
 }
 
 #[derive(Debug, PartialEq, Eq, Hash)]
@@ -805,6 +826,7 @@ impl LayerFringe {
         let layer_to_visit_id = LayerToVisitId {
             layer_id: layer.id(),
             lsn_floor: lsn_range.start,
+            lsn_ceil: lsn_range.end,
         };
 
         let entry = self.visit_reads.entry(layer_to_visit_id.clone());
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 67a16db040..7d4eb0cd82 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -585,7 +585,7 @@ pub(crate) enum PageReconstructError {
     WalRedo(anyhow::Error),
 
     #[error("{0}")]
-    MissingKey(MissingKeyError),
+    MissingKey(Box<MissingKeyError>),
 }
 
 impl From<anyhow::Error> for PageReconstructError {
@@ -690,16 +690,23 @@ impl std::fmt::Display for ReadPath {
 
 #[derive(thiserror::Error)]
 pub struct MissingKeyError {
-    key: Key,
+    keyspace: KeySpace,
     shard: ShardNumber,
-    cont_lsn: Lsn,
-    request_lsn: Lsn,
+    query: Option<VersionedKeySpaceQuery>,
+    // This is largest request LSN from the get page request batch
+    original_hwm_lsn: Lsn,
     ancestor_lsn: Option<Lsn>,
     /// Debug information about the read path if there's an error
     read_path: Option<ReadPath>,
     backtrace: Option<std::backtrace::Backtrace>,
 }
 
+impl MissingKeyError {
+    fn enrich(&mut self, query: VersionedKeySpaceQuery) {
+        self.query = Some(query);
+    }
+}
+
 impl std::fmt::Debug for MissingKeyError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{}", self)
@@ -710,14 +717,18 @@ impl std::fmt::Display for MissingKeyError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(
             f,
-            "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}",
-            self.key, self.shard, self.cont_lsn, self.request_lsn
+            "could not find data for key {} (shard {:?}), original HWM LSN {}",
+            self.keyspace, self.shard, self.original_hwm_lsn
         )?;
 
         if let Some(ref ancestor_lsn) = self.ancestor_lsn {
             write!(f, ", ancestor {}", ancestor_lsn)?;
         }
 
+        if let Some(ref query) = self.query {
+            write!(f, ", query {}", query)?;
+        }
+
         if let Some(ref read_path) = self.read_path {
             write!(f, "\n{}", read_path)?;
         }
@@ -817,7 +828,7 @@ pub(crate) enum GetVectoredError {
     InvalidLsn(Lsn),
 
     #[error("requested key not found: {0}")]
-    MissingKey(MissingKeyError),
+    MissingKey(Box<MissingKeyError>),
 
     #[error("ancestry walk")]
     GetReadyAncestorError(#[source] GetReadyAncestorError),
@@ -928,7 +939,7 @@ impl std::fmt::Debug for Timeline {
     }
 }
 
-#[derive(thiserror::Error, Debug)]
+#[derive(thiserror::Error, Debug, Clone)]
 pub(crate) enum WaitLsnError {
     // Called on a timeline which is shutting down
     #[error("Shutdown")]
@@ -1128,14 +1139,12 @@ impl Timeline {
         // page_service.
         debug_assert!(!self.shard_identity.is_key_disposable(&key));
 
-        let keyspace = KeySpace {
-            ranges: vec![key..key.next()],
-        };
-
         let mut reconstruct_state = ValuesReconstructState::new(IoConcurrency::sequential());
 
+        let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
+
         let vectored_res = self
-            .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
+            .get_vectored_impl(query, &mut reconstruct_state, ctx)
             .await;
 
         let key_value = vectored_res?.pop_first();
@@ -1153,15 +1162,17 @@ impl Timeline {
                     value
                 }
             }
-            None => Err(PageReconstructError::MissingKey(MissingKeyError {
-                key,
-                shard: self.shard_identity.get_shard_number(&key),
-                cont_lsn: Lsn(0),
-                request_lsn: lsn,
-                ancestor_lsn: None,
-                backtrace: None,
-                read_path: None,
-            })),
+            None => Err(PageReconstructError::MissingKey(Box::new(
+                MissingKeyError {
+                    keyspace: KeySpace::single(key..key.next()),
+                    shard: self.shard_identity.get_shard_number(&key),
+                    original_hwm_lsn: lsn,
+                    ancestor_lsn: None,
+                    backtrace: None,
+                    read_path: None,
+                    query: None,
+                },
+            ))),
         }
     }
 
@@ -1174,21 +1185,18 @@ impl Timeline {
     /// which actually vectorizes the read path.
     pub(crate) async fn get_vectored(
         &self,
-        keyspace: KeySpace,
-        lsn: Lsn,
+        query: VersionedKeySpaceQuery,
         io_concurrency: super::storage_layer::IoConcurrency,
         ctx: &RequestContext,
     ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
-        if !lsn.is_valid() {
-            return Err(GetVectoredError::InvalidLsn(lsn));
-        }
+        let total_keyspace = query.total_keyspace();
 
-        let key_count = keyspace.total_raw_size().try_into().unwrap();
+        let key_count = total_keyspace.total_raw_size().try_into().unwrap();
         if key_count > Timeline::MAX_GET_VECTORED_KEYS {
             return Err(GetVectoredError::Oversized(key_count));
         }
 
-        for range in &keyspace.ranges {
+        for range in &total_keyspace.ranges {
             let mut key = range.start;
             while key != range.end {
                 assert!(!self.shard_identity.is_key_disposable(&key));
@@ -1197,9 +1205,8 @@ impl Timeline {
         }
 
         trace!(
-            "get vectored request for {:?}@{} from task kind {:?}",
-            keyspace,
-            lsn,
+            "get vectored query {} from task kind {:?}",
+            query,
             ctx.task_kind(),
         );
 
@@ -1208,12 +1215,7 @@ impl Timeline {
             .map(|metric| (metric, Instant::now()));
 
         let res = self
-            .get_vectored_impl(
-                keyspace.clone(),
-                lsn,
-                &mut ValuesReconstructState::new(io_concurrency),
-                ctx,
-            )
+            .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
             .await;
 
         if let Some((metric, start)) = start {
@@ -1264,13 +1266,10 @@ impl Timeline {
             .for_task_kind(ctx.task_kind())
             .map(ScanLatencyOngoingRecording::start_recording);
 
+        let query = VersionedKeySpaceQuery::uniform(keyspace, lsn);
+
         let vectored_res = self
-            .get_vectored_impl(
-                keyspace.clone(),
-                lsn,
-                &mut ValuesReconstructState::new(io_concurrency),
-                ctx,
-            )
+            .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
             .await;
 
         if let Some(recording) = start {
@@ -1282,16 +1281,19 @@ impl Timeline {
 
     pub(super) async fn get_vectored_impl(
         &self,
-        keyspace: KeySpace,
-        lsn: Lsn,
+        query: VersionedKeySpaceQuery,
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
         let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
-            Some(ReadPath::new(keyspace.clone(), lsn))
+            Some(ReadPath::new(
+                query.total_keyspace(),
+                query.high_watermark_lsn()?,
+            ))
         } else {
             None
         };
+
         reconstruct_state.read_path = read_path;
 
         let redo_attempt_type = if ctx.task_kind() == TaskKind::Compaction {
@@ -1311,7 +1313,7 @@ impl Timeline {
                 })
                 .attached_child();
 
-            self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, &ctx)
+            self.get_vectored_reconstruct_data(query.clone(), reconstruct_state, &ctx)
                 .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                 .await
         };
@@ -1324,6 +1326,13 @@ impl Timeline {
                 .map(|state| state.collect_pending_ios())
                 .collect::<FuturesUnordered<_>>();
             while collect_futs.next().await.is_some() {}
+
+            // Enrich the missing key error with the original query.
+            if let GetVectoredError::MissingKey(mut missing_err) = err {
+                missing_err.enrich(query.clone());
+                return Err(GetVectoredError::MissingKey(missing_err));
+            }
+
             return Err(err);
         };
 
@@ -1341,6 +1350,8 @@ impl Timeline {
 
         let futs = FuturesUnordered::new();
         for (key, state) in std::mem::take(&mut reconstruct_state.keys) {
+            let req_lsn_for_key = query.map_key_to_lsn(&key);
+
             futs.push({
                 let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
                 let ctx = RequestContextBuilder::from(&ctx)
@@ -1387,7 +1398,7 @@ impl Timeline {
 
                     let walredo_deltas = converted.num_deltas();
                     let walredo_res = walredo_self
-                        .reconstruct_value(key, lsn, converted, redo_attempt_type)
+                        .reconstruct_value(key, req_lsn_for_key, converted, redo_attempt_type)
                         .maybe_perf_instrument(&ctx, |crnt_perf_span| {
                             info_span!(
                                 target: PERF_TRACE_TARGET,
@@ -1414,15 +1425,18 @@ impl Timeline {
         // to avoid infinite results.
         if !results.is_empty() {
             if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
+                let total_keyspace = query.total_keyspace();
+                let max_request_lsn = query.high_watermark_lsn().expect("Validated previously");
+
                 static LOG_PACER: Lazy<Mutex<RateLimit>> =
                     Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
                 LOG_PACER.lock().unwrap().call(|| {
-                    let num_keys = keyspace.total_raw_size();
+                    let num_keys = total_keyspace.total_raw_size();
                     let num_pages = results.len();
                     tracing::info!(
                       shard_id = %self.tenant_shard_id.shard_slug(),
-                      lsn = %lsn,
-                      "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
+                      lsn = %max_request_lsn,
+                      "Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
                     );
                 });
             }
@@ -3940,6 +3954,154 @@ impl Timeline {
     }
 }
 
+#[derive(Clone)]
+/// Type representing a query in the ([`Lsn`], [`Key`]) space.
+/// In other words, a set of segments in a 2D space.
+///
+/// This representation has the advatange of avoiding hash map
+/// allocations for uniform queries.
+pub(crate) enum VersionedKeySpaceQuery {
+    /// Variant for queries at a single [`Lsn`]
+    Uniform { keyspace: KeySpace, lsn: Lsn },
+    /// Variant for queries at multiple [`Lsn`]s
+    Scattered {
+        keyspaces_at_lsn: Vec<(Lsn, KeySpace)>,
+    },
+}
+
+impl VersionedKeySpaceQuery {
+    pub(crate) fn uniform(keyspace: KeySpace, lsn: Lsn) -> Self {
+        Self::Uniform { keyspace, lsn }
+    }
+
+    pub(crate) fn scattered(keyspaces_at_lsn: Vec<(Lsn, KeySpace)>) -> Self {
+        Self::Scattered { keyspaces_at_lsn }
+    }
+
+    /// Returns the most recent (largest) LSN included in the query.
+    /// If any of the LSNs included in the query are invalid, returns
+    /// an error instead.
+    fn high_watermark_lsn(&self) -> Result<Lsn, GetVectoredError> {
+        match self {
+            Self::Uniform { lsn, .. } => {
+                if !lsn.is_valid() {
+                    return Err(GetVectoredError::InvalidLsn(*lsn));
+                }
+
+                Ok(*lsn)
+            }
+            Self::Scattered { keyspaces_at_lsn } => {
+                let mut max_lsn = None;
+                for (lsn, _keyspace) in keyspaces_at_lsn.iter() {
+                    if !lsn.is_valid() {
+                        return Err(GetVectoredError::InvalidLsn(*lsn));
+                    }
+                    max_lsn = std::cmp::max(max_lsn, Some(lsn));
+                }
+
+                if let Some(computed) = max_lsn {
+                    Ok(*computed)
+                } else {
+                    Err(GetVectoredError::Other(anyhow!("empty input")))
+                }
+            }
+        }
+    }
+
+    /// Returns the total keyspace being queried: the result of projecting
+    /// everything in the key dimensions onto the key axis.
+    fn total_keyspace(&self) -> KeySpace {
+        match self {
+            Self::Uniform { keyspace, .. } => keyspace.clone(),
+            Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn
+                .iter()
+                .map(|(_lsn, keyspace)| keyspace)
+                .fold(KeySpace::default(), |mut acc, v| {
+                    acc.merge(v);
+                    acc
+                }),
+        }
+    }
+
+    /// Returns LSN for a specific key.
+    ///
+    /// Invariant: requested key must be part of [`Self::total_keyspace`]
+    fn map_key_to_lsn(&self, key: &Key) -> Lsn {
+        match self {
+            Self::Uniform { lsn, .. } => *lsn,
+            Self::Scattered { keyspaces_at_lsn } => {
+                keyspaces_at_lsn
+                    .iter()
+                    .find(|(_lsn, keyspace)| keyspace.contains(key))
+                    .expect("Returned key was requested")
+                    .0
+            }
+        }
+    }
+
+    /// Remove any parts of the query (segments) which overlap with the provided
+    /// key space (also segments).
+    fn remove_overlapping_with(&mut self, to_remove: &KeySpace) -> KeySpace {
+        match self {
+            Self::Uniform { keyspace, .. } => keyspace.remove_overlapping_with(to_remove),
+            Self::Scattered { keyspaces_at_lsn } => {
+                let mut removed_accum = KeySpaceRandomAccum::new();
+                keyspaces_at_lsn.iter_mut().for_each(|(_lsn, keyspace)| {
+                    let removed = keyspace.remove_overlapping_with(to_remove);
+                    removed_accum.add_keyspace(removed);
+                });
+
+                removed_accum.to_keyspace()
+            }
+        }
+    }
+
+    fn is_empty(&self) -> bool {
+        match self {
+            Self::Uniform { keyspace, .. } => keyspace.is_empty(),
+            Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn
+                .iter()
+                .all(|(_lsn, keyspace)| keyspace.is_empty()),
+        }
+    }
+
+    /// "Lower" the query on the LSN dimension
+    fn lower(&mut self, to: Lsn) {
+        match self {
+            Self::Uniform { lsn, .. } => {
+                // If the originally requested LSN is smaller than the starting
+                // LSN of the ancestor we are descending into, we need to respect that.
+                // Hence the min.
+                *lsn = std::cmp::min(*lsn, to);
+            }
+            Self::Scattered { keyspaces_at_lsn } => {
+                keyspaces_at_lsn.iter_mut().for_each(|(lsn, _keyspace)| {
+                    *lsn = std::cmp::min(*lsn, to);
+                });
+            }
+        }
+    }
+}
+
+impl std::fmt::Display for VersionedKeySpaceQuery {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[")?;
+
+        match self {
+            VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
+                write!(f, "{keyspace} @ {lsn}")?;
+            }
+            VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => {
+                for (lsn, keyspace) in keyspaces_at_lsn.iter() {
+                    write!(f, "{keyspace} @ {lsn},")?;
+                }
+            }
+        }
+
+        write!(f, "]")
+    }
+}
+
 impl Timeline {
     #[allow(clippy::doc_lazy_continuation)]
     /// Get the data needed to reconstruct all keys in the provided keyspace
@@ -3954,16 +4116,15 @@ impl Timeline {
     /// 2.4. If the fringe is empty, go back to 1
     async fn get_vectored_reconstruct_data(
         &self,
-        mut keyspace: KeySpace,
-        request_lsn: Lsn,
+        mut query: VersionedKeySpaceQuery,
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
+        let original_hwm_lsn = query.high_watermark_lsn().unwrap();
+
         let mut timeline_owned: Arc<Timeline>;
         let mut timeline = self;
 
-        let mut cont_lsn = Lsn(request_lsn.0 + 1);
-
         let missing_keyspace = loop {
             if self.cancel.is_cancelled() {
                 return Err(GetVectoredError::Cancelled);
@@ -3980,15 +4141,14 @@ impl Timeline {
                             parent: crnt_perf_span,
                             "PLAN_IO_TIMELINE",
                             timeline = %timeline.timeline_id,
-                            lsn = %cont_lsn,
+                            high_watermark_lsn = %query.high_watermark_lsn().unwrap(),
                         )
                     })
                     .attached_child();
 
                 Self::get_vectored_reconstruct_data_timeline(
                     timeline,
-                    keyspace.clone(),
-                    cont_lsn,
+                    &query,
                     reconstruct_state,
                     &self.cancel,
                     &ctx,
@@ -3997,23 +4157,23 @@ impl Timeline {
                 .await?
             };
 
-            keyspace.remove_overlapping_with(&completed);
+            query.remove_overlapping_with(&completed);
 
             // Do not descend into the ancestor timeline for aux files.
             // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
             // stalling compaction.
-            keyspace.remove_overlapping_with(&KeySpace {
+            query.remove_overlapping_with(&KeySpace {
                 ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()],
             });
 
             // Keyspace is fully retrieved
-            if keyspace.is_empty() {
+            if query.is_empty() {
                 break None;
             }
 
             let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else {
                 // Not fully retrieved but no ancestor timeline.
-                break Some(keyspace);
+                break Some(query.total_keyspace());
             };
 
             // Now we see if there are keys covered by the image layer but does not exist in the
@@ -4024,7 +4184,7 @@ impl Timeline {
             // keys from `keyspace`, we expect there to be no overlap between it and the image covered key
             // space. If that's not the case, we had at least one key encounter a gap in the image layer
             // and stop the search as a result of that.
-            let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
+            let mut removed = query.remove_overlapping_with(&image_covered_keyspace);
             // Do not fire missing key error and end early for sparse keys. Note that we hava already removed
             // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of
             // figuring out what is the inherited key range and do a fine-grained pruning.
@@ -4034,11 +4194,11 @@ impl Timeline {
             if !removed.is_empty() {
                 break Some(removed);
             }
-            // If we reached this point, `remove_overlapping_with` should not have made any change to the
-            // keyspace.
 
-            // Take the min to avoid reconstructing a page with data newer than request Lsn.
-            cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
+            // Each key range in the original query is at some point in the LSN space.
+            // When descending into the ancestor, lower all ranges in the LSN space
+            // such that new changes on the parent timeline are not visible.
+            query.lower(timeline.ancestor_lsn);
 
             let ctx = RequestContextBuilder::from(ctx)
                 .perf_span(|crnt_perf_span| {
@@ -4047,7 +4207,6 @@ impl Timeline {
                         parent: crnt_perf_span,
                         "GET_ANCESTOR",
                         timeline = %timeline.timeline_id,
-                        lsn = %cont_lsn,
                         ancestor = %ancestor_timeline.timeline_id,
                         ancestor_lsn = %timeline.ancestor_lsn
                     )
@@ -4077,22 +4236,47 @@ impl Timeline {
         };
 
         if let Some(missing_keyspace) = missing_keyspace {
-            return Err(GetVectoredError::MissingKey(MissingKeyError {
-                key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
-                shard: self
-                    .shard_identity
-                    .get_shard_number(&missing_keyspace.start().unwrap()),
-                cont_lsn,
-                request_lsn,
+            return Err(GetVectoredError::MissingKey(Box::new(MissingKeyError {
+                keyspace: missing_keyspace, /* better if we can store the full keyspace */
+                shard: self.shard_identity.number,
+                original_hwm_lsn,
                 ancestor_lsn: Some(timeline.ancestor_lsn),
                 backtrace: None,
                 read_path: std::mem::take(&mut reconstruct_state.read_path),
-            }));
+                query: None,
+            })));
         }
 
         Ok(())
     }
 
+    async fn get_vectored_init_fringe(
+        &self,
+        query: &VersionedKeySpaceQuery,
+    ) -> Result<LayerFringe, GetVectoredError> {
+        let mut fringe = LayerFringe::new();
+        let guard = self.layers.read().await;
+
+        match query {
+            VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
+                // LSNs requested by the compute or determined by the pageserver
+                // are inclusive. Queries to the layer map use exclusive LSNs.
+                // Hence, bump the value before the query - same in the other
+                // match arm.
+                let cont_lsn = Lsn(lsn.0 + 1);
+                guard.update_search_fringe(keyspace, cont_lsn, &mut fringe)?;
+            }
+            VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => {
+                for (lsn, keyspace) in keyspaces_at_lsn.iter() {
+                    let cont_lsn_for_keyspace = Lsn(lsn.0 + 1);
+                    guard.update_search_fringe(keyspace, cont_lsn_for_keyspace, &mut fringe)?;
+                }
+            }
+        }
+
+        Ok(fringe)
+    }
+
     /// Collect the reconstruct data for a keyspace from the specified timeline.
     ///
     /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
@@ -4111,8 +4295,7 @@ impl Timeline {
     /// decides how to deal with these two keyspaces.
     async fn get_vectored_reconstruct_data_timeline(
         timeline: &Timeline,
-        keyspace: KeySpace,
-        mut cont_lsn: Lsn,
+        query: &VersionedKeySpaceQuery,
         reconstruct_state: &mut ValuesReconstructState,
         cancel: &CancellationToken,
         ctx: &RequestContext,
@@ -4128,14 +4311,7 @@ impl Timeline {
         let _guard = timeline.gc_compaction_layer_update_lock.read().await;
 
         // Initialize the fringe
-        let mut fringe = {
-            let mut fringe = LayerFringe::new();
-
-            let guard = timeline.layers.read().await;
-            guard.update_search_fringe(&keyspace, cont_lsn, &mut fringe)?;
-
-            fringe
-        };
+        let mut fringe = timeline.get_vectored_init_fringe(query).await?;
 
         let mut completed_keyspace = KeySpace::default();
         let mut image_covered_keyspace = KeySpaceRandomAccum::new();
@@ -4161,7 +4337,7 @@ impl Timeline {
                 .await?;
 
             let mut unmapped_keyspace = keyspace_to_read;
-            cont_lsn = next_cont_lsn;
+            let cont_lsn = next_cont_lsn;
 
             reconstruct_state.on_layer_visited(&layer_to_read);
 
@@ -4996,13 +5172,11 @@ impl Timeline {
                 if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
                     || (last_key_in_range && key_request_accum.raw_size() > 0)
                 {
+                    let query =
+                        VersionedKeySpaceQuery::uniform(key_request_accum.consume_keyspace(), lsn);
+
                     let results = self
-                        .get_vectored(
-                            key_request_accum.consume_keyspace(),
-                            lsn,
-                            io_concurrency.clone(),
-                            ctx,
-                        )
+                        .get_vectored(query, io_concurrency.clone(), ctx)
                         .await?;
 
                     if self.cancel.is_cancelled() {
@@ -5091,7 +5265,11 @@ impl Timeline {
         // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
         // not contain too many keys, otherwise this takes a lot of memory.
         let data = self
-            .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
+            .get_vectored_impl(
+                VersionedKeySpaceQuery::uniform(partition.clone(), lsn),
+                &mut reconstruct_state,
+                ctx,
+            )
             .await?;
         let (data, total_kb_retrieved, total_keys_retrieved) = {
             let mut new_data = BTreeMap::new();
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 1b0d22dc82..7c61f32d1e 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -30,6 +30,7 @@ use crate::tenant::storage_layer::{
     AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer,
     ValuesReconstructState,
 };
+use crate::tenant::timeline::VersionedKeySpaceQuery;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 
 #[derive(Debug, thiserror::Error)]
@@ -212,13 +213,9 @@ async fn generate_tombstone_image_layer(
         }
     }
 
+    let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key_range.clone()), image_lsn);
     let data = ancestor
-        .get_vectored_impl(
-            KeySpace::single(key_range.clone()),
-            image_lsn,
-            &mut reconstruct_state,
-            ctx,
-        )
+        .get_vectored_impl(query, &mut reconstruct_state, ctx)
         .await
         .context("failed to retrieve aux keys")
         .map_err(|e| Error::launder(e, Error::Prepare))?;
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ba2101e427..c846c0950e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1255,6 +1255,7 @@ class NeonEnv:
                 "mode": "pipelined",
                 "execution": "concurrent-futures",
                 "max_batch_size": 32,
+                "batching": "scattered-lsn",
             }
 
             get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io
@@ -1321,6 +1322,10 @@ class NeonEnv:
                 log.info("test may use old binaries, ignoring warnings about unknown config items")
                 ps.allowed_errors.append(".*ignoring unknown configuration item.*")
 
+                # Allow old software to start until https://github.com/neondatabase/neon/pull/11275
+                # lands in the compatiblity snapshot.
+                ps_cfg["page_service_pipelining"].pop("batching")
+
             self.pageservers.append(ps)
             cfg["pageservers"].append(ps_cfg)
 
diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py
index 5169add6cb..520a019cf5 100644
--- a/test_runner/performance/pageserver/test_page_service_batching.py
+++ b/test_runner/performance/pageserver/test_page_service_batching.py
@@ -31,20 +31,28 @@ class PageServicePipeliningConfigSerial(PageServicePipeliningConfig):
 class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig):
     max_batch_size: int
     execution: str
+    batching: str
     mode: str = "pipelined"
 
 
 EXECUTION = ["concurrent-futures"]
+BATCHING = ["uniform-lsn", "scattered-lsn"]
 
 NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
 for max_batch_size in [1, 32]:
     for execution in EXECUTION:
-        NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
+        for batching in BATCHING:
+            NON_BATCHABLE.append(
+                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
+            )
 
 BATCHABLE: list[PageServicePipeliningConfig] = []
 for max_batch_size in [32]:
     for execution in EXECUTION:
-        BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
+        for batching in BATCHING:
+            BATCHABLE.append(
+                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
+            )
 
 
 @pytest.mark.parametrize(
@@ -300,7 +308,10 @@ def test_throughput(
 PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
 for max_batch_size in [1, 32]:
     for execution in EXECUTION:
-        PRECISION_CONFIGS.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
+        for batching in BATCHING:
+            PRECISION_CONFIGS.append(
+                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
+            )
 
 
 @pytest.mark.parametrize(
diff --git a/test_runner/regress/test_page_service_batching_regressions.py b/test_runner/regress/test_page_service_batching_regressions.py
index fa85e1210b..50303a4986 100644
--- a/test_runner/regress/test_page_service_batching_regressions.py
+++ b/test_runner/regress/test_page_service_batching_regressions.py
@@ -16,6 +16,7 @@ def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path, kind:
             "mode": "pipelined",
             "max_batch_size": 32,
             "execution": "concurrent-futures",
+            "batching": "uniform-lsn",
         }
 
     neon_env_builder.pageserver_config_override = patch_pageserver_toml

From 307fa2ceb7ed68c39e30312c45848cda90225a84 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Mon, 14 Apr 2025 12:45:13 +0300
Subject: [PATCH 123/140] Remove unused n_synced variable from
 HandleSafekeeperResponse (#11553)

## Problem

clang produce warning about unused variable `n_synced` in
HandleSafekeeperResponse

## Summary of changes

Remove local variable.

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/walproposer.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 6b133e4dc4..b95b1451e4 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -2118,9 +2118,6 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 	 */
 	if (wp->config->syncSafekeepers)
 	{
-		int			n_synced;
-
-		n_synced = 0;
 		for (int i = 0; i < wp->n_safekeepers; i++)
 		{
 			Safekeeper *sk = &wp->safekeeper[i];
@@ -2129,8 +2126,6 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 			/* alive safekeeper which is not synced yet; wait for it */
 			if (sk->state != SS_OFFLINE && !synced)
 				return;
-			if (synced)
-				n_synced++;
 		}
 
 		if (newCommitLsn >= wp->propTermStartLsn)

From e0ee6fbeffac246b240ed3e4ec9082773edc7a1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 14 Apr 2025 12:36:40 +0200
Subject: [PATCH 124/140] Remove deprecated --compute-hook-url storcon param
 (#11551)

We have already migrated the storage controller to
`--control-plane-url`, added in #11173. The new param was added to
support also safekeeper specific endpoints. See the docs changes in
#11195 for further details.

Part of #11163
---
 storage_controller/src/compute_hook.rs | 16 +++++++---------
 storage_controller/src/main.rs         | 11 ++---------
 storage_controller/src/service.rs      |  8 --------
 3 files changed, 9 insertions(+), 26 deletions(-)

diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs
index 2311cadb36..57709302e1 100644
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -629,15 +629,13 @@ impl ComputeHook {
         };
 
         let result = if !self.config.use_local_compute_notifications {
-            let compute_hook_url = if let Some(control_plane_url) = &self.config.control_plane_url {
-                Some(if control_plane_url.ends_with('/') {
-                    format!("{control_plane_url}notify-attach")
-                } else {
-                    format!("{control_plane_url}/notify-attach")
-                })
-            } else {
-                self.config.compute_hook_url.clone()
-            };
+            let compute_hook_url =
+                self.config
+                    .control_plane_url
+                    .as_ref()
+                    .map(|control_plane_url| {
+                        format!("{}/notify-attach", control_plane_url.trim_end_matches('/'))
+                    });
 
             // We validate this at startup
             let notify_url = compute_hook_url.as_ref().unwrap();
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 9358c9da4d..a924e5b6c5 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -86,10 +86,6 @@ struct Cli {
     #[arg(long)]
     peer_jwt_token: Option<String>,
 
-    /// URL to control plane compute notification endpoint
-    #[arg(long)]
-    compute_hook_url: Option<String>,
-
     /// URL to control plane storage API prefix
     #[arg(long)]
     control_plane_url: Option<String>,
@@ -360,13 +356,11 @@ async fn async_main() -> anyhow::Result<()> {
                 "Insecure config!  One or more secrets is not set.  This is only permitted in `--dev` mode"
             );
         }
-        StrictMode::Strict
-            if args.compute_hook_url.is_none() && args.control_plane_url.is_none() =>
-        {
+        StrictMode::Strict if args.control_plane_url.is_none() => {
             // Production systems should always have a control plane URL set, to prevent falling
             // back to trying to use neon_local.
             anyhow::bail!(
-                "neither `--compute-hook-url` nor `--control-plane-url` are set: this is only permitted in `--dev` mode"
+                "`--control-plane-url` is not set: this is only permitted in `--dev` mode"
             );
         }
         StrictMode::Strict if args.use_local_compute_notifications => {
@@ -394,7 +388,6 @@ async fn async_main() -> anyhow::Result<()> {
         safekeeper_jwt_token: secrets.safekeeper_jwt_token,
         control_plane_jwt_token: secrets.control_plane_jwt_token,
         peer_jwt_token: secrets.peer_jwt_token,
-        compute_hook_url: args.compute_hook_url,
         control_plane_url: args.control_plane_url,
         max_offline_interval: args
             .max_offline_interval
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 0982e56155..a021313474 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -357,18 +357,10 @@ pub struct Config {
     // This JWT token will be used to authenticate with other storage controller instances
     pub peer_jwt_token: Option<String>,
 
-    /// Where the compute hook should send notifications of pageserver attachment locations
-    /// (this URL points to the control plane in prod). If this is None, the compute hook will
-    /// assume it is running in a test environment and try to update neon_local.
-    pub compute_hook_url: Option<String>,
-
     /// Prefix for storage API endpoints of the control plane. We use this prefix to compute
     /// URLs that we use to send pageserver and safekeeper attachment locations.
     /// If this is None, the compute hook will assume it is running in a test environment
     /// and try to invoke neon_local instead.
-    ///
-    /// For now, there is also `compute_hook_url` which allows configuration of the pageserver
-    /// specific endpoint, but it is in the process of being phased out.
     pub control_plane_url: Option<String>,
 
     /// Grace period within which a pageserver does not respond to heartbeats, but is still

From daebe50e19bc67c14d019073afd5d6cd433e8246 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 14 Apr 2025 13:51:01 +0200
Subject: [PATCH 125/140] refactor: plumb gate and cancellation down to to
 blob_io::BlobWriter (#11543)

In #10063 we will switch BlobWriter to use the owned buffers IO buffered
writer, which implements double-buffering by virtue of a background task
that performs the flushing.

That task's lifecylce must be contained within the Timeline lifecycle,
so, it must hold the timeline gate open and respect Timeline::cancel.

This PR does the noisy plumbing to reduce the #10063 diff.

Refs
- extracted from https://github.com/neondatabase/neon/pull/10063
- epic https://github.com/neondatabase/neon/issues/9868
---
 pageserver/benches/bench_ingest.rs            |  2 +-
 pageserver/src/tenant/blob_io.rs              | 13 +++++-
 .../storage_layer/batch_split_writer.rs       | 44 +++++++++++++++++--
 .../src/tenant/storage_layer/delta_layer.rs   | 17 ++++++-
 .../src/tenant/storage_layer/image_layer.rs   | 28 ++++++++++--
 .../tenant/storage_layer/inmemory_layer.rs    |  4 ++
 pageserver/src/tenant/timeline.rs             | 14 +++++-
 pageserver/src/tenant/timeline/compaction.rs  | 20 ++++++++-
 .../src/tenant/timeline/detach_ancestor.rs    |  4 ++
 .../src/tenant/timeline/import_pgdata/flow.rs |  2 +
 10 files changed, 134 insertions(+), 14 deletions(-)

diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index 000938b189..3108b5351f 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -126,7 +126,7 @@ async fn ingest(
             max_concurrency: NonZeroUsize::new(1).unwrap(),
         });
         let (_desc, path) = layer
-            .write_to_disk(&ctx, None, l0_flush_state.inner())
+            .write_to_disk(&ctx, None, l0_flush_state.inner(), &gate, cancel.clone())
             .await?
             .unwrap();
         tokio::fs::remove_file(path).await?;
diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs
index b0b2a16c2f..abeaa166a4 100644
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -22,6 +22,7 @@ use bytes::{BufMut, BytesMut};
 use pageserver_api::models::ImageCompressionAlgorithm;
 use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
+use tokio_util::sync::CancellationToken;
 use tracing::warn;
 
 use crate::context::RequestContext;
@@ -169,7 +170,13 @@ pub struct BlobWriter<const BUFFERED: bool> {
 }
 
 impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
-    pub fn new(inner: VirtualFile, start_offset: u64) -> Self {
+    pub fn new(
+        inner: VirtualFile,
+        start_offset: u64,
+        _gate: &utils::sync::gate::Gate,
+        _cancel: CancellationToken,
+        _ctx: &RequestContext,
+    ) -> Self {
         Self {
             inner,
             offset: start_offset,
@@ -432,12 +439,14 @@ pub(crate) mod tests {
     ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
         let temp_dir = camino_tempfile::tempdir()?;
         let pathbuf = temp_dir.path().join("file");
+        let gate = utils::sync::gate::Gate::default();
+        let cancel = CancellationToken::new();
 
         // Write part (in block to drop the file)
         let mut offsets = Vec::new();
         {
             let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
-            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
+            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0, &gate, cancel.clone(), ctx);
             for blob in blobs.iter() {
                 let (_, res) = if compression {
                     let res = wtr
diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
index 29ada15c36..39cd02d101 100644
--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -5,6 +5,7 @@ use std::sync::Arc;
 use bytes::Bytes;
 use pageserver_api::key::{KEY_SIZE, Key};
 use pageserver_api::value::Value;
+use tokio_util::sync::CancellationToken;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
 use utils::shard::TenantShardId;
@@ -179,7 +180,7 @@ impl BatchLayerWriter {
 
 /// An image writer that takes images and produces multiple image layers.
 #[must_use]
-pub struct SplitImageLayerWriter {
+pub struct SplitImageLayerWriter<'a> {
     inner: ImageLayerWriter,
     target_layer_size: u64,
     lsn: Lsn,
@@ -188,9 +189,12 @@ pub struct SplitImageLayerWriter {
     tenant_shard_id: TenantShardId,
     batches: BatchLayerWriter,
     start_key: Key,
+    gate: &'a utils::sync::gate::Gate,
+    cancel: CancellationToken,
 }
 
-impl SplitImageLayerWriter {
+impl<'a> SplitImageLayerWriter<'a> {
+    #[allow(clippy::too_many_arguments)]
     pub async fn new(
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
@@ -198,6 +202,8 @@ impl SplitImageLayerWriter {
         start_key: Key,
         lsn: Lsn,
         target_layer_size: u64,
+        gate: &'a utils::sync::gate::Gate,
+        cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         Ok(Self {
@@ -208,6 +214,8 @@ impl SplitImageLayerWriter {
                 tenant_shard_id,
                 &(start_key..Key::MAX),
                 lsn,
+                gate,
+                cancel.clone(),
                 ctx,
             )
             .await?,
@@ -217,6 +225,8 @@ impl SplitImageLayerWriter {
             batches: BatchLayerWriter::new(conf).await?,
             lsn,
             start_key,
+            gate,
+            cancel,
         })
     }
 
@@ -239,6 +249,8 @@ impl SplitImageLayerWriter {
                 self.tenant_shard_id,
                 &(key..Key::MAX),
                 self.lsn,
+                self.gate,
+                self.cancel.clone(),
                 ctx,
             )
             .await?;
@@ -291,7 +303,7 @@ impl SplitImageLayerWriter {
 /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
 /// will split them into multiple files based on size.
 #[must_use]
-pub struct SplitDeltaLayerWriter {
+pub struct SplitDeltaLayerWriter<'a> {
     inner: Option<(Key, DeltaLayerWriter)>,
     target_layer_size: u64,
     conf: &'static PageServerConf,
@@ -300,15 +312,19 @@ pub struct SplitDeltaLayerWriter {
     lsn_range: Range<Lsn>,
     last_key_written: Key,
     batches: BatchLayerWriter,
+    gate: &'a utils::sync::gate::Gate,
+    cancel: CancellationToken,
 }
 
-impl SplitDeltaLayerWriter {
+impl<'a> SplitDeltaLayerWriter<'a> {
     pub async fn new(
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         lsn_range: Range<Lsn>,
         target_layer_size: u64,
+        gate: &'a utils::sync::gate::Gate,
+        cancel: CancellationToken,
     ) -> anyhow::Result<Self> {
         Ok(Self {
             target_layer_size,
@@ -319,6 +335,8 @@ impl SplitDeltaLayerWriter {
             lsn_range,
             last_key_written: Key::MIN,
             batches: BatchLayerWriter::new(conf).await?,
+            gate,
+            cancel,
         })
     }
 
@@ -344,6 +362,8 @@ impl SplitDeltaLayerWriter {
                     self.tenant_shard_id,
                     key,
                     self.lsn_range.clone(),
+                    self.gate,
+                    self.cancel.clone(),
                     ctx,
                 )
                 .await?,
@@ -362,6 +382,8 @@ impl SplitDeltaLayerWriter {
                     self.tenant_shard_id,
                     key,
                     self.lsn_range.clone(),
+                    self.gate,
+                    self.cancel.clone(),
                     ctx,
                 )
                 .await?;
@@ -469,6 +491,8 @@ mod tests {
             get_key(0),
             Lsn(0x18),
             4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
             &ctx,
         )
         .await
@@ -480,6 +504,8 @@ mod tests {
             tenant.tenant_shard_id,
             Lsn(0x18)..Lsn(0x20),
             4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
         )
         .await
         .unwrap();
@@ -546,6 +572,8 @@ mod tests {
             get_key(0),
             Lsn(0x18),
             4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
             &ctx,
         )
         .await
@@ -556,6 +584,8 @@ mod tests {
             tenant.tenant_shard_id,
             Lsn(0x18)..Lsn(0x20),
             4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
         )
         .await
         .unwrap();
@@ -643,6 +673,8 @@ mod tests {
             get_key(0),
             Lsn(0x18),
             4 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
             &ctx,
         )
         .await
@@ -654,6 +686,8 @@ mod tests {
             tenant.tenant_shard_id,
             Lsn(0x18)..Lsn(0x20),
             4 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
         )
         .await
         .unwrap();
@@ -730,6 +764,8 @@ mod tests {
             tenant.tenant_shard_id,
             Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
             4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
         )
         .await
         .unwrap();
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 05b0bc1a5c..4417b8aa51 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -50,6 +50,7 @@ use rand::distributions::Alphanumeric;
 use serde::{Deserialize, Serialize};
 use tokio::sync::OnceCell;
 use tokio_epoll_uring::IoBuf;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::bin_ser::BeSer;
 use utils::id::{TenantId, TimelineId};
@@ -400,12 +401,15 @@ impl DeltaLayerWriterInner {
     ///
     /// Start building a new delta layer.
     ///
+    #[allow(clippy::too_many_arguments)]
     async fn new(
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         key_start: Key,
         lsn_range: Range<Lsn>,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         // Create the file initially with a temporary filename. We don't know
@@ -420,7 +424,7 @@ impl DeltaLayerWriterInner {
         let mut file = VirtualFile::create(&path, ctx).await?;
         // make room for the header block
         file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
-        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
+        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);
 
         // Initialize the b-tree index builder
         let block_buf = BlockBuf::new();
@@ -628,12 +632,15 @@ impl DeltaLayerWriter {
     ///
     /// Start building a new delta layer.
     ///
+    #[allow(clippy::too_many_arguments)]
     pub async fn new(
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         key_start: Key,
         lsn_range: Range<Lsn>,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         Ok(Self {
@@ -644,6 +651,8 @@ impl DeltaLayerWriter {
                     tenant_shard_id,
                     key_start,
                     lsn_range,
+                    gate,
+                    cancel,
                     ctx,
                 )
                 .await?,
@@ -1885,6 +1894,8 @@ pub(crate) mod test {
             harness.tenant_shard_id,
             entries_meta.key_range.start,
             entries_meta.lsn_range.clone(),
+            &timeline.gate,
+            timeline.cancel.clone(),
             &ctx,
         )
         .await?;
@@ -2079,6 +2090,8 @@ pub(crate) mod test {
                 tenant.tenant_shard_id,
                 Key::MIN,
                 Lsn(0x11)..truncate_at,
+                &branch.gate,
+                branch.cancel.clone(),
                 ctx,
             )
             .await
@@ -2213,6 +2226,8 @@ pub(crate) mod test {
             tenant.tenant_shard_id,
             *key_start,
             (*lsn_min)..lsn_end,
+            &tline.gate,
+            tline.cancel.clone(),
             ctx,
         )
         .await?;
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 72992e5031..3744d615f2 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -48,6 +48,7 @@ use rand::distributions::Alphanumeric;
 use serde::{Deserialize, Serialize};
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::bin_ser::BeSer;
 use utils::id::{TenantId, TimelineId};
@@ -748,12 +749,15 @@ impl ImageLayerWriterInner {
     ///
     /// Start building a new image layer.
     ///
+    #[allow(clippy::too_many_arguments)]
     async fn new(
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         key_range: &Range<Key>,
         lsn: Lsn,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         // Create the file initially with a temporary filename.
@@ -780,7 +784,7 @@ impl ImageLayerWriterInner {
         };
         // make room for the header block
         file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
-        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
+        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);
 
         // Initialize the b-tree index builder
         let block_buf = BlockBuf::new();
@@ -988,18 +992,30 @@ impl ImageLayerWriter {
     ///
     /// Start building a new image layer.
     ///
+    #[allow(clippy::too_many_arguments)]
     pub async fn new(
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         key_range: &Range<Key>,
         lsn: Lsn,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<ImageLayerWriter> {
         Ok(Self {
             inner: Some(
-                ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx)
-                    .await?,
+                ImageLayerWriterInner::new(
+                    conf,
+                    timeline_id,
+                    tenant_shard_id,
+                    key_range,
+                    lsn,
+                    gate,
+                    cancel,
+                    ctx,
+                )
+                .await?,
             ),
         })
     }
@@ -1203,6 +1219,8 @@ mod test {
                 harness.tenant_shard_id,
                 &range,
                 lsn,
+                &timeline.gate,
+                timeline.cancel.clone(),
                 &ctx,
             )
             .await
@@ -1268,6 +1286,8 @@ mod test {
                 harness.tenant_shard_id,
                 &range,
                 lsn,
+                &timeline.gate,
+                timeline.cancel.clone(),
                 &ctx,
             )
             .await
@@ -1346,6 +1366,8 @@ mod test {
             tenant.tenant_shard_id,
             &key_range,
             lsn,
+            &tline.gate,
+            tline.cancel.clone(),
             ctx,
         )
         .await?;
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 388ed3201c..5d558e66cc 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -719,6 +719,8 @@ impl InMemoryLayer {
         ctx: &RequestContext,
         key_range: Option<Range<Key>>,
         l0_flush_global_state: &l0_flush::Inner,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
     ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
         // Grab the lock in read-mode. We hold it over the I/O, but because this
         // layer is not writeable anymore, no one should be trying to acquire the
@@ -759,6 +761,8 @@ impl InMemoryLayer {
             self.tenant_shard_id,
             Key::MIN,
             self.start_lsn..end_lsn,
+            gate,
+            cancel,
             ctx,
         )
         .await?;
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 7d4eb0cd82..204bdb5eee 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4986,7 +4986,13 @@ impl Timeline {
         let ctx = ctx.attached_child();
         let work = async move {
             let Some((desc, path)) = frozen_layer
-                .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner())
+                .write_to_disk(
+                    &ctx,
+                    key_range,
+                    self_clone.l0_flush_global_state.inner(),
+                    &self_clone.gate,
+                    self_clone.cancel.clone(),
+                )
                 .await?
             else {
                 return Ok(None);
@@ -5526,6 +5532,8 @@ impl Timeline {
                 self.tenant_shard_id,
                 &img_range,
                 lsn,
+                &self.gate,
+                self.cancel.clone(),
                 ctx,
             )
             .await?;
@@ -6890,6 +6898,8 @@ impl Timeline {
             self.tenant_shard_id,
             &(min_key..end_key),
             lsn,
+            &self.gate,
+            self.cancel.clone(),
             ctx,
         )
         .await?;
@@ -6951,6 +6961,8 @@ impl Timeline {
             self.tenant_shard_id,
             deltas.key_range.start,
             deltas.lsn_range,
+            &self.gate,
+            self.cancel.clone(),
             ctx,
         )
         .await?;
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index e3aa5045bb..91cc8ca10c 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -749,8 +749,8 @@ impl KeyHistoryRetention {
     async fn pipe_to(
         self,
         key: Key,
-        delta_writer: &mut SplitDeltaLayerWriter,
-        mut image_writer: Option<&mut SplitImageLayerWriter>,
+        delta_writer: &mut SplitDeltaLayerWriter<'_>,
+        mut image_writer: Option<&mut SplitImageLayerWriter<'_>>,
         stat: &mut CompactionStatistics,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
@@ -1394,6 +1394,8 @@ impl Timeline {
                 self.tenant_shard_id,
                 &layer.layer_desc().key_range,
                 layer.layer_desc().image_layer_lsn(),
+                &self.gate,
+                self.cancel.clone(),
                 ctx,
             )
             .await
@@ -2033,6 +2035,8 @@ impl Timeline {
                                 debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
                                 lsn_range.clone()
                             },
+                            &self.gate,
+                            self.cancel.clone(),
                             ctx,
                         )
                         .await
@@ -3232,6 +3236,8 @@ impl Timeline {
                     job_desc.compaction_key_range.start,
                     lowest_retain_lsn,
                     self.get_compaction_target_size(),
+                    &self.gate,
+                    self.cancel.clone(),
                     ctx,
                 )
                 .await
@@ -3248,6 +3254,8 @@ impl Timeline {
             self.tenant_shard_id,
             lowest_retain_lsn..end_lsn,
             self.get_compaction_target_size(),
+            &self.gate,
+            self.cancel.clone(),
         )
         .await
         .context("failed to create delta layer writer")
@@ -3344,6 +3352,8 @@ impl Timeline {
                                 self.tenant_shard_id,
                                 desc.key_range.start,
                                 desc.lsn_range.clone(),
+                                &self.gate,
+                                self.cancel.clone(),
                                 ctx,
                             )
                             .await
@@ -3361,6 +3371,8 @@ impl Timeline {
                                 self.tenant_shard_id,
                                 job_desc.compaction_key_range.end,
                                 desc.lsn_range.clone(),
+                                &self.gate,
+                                self.cancel.clone(),
                                 ctx,
                             )
                             .await
@@ -3932,6 +3944,8 @@ impl CompactionJobExecutor for TimelineAdaptor {
             self.timeline.tenant_shard_id,
             key_range.start,
             lsn_range.clone(),
+            &self.timeline.gate,
+            self.timeline.cancel.clone(),
             ctx,
         )
         .await?;
@@ -4007,6 +4021,8 @@ impl TimelineAdaptor {
             self.timeline.tenant_shard_id,
             key_range,
             lsn,
+            &self.timeline.gate,
+            self.timeline.cancel.clone(),
             ctx,
         )
         .await?;
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 7c61f32d1e..a841cc55f0 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -228,6 +228,8 @@ async fn generate_tombstone_image_layer(
             detached.tenant_shard_id,
             &key_range,
             image_lsn,
+            &detached.gate,
+            detached.cancel.clone(),
             ctx,
         )
         .await
@@ -776,6 +778,8 @@ async fn copy_lsn_prefix(
         target_timeline.tenant_shard_id,
         layer.layer_desc().key_range.start,
         layer.layer_desc().lsn_range.start..end_lsn,
+        &target_timeline.gate,
+        target_timeline.cancel.clone(),
         ctx,
     )
     .await
diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
index 3ef82b3658..c6d2944769 100644
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -738,6 +738,8 @@ impl ChunkProcessingJob {
             self.timeline.tenant_shard_id,
             &self.range,
             self.pgdata_lsn,
+            &self.timeline.gate,
+            self.timeline.cancel.clone(),
             ctx,
         )
         .await?;

From 148b3701cf5ca48d9e0242074ca0950622b1f1f2 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 14 Apr 2025 14:24:47 +0100
Subject: [PATCH 126/140] pageserver: add metrics for get page batch breaking
 reasons (#11545)

## Problem

https://github.com/neondatabase/neon/pull/11494 changes the batching
logic, but we don't have a way to evaluate it.

## Summary of changes

This PR introduces a global and per timeline metric which tracks the
reason for
which a batch was broken.
---
 pageserver/src/metrics.rs                     |  97 +++++-
 pageserver/src/page_service.rs                | 290 +++++++++++-------
 test_runner/fixtures/metrics.py               |   1 +
 .../pageserver/test_page_service_batching.py  |  55 ++--
 4 files changed, 309 insertions(+), 134 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 59bb3410f9..2a779b0daa 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1714,6 +1714,28 @@ pub enum SmgrQueryType {
     Test,
 }
 
+#[derive(
+    Debug,
+    Clone,
+    Copy,
+    IntoStaticStr,
+    strum_macros::EnumCount,
+    strum_macros::EnumIter,
+    strum_macros::FromRepr,
+    enum_map::Enum,
+)]
+#[strum(serialize_all = "snake_case")]
+pub enum GetPageBatchBreakReason {
+    BatchFull,
+    NonBatchableRequest,
+    NonUniformLsn,
+    SamePageAtDifferentLsn,
+    NonUniformTimeline,
+    ExecutorSteal,
+    #[cfg(feature = "testing")]
+    NonUniformKey,
+}
+
 pub(crate) struct SmgrQueryTimePerTimeline {
     global_started: [IntCounter; SmgrQueryType::COUNT],
     global_latency: [Histogram; SmgrQueryType::COUNT],
@@ -1725,6 +1747,8 @@ pub(crate) struct SmgrQueryTimePerTimeline {
     per_timeline_flush_in_progress_micros: IntCounter,
     global_batch_wait_time: Histogram,
     per_timeline_batch_wait_time: Histogram,
+    global_batch_break_reason: [IntCounter; GetPageBatchBreakReason::COUNT],
+    per_timeline_batch_break_reason: GetPageBatchBreakReasonTimelineMetrics,
     throttling: Arc<tenant_throttling::Pagestream>,
 }
 
@@ -1858,6 +1882,49 @@ static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::n
     .expect("failed to define a metric")
 });
 
+static PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        // it's a counter, but, name is prepared to extend it to a histogram of queue depth
+        "pageserver_page_service_batch_break_reason_global",
+        "Reason for breaking batches of get page requests",
+        &["reason"],
+    )
+    .expect("failed to define a metric")
+});
+
+struct GetPageBatchBreakReasonTimelineMetrics {
+    map: EnumMap<GetPageBatchBreakReason, IntCounter>,
+}
+
+impl GetPageBatchBreakReasonTimelineMetrics {
+    fn new(tenant_id: &str, shard_slug: &str, timeline_id: &str) -> Self {
+        GetPageBatchBreakReasonTimelineMetrics {
+            map: EnumMap::from_array(std::array::from_fn(|reason_idx| {
+                let reason = GetPageBatchBreakReason::from_usize(reason_idx);
+                PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.with_label_values(&[
+                    tenant_id,
+                    shard_slug,
+                    timeline_id,
+                    reason.into(),
+                ])
+            })),
+        }
+    }
+
+    fn inc(&self, reason: GetPageBatchBreakReason) {
+        self.map[reason].inc()
+    }
+}
+
+static PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_page_service_batch_break_reason",
+        "Reason for breaking batches of get page requests",
+        &["tenant_id", "shard_id", "timeline_id", "reason"],
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_page_service_config_max_batch_size",
@@ -1985,6 +2052,15 @@ impl SmgrQueryTimePerTimeline {
             .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
             .unwrap();
 
+        let global_batch_break_reason = std::array::from_fn(|i| {
+            let reason = GetPageBatchBreakReason::from_usize(i);
+            PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL
+                .get_metric_with_label_values(&[reason.into()])
+                .unwrap()
+        });
+        let per_timeline_batch_break_reason =
+            GetPageBatchBreakReasonTimelineMetrics::new(&tenant_id, &shard_slug, &timeline_id);
+
         let global_flush_in_progress_micros =
             PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
         let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
@@ -2002,6 +2078,8 @@ impl SmgrQueryTimePerTimeline {
             per_timeline_flush_in_progress_micros,
             global_batch_wait_time,
             per_timeline_batch_wait_time,
+            global_batch_break_reason,
+            per_timeline_batch_break_reason,
             throttling: pagestream_throttle_metrics,
         }
     }
@@ -2030,9 +2108,16 @@ impl SmgrQueryTimePerTimeline {
     }
 
     /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
-    pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
+    pub(crate) fn observe_getpage_batch_start(
+        &self,
+        batch_size: usize,
+        break_reason: GetPageBatchBreakReason,
+    ) {
         self.global_batch_size.observe(batch_size as f64);
         self.per_timeline_batch_size.observe(batch_size as f64);
+
+        self.global_batch_break_reason[break_reason.into_usize()].inc();
+        self.per_timeline_batch_break_reason.inc(break_reason);
     }
 }
 
@@ -3398,6 +3483,15 @@ impl TimelineMetrics {
             shard_id,
             timeline_id,
         ]);
+
+        for reason in GetPageBatchBreakReason::iter() {
+            let _ = PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.remove_label_values(&[
+                tenant_id,
+                shard_id,
+                timeline_id,
+                reason.into(),
+            ]);
+        }
     }
 }
 
@@ -4276,6 +4370,7 @@ pub fn preinitialize_metrics(
     [
         &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
         &SMGR_QUERY_STARTED_GLOBAL,
+        &PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL,
     ]
     .into_iter()
     .for_each(|c| {
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 26eea5183b..7a62d8049b 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -58,8 +58,8 @@ use crate::context::{
     DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
 };
 use crate::metrics::{
-    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
-    TimelineMetrics,
+    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
+    SmgrOpTimer, TimelineMetrics,
 };
 use crate::pgdatadir_mapping::Version;
 use crate::span::{
@@ -672,6 +672,7 @@ enum BatchedFeMessage {
         span: Span,
         shard: timeline::handle::WeakHandle<TenantManagerTypes>,
         pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
+        batch_break_reason: GetPageBatchBreakReason,
     },
     DbSize {
         span: Span,
@@ -724,6 +725,119 @@ impl BatchedFeMessage {
             BatchedFeMessage::RespondError { .. } => {}
         }
     }
+
+    fn should_break_batch(
+        &self,
+        other: &BatchedFeMessage,
+        max_batch_size: NonZeroUsize,
+        batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
+    ) -> Option<GetPageBatchBreakReason> {
+        match (self, other) {
+            (
+                BatchedFeMessage::GetPage {
+                    shard: accum_shard,
+                    pages: accum_pages,
+                    ..
+                },
+                BatchedFeMessage::GetPage {
+                    shard: this_shard,
+                    pages: this_pages,
+                    ..
+                },
+            ) => {
+                assert_eq!(this_pages.len(), 1);
+                if accum_pages.len() >= max_batch_size.get() {
+                    trace!(%max_batch_size, "stopping batching because of batch size");
+                    assert_eq!(accum_pages.len(), max_batch_size.get());
+
+                    return Some(GetPageBatchBreakReason::BatchFull);
+                }
+                if !accum_shard.is_same_handle_as(this_shard) {
+                    trace!("stopping batching because timeline object mismatch");
+                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
+                    // But the current logic for keeping responses in order does not support that.
+
+                    return Some(GetPageBatchBreakReason::NonUniformTimeline);
+                }
+
+                match batching_strategy {
+                    PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
+                        if let Some(last_in_batch) = accum_pages.last() {
+                            if last_in_batch.effective_request_lsn
+                                != this_pages[0].effective_request_lsn
+                            {
+                                trace!(
+                                    accum_lsn = %last_in_batch.effective_request_lsn,
+                                    this_lsn = %this_pages[0].effective_request_lsn,
+                                    "stopping batching because LSN changed"
+                                );
+
+                                return Some(GetPageBatchBreakReason::NonUniformLsn);
+                            }
+                        }
+                    }
+                    PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => {
+                        // The read path doesn't curently support serving the same page at different LSNs.
+                        // While technically possible, it's uncertain if the complexity is worth it.
+                        // Break the batch if such a case is encountered.
+                        let same_page_different_lsn = accum_pages.iter().any(|batched| {
+                            batched.req.rel == this_pages[0].req.rel
+                                && batched.req.blkno == this_pages[0].req.blkno
+                                && batched.effective_request_lsn
+                                    != this_pages[0].effective_request_lsn
+                        });
+
+                        if same_page_different_lsn {
+                            trace!(
+                                rel=%this_pages[0].req.rel,
+                                blkno=%this_pages[0].req.blkno,
+                                lsn=%this_pages[0].effective_request_lsn,
+                                "stopping batching because same page was requested at different LSNs"
+                            );
+
+                            return Some(GetPageBatchBreakReason::SamePageAtDifferentLsn);
+                        }
+                    }
+                }
+
+                None
+            }
+            #[cfg(feature = "testing")]
+            (
+                BatchedFeMessage::Test {
+                    shard: accum_shard,
+                    requests: accum_requests,
+                    ..
+                },
+                BatchedFeMessage::Test {
+                    shard: this_shard,
+                    requests: this_requests,
+                    ..
+                },
+            ) => {
+                assert!(this_requests.len() == 1);
+                if accum_requests.len() >= max_batch_size.get() {
+                    trace!(%max_batch_size, "stopping batching because of batch size");
+                    assert_eq!(accum_requests.len(), max_batch_size.get());
+                    return Some(GetPageBatchBreakReason::BatchFull);
+                }
+                if !accum_shard.is_same_handle_as(this_shard) {
+                    trace!("stopping batching because timeline object mismatch");
+                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
+                    // But the current logic for keeping responses in order does not support that.
+                    return Some(GetPageBatchBreakReason::NonUniformTimeline);
+                }
+                let this_batch_key = this_requests[0].req.batch_key;
+                let accum_batch_key = accum_requests[0].req.batch_key;
+                if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
+                    trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
+                    return Some(GetPageBatchBreakReason::NonUniformKey);
+                }
+                None
+            }
+            (_, _) => Some(GetPageBatchBreakReason::NonBatchableRequest),
+        }
+    }
 }
 
 impl PageServerHandler {
@@ -1047,6 +1161,10 @@ impl PageServerHandler {
                         effective_request_lsn,
                         ctx,
                     }],
+                    // The executor grabs the batch when it becomes idle.
+                    // Hence, [`GetPageBatchBreakReason::ExecutorSteal`] is the
+                    // default reason for breaking the batch.
+                    batch_break_reason: GetPageBatchBreakReason::ExecutorSteal,
                 }
             }
             #[cfg(feature = "testing")]
@@ -1084,118 +1202,59 @@ impl PageServerHandler {
             Err(e) => return Err(Err(e)),
         };
 
-        match (&mut *batch, this_msg) {
-            // something batched already, let's see if we can add this message to the batch
-            (
-                Ok(BatchedFeMessage::GetPage {
-                    span: _,
-                    shard: accum_shard,
-                    pages: accum_pages,
-                }),
-                BatchedFeMessage::GetPage {
-                    span: _,
-                    shard: this_shard,
-                    pages: this_pages,
-                },
-            ) if (|| {
-                assert_eq!(this_pages.len(), 1);
-                if accum_pages.len() >= max_batch_size.get() {
-                    trace!(%max_batch_size, "stopping batching because of batch size");
-                    assert_eq!(accum_pages.len(), max_batch_size.get());
-                    return false;
-                }
-                if !accum_shard.is_same_handle_as(&this_shard) {
-                    trace!("stopping batching because timeline object mismatch");
-                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
-                    // But the current logic for keeping responses in order does not support that.
-                    return false;
-                }
-
-                match batching_strategy {
-                    PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
-                        if let Some(last_in_batch) = accum_pages.last() {
-                            if last_in_batch.effective_request_lsn
-                                != this_pages[0].effective_request_lsn
-                            {
-                                return false;
-                            }
-                        }
-                    }
-                    PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => {
-                        // The read path doesn't curently support serving the same page at different LSNs.
-                        // While technically possible, it's uncertain if the complexity is worth it.
-                        // Break the batch if such a case is encountered.
-                        //
-                        // TODO(vlad): Include a metric for batch breaks with a reason label.
-                        let same_page_different_lsn = accum_pages.iter().any(|batched| {
-                            batched.req.rel == this_pages[0].req.rel
-                                && batched.req.blkno == this_pages[0].req.blkno
-                                && batched.effective_request_lsn
-                                    != this_pages[0].effective_request_lsn
-                        });
-
-                        if same_page_different_lsn {
-                            trace!(
-                                rel=%this_pages[0].req.rel,
-                                blkno=%this_pages[0].req.blkno,
-                                lsn=%this_pages[0].effective_request_lsn,
-                                "stopping batching because same page was requested at different LSNs"
-                            );
-                            return false;
-                        }
-                    }
-                }
-
-                true
-            })() =>
-            {
-                // ok to batch
-                accum_pages.extend(this_pages);
-                Ok(())
+        let eligible_batch = match batch {
+            Ok(b) => b,
+            Err(_) => {
+                return Err(Ok(this_msg));
             }
-            #[cfg(feature = "testing")]
-            (
-                Ok(BatchedFeMessage::Test {
-                    shard: accum_shard,
-                    requests: accum_requests,
-                    ..
-                }),
-                BatchedFeMessage::Test {
-                    shard: this_shard,
-                    requests: this_requests,
-                    ..
-                },
-            ) if (|| {
-                assert!(this_requests.len() == 1);
-                if accum_requests.len() >= max_batch_size.get() {
-                    trace!(%max_batch_size, "stopping batching because of batch size");
-                    assert_eq!(accum_requests.len(), max_batch_size.get());
-                    return false;
+        };
+
+        let batch_break =
+            eligible_batch.should_break_batch(&this_msg, max_batch_size, batching_strategy);
+
+        match batch_break {
+            Some(reason) => {
+                if let BatchedFeMessage::GetPage {
+                    batch_break_reason, ..
+                } = eligible_batch
+                {
+                    *batch_break_reason = reason;
                 }
-                if !accum_shard.is_same_handle_as(&this_shard) {
-                    trace!("stopping batching because timeline object mismatch");
-                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
-                    // But the current logic for keeping responses in order does not support that.
-                    return false;
-                }
-                let this_batch_key = this_requests[0].req.batch_key;
-                let accum_batch_key = accum_requests[0].req.batch_key;
-                if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
-                    trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
-                    return false;
-                }
-                true
-            })() =>
-            {
-                // ok to batch
-                accum_requests.extend(this_requests);
-                Ok(())
-            }
-            // something batched already but this message is unbatchable
-            (_, this_msg) => {
-                // by default, don't continue batching
+
                 Err(Ok(this_msg))
             }
+            None => {
+                // ok to batch
+                match (eligible_batch, this_msg) {
+                    (
+                        BatchedFeMessage::GetPage {
+                            pages: accum_pages, ..
+                        },
+                        BatchedFeMessage::GetPage {
+                            pages: this_pages, ..
+                        },
+                    ) => {
+                        accum_pages.extend(this_pages);
+                        Ok(())
+                    }
+                    #[cfg(feature = "testing")]
+                    (
+                        BatchedFeMessage::Test {
+                            requests: accum_requests,
+                            ..
+                        },
+                        BatchedFeMessage::Test {
+                            requests: this_requests,
+                            ..
+                        },
+                    ) => {
+                        accum_requests.extend(this_requests);
+                        Ok(())
+                    }
+                    // Shape guaranteed by [`BatchedFeMessage::should_break_batch`]
+                    _ => unreachable!(),
+                }
+            }
         }
     }
 
@@ -1413,7 +1472,12 @@ impl PageServerHandler {
                     span,
                 )
             }
-            BatchedFeMessage::GetPage { span, shard, pages } => {
+            BatchedFeMessage::GetPage {
+                span,
+                shard,
+                pages,
+                batch_break_reason,
+            } => {
                 fail::fail_point!("ps::handle-pagerequest-message::getpage");
                 let (shard, ctx) = upgrade_handle_and_set_context!(shard);
                 (
@@ -1425,6 +1489,7 @@ impl PageServerHandler {
                                 &shard,
                                 pages,
                                 io_concurrency,
+                                batch_break_reason,
                                 &ctx,
                             )
                             .instrument(span.clone())
@@ -2113,13 +2178,14 @@ impl PageServerHandler {
         timeline: &Timeline,
         requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
         io_concurrency: IoConcurrency,
+        batch_break_reason: GetPageBatchBreakReason,
         ctx: &RequestContext,
     ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
         timeline
             .query_metrics
-            .observe_getpage_batch_start(requests.len());
+            .observe_getpage_batch_start(requests.len(), batch_break_reason);
 
         // If a page trace is running, submit an event for this request.
         if let Some(page_trace) = timeline.page_trace.load().as_ref() {
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index df500544dc..879808b7ba 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -194,6 +194,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
     counter("pageserver_wait_lsn_started_count"),
     counter("pageserver_wait_lsn_finished_count"),
     counter("pageserver_wait_ondemand_download_seconds_sum"),
+    counter("pageserver_page_service_batch_break_reason"),
     *histogram("pageserver_page_service_batch_size"),
     *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"),
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py
index 520a019cf5..b17ca772c9 100644
--- a/test_runner/performance/pageserver/test_page_service_batching.py
+++ b/test_runner/performance/pageserver/test_page_service_batching.py
@@ -1,7 +1,6 @@
 import concurrent.futures
 import dataclasses
 import json
-import re
 import threading
 import time
 from dataclasses import dataclass
@@ -170,6 +169,7 @@ def test_throughput(
         time: float
         pageserver_batch_size_histo_sum: float
         pageserver_batch_size_histo_count: float
+        pageserver_batch_breaks_reason_count: dict[str, int]
         compute_getpage_count: float
         pageserver_cpu_seconds_total: float
 
@@ -183,6 +183,10 @@ def test_throughput(
                 compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
                 pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
                 - other.pageserver_cpu_seconds_total,
+                pageserver_batch_breaks_reason_count={
+                    reason: count - other.pageserver_batch_breaks_reason_count.get(reason, 0)
+                    for reason, count in self.pageserver_batch_breaks_reason_count.items()
+                },
             )
 
         def normalize(self, by) -> "Metrics":
@@ -192,6 +196,10 @@ def test_throughput(
                 pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by,
                 compute_getpage_count=self.compute_getpage_count / by,
                 pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
+                pageserver_batch_breaks_reason_count={
+                    reason: count / by
+                    for reason, count in self.pageserver_batch_breaks_reason_count.items()
+                },
             )
 
     def get_metrics() -> Metrics:
@@ -201,6 +209,20 @@ def test_throughput(
             )
             compute_getpage_count = cur.fetchall()[0][0]
             pageserver_metrics = ps_http.get_metrics()
+            for name, samples in pageserver_metrics.metrics.items():
+                for sample in samples:
+                    log.info(f"{name=} labels={sample.labels} {sample.value}")
+
+            raw_batch_break_reason_count = pageserver_metrics.query_all(
+                "pageserver_page_service_batch_break_reason_total",
+                filter={"timeline_id": str(env.initial_timeline)},
+            )
+
+            batch_break_reason_count = {
+                sample.labels["reason"]: int(sample.value)
+                for sample in raw_batch_break_reason_count
+            }
+
             return Metrics(
                 time=time.time(),
                 pageserver_batch_size_histo_sum=pageserver_metrics.query_one(
@@ -209,6 +231,7 @@ def test_throughput(
                 pageserver_batch_size_histo_count=pageserver_metrics.query_one(
                     "pageserver_page_service_batch_size_count"
                 ).value,
+                pageserver_batch_breaks_reason_count=batch_break_reason_count,
                 compute_getpage_count=compute_getpage_count,
                 pageserver_cpu_seconds_total=pageserver_metrics.query_one(
                     "libmetrics_process_cpu_seconds_highres"
@@ -263,25 +286,6 @@ def test_throughput(
 
     log.info("Results: %s", metrics)
 
-    since_last_start: list[str] = []
-    for line in env.pageserver.logfile.read_text().splitlines():
-        if "git:" in line:
-            since_last_start = []
-        since_last_start.append(line)
-
-    stopping_batching_because_re = re.compile(
-        r"stopping batching because (LSN changed|of batch size|timeline object mismatch|batch key changed|same page was requested at different LSNs|.*)"
-    )
-    reasons_for_stopping_batching = {}
-    for line in since_last_start:
-        match = stopping_batching_because_re.search(line)
-        if match:
-            if match.group(1) not in reasons_for_stopping_batching:
-                reasons_for_stopping_batching[match.group(1)] = 0
-            reasons_for_stopping_batching[match.group(1)] += 1
-
-    log.info("Reasons for stopping batching: %s", reasons_for_stopping_batching)
-
     #
     # Sanity-checks on the collected data
     #
@@ -295,7 +299,16 @@ def test_throughput(
     #
 
     for metric, value in dataclasses.asdict(metrics).items():
-        zenbenchmark.record(f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM)
+        if metric == "pageserver_batch_breaks_reason_count":
+            assert isinstance(value, dict)
+            for reason, count in value.items():
+                zenbenchmark.record(
+                    f"counters.{metric}_{reason}", count, unit="", report=MetricReport.TEST_PARAM
+                )
+        else:
+            zenbenchmark.record(
+                f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM
+            )
 
     zenbenchmark.record(
         "perfmetric.batching_factor",

From 437071888e23b538e60aaef433c20b7d29474fcc Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 14 Apr 2025 08:57:33 -0500
Subject: [PATCH 127/140] Fix logging in nightly physical replication
 benchmarks (#11541)

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 test_runner/performance/test_physical_replication.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py
index df5419f292..16cdab155a 100644
--- a/test_runner/performance/test_physical_replication.py
+++ b/test_runner/performance/test_physical_replication.py
@@ -64,8 +64,8 @@ def test_ro_replica_lag(
 
     project = neon_api.create_project(pg_version)
     project_id = project["project"]["id"]
-    log.info("Project ID: {}", project_id)
-    log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"])
+    log.info("Project ID: %s", project_id)
+    log.info("Primary endpoint ID: %s", project["project"]["endpoints"][0]["id"])
     neon_api.wait_for_operation_to_finish(project_id)
     error_occurred = False
     try:
@@ -81,7 +81,7 @@ def test_ro_replica_lag(
             endpoint_type="read_only",
             settings={"pg_settings": {"hot_standby_feedback": "on"}},
         )
-        log.info("Replica endpoint ID: {}", replica["endpoint"]["id"])
+        log.info("Replica endpoint ID: %s", replica["endpoint"]["id"])
         replica_env = master_env.copy()
         replica_env["PGHOST"] = replica["endpoint"]["host"]
         neon_api.wait_for_operation_to_finish(project_id)
@@ -197,8 +197,8 @@ def test_replication_start_stop(
 
     project = neon_api.create_project(pg_version)
     project_id = project["project"]["id"]
-    log.info("Project ID: {}", project_id)
-    log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"])
+    log.info("Project ID: %s", project_id)
+    log.info("Primary endpoint ID: %s", project["project"]["endpoints"][0]["id"])
     neon_api.wait_for_operation_to_finish(project_id)
     try:
         branch_id = project["branch"]["id"]
@@ -215,7 +215,7 @@ def test_replication_start_stop(
                 endpoint_type="read_only",
                 settings={"pg_settings": {"hot_standby_feedback": "on"}},
             )
-            log.info("Replica {} endpoint ID: {}", i + 1, replica["endpoint"]["id"])
+            log.info("Replica %d endpoint ID: %s", i + 1, replica["endpoint"]["id"])
             replicas.append(replica)
             neon_api.wait_for_operation_to_finish(project_id)
 

From e85607eed835224a7d2d41c92166069347a395e0 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 14 Apr 2025 15:42:35 +0100
Subject: [PATCH 128/140] tests: remove config tweak allowing old versions to
 start with a batching config (#11560)

## Problem

Pageservers now ignore unknown config fields, so this config tweaking is
no longer needed.

## Summary of changes

Get rid of the hack.

Closes https://github.com/neondatabase/neon/issues/11524
---
 test_runner/fixtures/neon_fixtures.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index c846c0950e..3761f29d2f 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1322,10 +1322,6 @@ class NeonEnv:
                 log.info("test may use old binaries, ignoring warnings about unknown config items")
                 ps.allowed_errors.append(".*ignoring unknown configuration item.*")
 
-                # Allow old software to start until https://github.com/neondatabase/neon/pull/11275
-                # lands in the compatiblity snapshot.
-                ps_cfg["page_service_pipelining"].pop("batching")
-
             self.pageservers.append(ps)
             cfg["pageservers"].append(ps_cfg)
 

From 057ce115de119a4973dd68b4844884896478ecb0 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Mon, 14 Apr 2025 10:51:17 -0400
Subject: [PATCH 129/140] fix(test): allow stale generation errors (1/2)
 (#11531)

## Problem

Part of https://github.com/neondatabase/neon/issues/11486

## Summary of changes

50% of the test instability of `test_create_churn_during_restart` are
due to error message gets changed. Allow the new error message.

Still need to fix other errors due to failure to acquire semaphore in
this or the next patch.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/regress/test_tenants.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index c613a79374..c00f8f4ca5 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -390,6 +390,7 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
     # Tenant creation requests which arrive out of order will generate complaints about
     # generation nubmers out of order.
     env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+")
+    env.pageserver.allowed_errors.append(".*due to stale generation.+")
 
     # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of
     # an incomplete attach, or some other problem.  In the field this should be rare,

From 90b706cd96fe5cc40b43035c5d11f8c596d5e783 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 14 Apr 2025 16:13:20 +0100
Subject: [PATCH 130/140] tests: save pageserver metrics at the end of the test
 (#11559)

## Problem

Sometimes it's useful to see the pageserver metrics after a test in
order to debug stuff.
For example, for https://github.com/neondatabase/neon/issues/11465 I'd
like to know
what the remote storage latencies are from the client.

## Summary of changes

When stopping the env, record the pageserver metrics into a file in the
pageserver's workdir.
---
 test_runner/fixtures/neon_fixtures.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 3761f29d2f..10bbb7020b 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -947,6 +947,8 @@ class NeonEnvBuilder:
                     continue
                 if SMALL_DB_FILE_NAME_REGEX.fullmatch(test_file.name):
                     continue
+                if FINAL_METRICS_FILE_NAME == test_file.name:
+                    continue
                 log.debug(f"Removing large database {test_file} file")
                 test_file.unlink()
             elif test_entry.is_dir():
@@ -1457,6 +1459,12 @@ class NeonEnv:
                 except Exception as e:
                     metric_errors.append(e)
                     log.error(f"metric validation failed on {pageserver.id}: {e}")
+
+            try:
+                pageserver.snapshot_final_metrics()
+            except Exception as e:
+                log.error(f"metric snapshot failed on {pageserver.id}: {e}")
+
             try:
                 pageserver.stop(immediate=immediate)
             except RuntimeError:
@@ -2972,6 +2980,20 @@ class NeonPageserver(PgProtocol, LogUtils):
             value = self.http_client().get_metric_value(metric)
             assert value == 0, f"Nonzero {metric} == {value}"
 
+    def snapshot_final_metrics(self):
+        """
+        Take a snapshot of this pageserver's metrics and stash in its work directory.
+        """
+        if not self.running:
+            log.info(f"Skipping metrics snapshot on pageserver {self.id}, it is not running")
+            return
+
+        metrics = self.http_client().get_metrics_str()
+        metrics_snapshot_path = self.workdir / FINAL_METRICS_FILE_NAME
+
+        with open(metrics_snapshot_path, "w") as f:
+            f.write(metrics)
+
     def tenant_attach(
         self,
         tenant_id: TenantId,
@@ -5134,6 +5156,8 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern[str] = re.compile(
     r"config-v1|heatmap-v1|tenant-manifest|metadata|.+\.(?:toml|pid|json|sql|conf)"
 )
 
+FINAL_METRICS_FILE_NAME: str = "final_metrics.txt"
+
 
 SKIP_DIRS = frozenset(
     (

From 8cce27bedb11a780dd59095437541c26ff6e62dd Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 14 Apr 2025 16:31:32 +0100
Subject: [PATCH 131/140] pageserver: add a randomized read path test (#11519)

## Problem

Every time we make changes to the read path to fix a bug or add a
feature,
we end up adding another incomprehensible test.

## Summary of changes

Add some generic infrastructure for generating a layer map from a type
spec
and use that for a read path test. The test is randomized but uses a
fixed seed
by default. A fuzzing mode is available for confidence building.

See [Notion
page](https://www.notion.so/neondatabase/Read-Path-Unit-Testing-Fuzzing-1d1f189e0047806c8e5cd37781b0a350?pvs=4)
for a diagram of the layer map
used.

Just for fun I tried removing [this
commit](https://github.com/neondatabase/neon/pull/11494/commits/9990199cb4f26a4ab2e64372bd1301f734ddcef5)
from https://github.com/neondatabase/neon/pull/11494
and it caught the bug in the normal mode (no fuzzing required).
---
 pageserver/Cargo.toml             |   2 +
 pageserver/src/tenant.rs          | 528 ++++++++++++++++++++++++++++++
 pageserver/src/tenant/timeline.rs |   2 +-
 3 files changed, 531 insertions(+), 1 deletion(-)

diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 56d97bf8a9..74f3fce6e5 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -10,6 +10,8 @@ default = []
 # which adds some runtime cost to run tests on outage conditions
 testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
 
+fuzz-read-path = ["testing"]
+
 [dependencies]
 anyhow.workspace = true
 arc-swap.workspace = true
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f14c7608fd..0ba70f45b2 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5933,12 +5933,20 @@ mod tests {
     use models::CompactLsnRange;
     use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
     use pageserver_api::keyspace::KeySpace;
+    #[cfg(feature = "testing")]
+    use pageserver_api::keyspace::KeySpaceRandomAccum;
     use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
     #[cfg(feature = "testing")]
     use pageserver_api::record::NeonWalRecord;
     use pageserver_api::value::Value;
     use pageserver_compaction::helpers::overlaps_with;
+    #[cfg(feature = "testing")]
+    use rand::SeedableRng;
+    #[cfg(feature = "testing")]
+    use rand::rngs::StdRng;
     use rand::{Rng, thread_rng};
+    #[cfg(feature = "testing")]
+    use std::ops::Range;
     use storage_layer::{IoConcurrency, PersistentLayerKey};
     use tests::storage_layer::ValuesReconstructState;
     use tests::timeline::{GetVectoredError, ShutdownMode};
@@ -5960,6 +5968,318 @@ mod tests {
     static TEST_KEY: Lazy<Key> =
         Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
 
+    #[cfg(feature = "testing")]
+    struct TestTimelineSpecification {
+        start_lsn: Lsn,
+        last_record_lsn: Lsn,
+
+        in_memory_layers_shape: Vec<(Range<Key>, Range<Lsn>)>,
+        delta_layers_shape: Vec<(Range<Key>, Range<Lsn>)>,
+        image_layers_shape: Vec<(Range<Key>, Lsn)>,
+
+        gap_chance: u8,
+        will_init_chance: u8,
+    }
+
+    #[cfg(feature = "testing")]
+    struct Storage {
+        storage: HashMap<(Key, Lsn), Value>,
+        start_lsn: Lsn,
+    }
+
+    #[cfg(feature = "testing")]
+    impl Storage {
+        fn get(&self, key: Key, lsn: Lsn) -> Bytes {
+            use bytes::BufMut;
+
+            let mut crnt_lsn = lsn;
+            let mut got_base = false;
+
+            let mut acc = Vec::new();
+
+            while crnt_lsn >= self.start_lsn {
+                if let Some(value) = self.storage.get(&(key, crnt_lsn)) {
+                    acc.push(value.clone());
+
+                    match value {
+                        Value::WalRecord(NeonWalRecord::Test { will_init, .. }) => {
+                            if *will_init {
+                                got_base = true;
+                                break;
+                            }
+                        }
+                        Value::Image(_) => {
+                            got_base = true;
+                            break;
+                        }
+                        _ => unreachable!(),
+                    }
+                }
+
+                crnt_lsn = crnt_lsn.checked_sub(1u64).unwrap();
+            }
+
+            assert!(
+                got_base,
+                "Input data was incorrect. No base image for {key}@{lsn}"
+            );
+
+            tracing::debug!("Wal redo depth for {key}@{lsn} is {}", acc.len());
+
+            let mut blob = BytesMut::new();
+            for value in acc.into_iter().rev() {
+                match value {
+                    Value::WalRecord(NeonWalRecord::Test { append, .. }) => {
+                        blob.extend_from_slice(append.as_bytes());
+                    }
+                    Value::Image(img) => {
+                        blob.put(img);
+                    }
+                    _ => unreachable!(),
+                }
+            }
+
+            blob.into()
+        }
+    }
+
+    #[cfg(feature = "testing")]
+    #[allow(clippy::too_many_arguments)]
+    async fn randomize_timeline(
+        tenant: &Arc<Tenant>,
+        new_timeline_id: TimelineId,
+        pg_version: u32,
+        spec: TestTimelineSpecification,
+        random: &mut rand::rngs::StdRng,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<(Arc<Timeline>, Storage, Vec<Lsn>)> {
+        let mut storage: HashMap<(Key, Lsn), Value> = HashMap::default();
+        let mut interesting_lsns = vec![spec.last_record_lsn];
+
+        for (key_range, lsn_range) in spec.in_memory_layers_shape.iter() {
+            let mut lsn = lsn_range.start;
+            while lsn < lsn_range.end {
+                let mut key = key_range.start;
+                while key < key_range.end {
+                    let gap = random.gen_range(1..=100) <= spec.gap_chance;
+                    let will_init = random.gen_range(1..=100) <= spec.will_init_chance;
+
+                    if gap {
+                        continue;
+                    }
+
+                    let record = if will_init {
+                        Value::WalRecord(NeonWalRecord::wal_init(format!("[wil_init {key}@{lsn}]")))
+                    } else {
+                        Value::WalRecord(NeonWalRecord::wal_append(format!("[delta {key}@{lsn}]")))
+                    };
+
+                    storage.insert((key, lsn), record);
+
+                    key = key.next();
+                }
+                lsn = Lsn(lsn.0 + 1);
+            }
+
+            // Stash some interesting LSN for future use
+            for offset in [0, 5, 100].iter() {
+                if *offset == 0 {
+                    interesting_lsns.push(lsn_range.start);
+                } else {
+                    let below = lsn_range.start.checked_sub(*offset);
+                    match below {
+                        Some(v) if v >= spec.start_lsn => {
+                            interesting_lsns.push(v);
+                        }
+                        _ => {}
+                    }
+
+                    let above = Lsn(lsn_range.start.0 + offset);
+                    interesting_lsns.push(above);
+                }
+            }
+        }
+
+        for (key_range, lsn_range) in spec.delta_layers_shape.iter() {
+            let mut lsn = lsn_range.start;
+            while lsn < lsn_range.end {
+                let mut key = key_range.start;
+                while key < key_range.end {
+                    let gap = random.gen_range(1..=100) <= spec.gap_chance;
+                    let will_init = random.gen_range(1..=100) <= spec.will_init_chance;
+
+                    if gap {
+                        continue;
+                    }
+
+                    let record = if will_init {
+                        Value::WalRecord(NeonWalRecord::wal_init(format!("[wil_init {key}@{lsn}]")))
+                    } else {
+                        Value::WalRecord(NeonWalRecord::wal_append(format!("[delta {key}@{lsn}]")))
+                    };
+
+                    storage.insert((key, lsn), record);
+
+                    key = key.next();
+                }
+                lsn = Lsn(lsn.0 + 1);
+            }
+
+            // Stash some interesting LSN for future use
+            for offset in [0, 5, 100].iter() {
+                if *offset == 0 {
+                    interesting_lsns.push(lsn_range.start);
+                } else {
+                    let below = lsn_range.start.checked_sub(*offset);
+                    match below {
+                        Some(v) if v >= spec.start_lsn => {
+                            interesting_lsns.push(v);
+                        }
+                        _ => {}
+                    }
+
+                    let above = Lsn(lsn_range.start.0 + offset);
+                    interesting_lsns.push(above);
+                }
+            }
+        }
+
+        for (key_range, lsn) in spec.image_layers_shape.iter() {
+            let mut key = key_range.start;
+            while key < key_range.end {
+                let blob = Bytes::from(format!("[image {key}@{lsn}]"));
+                let record = Value::Image(blob.clone());
+                storage.insert((key, *lsn), record);
+
+                key = key.next();
+            }
+
+            // Stash some interesting LSN for future use
+            for offset in [0, 5, 100].iter() {
+                if *offset == 0 {
+                    interesting_lsns.push(*lsn);
+                } else {
+                    let below = lsn.checked_sub(*offset);
+                    match below {
+                        Some(v) if v >= spec.start_lsn => {
+                            interesting_lsns.push(v);
+                        }
+                        _ => {}
+                    }
+
+                    let above = Lsn(lsn.0 + offset);
+                    interesting_lsns.push(above);
+                }
+            }
+        }
+
+        let in_memory_test_layers = {
+            let mut acc = Vec::new();
+
+            for (key_range, lsn_range) in spec.in_memory_layers_shape.iter() {
+                let mut data = Vec::new();
+
+                let mut lsn = lsn_range.start;
+                while lsn < lsn_range.end {
+                    let mut key = key_range.start;
+                    while key < key_range.end {
+                        if let Some(record) = storage.get(&(key, lsn)) {
+                            data.push((key, lsn, record.clone()));
+                        }
+
+                        key = key.next();
+                    }
+                    lsn = Lsn(lsn.0 + 1);
+                }
+
+                acc.push(InMemoryLayerTestDesc {
+                    data,
+                    lsn_range: lsn_range.clone(),
+                    is_open: false,
+                })
+            }
+
+            acc
+        };
+
+        let delta_test_layers = {
+            let mut acc = Vec::new();
+
+            for (key_range, lsn_range) in spec.delta_layers_shape.iter() {
+                let mut data = Vec::new();
+
+                let mut lsn = lsn_range.start;
+                while lsn < lsn_range.end {
+                    let mut key = key_range.start;
+                    while key < key_range.end {
+                        if let Some(record) = storage.get(&(key, lsn)) {
+                            data.push((key, lsn, record.clone()));
+                        }
+
+                        key = key.next();
+                    }
+                    lsn = Lsn(lsn.0 + 1);
+                }
+
+                acc.push(DeltaLayerTestDesc {
+                    data,
+                    lsn_range: lsn_range.clone(),
+                    key_range: key_range.clone(),
+                })
+            }
+
+            acc
+        };
+
+        let image_test_layers = {
+            let mut acc = Vec::new();
+
+            for (key_range, lsn) in spec.image_layers_shape.iter() {
+                let mut data = Vec::new();
+
+                let mut key = key_range.start;
+                while key < key_range.end {
+                    if let Some(record) = storage.get(&(key, *lsn)) {
+                        let blob = match record {
+                            Value::Image(blob) => blob.clone(),
+                            _ => unreachable!(),
+                        };
+
+                        data.push((key, blob));
+                    }
+
+                    key = key.next();
+                }
+
+                acc.push((*lsn, data));
+            }
+
+            acc
+        };
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                new_timeline_id,
+                spec.start_lsn,
+                pg_version,
+                ctx,
+                in_memory_test_layers,
+                delta_test_layers,
+                image_test_layers,
+                spec.last_record_lsn,
+            )
+            .await?;
+
+        Ok((
+            tline,
+            Storage {
+                storage,
+                start_lsn: spec.start_lsn,
+            },
+            interesting_lsns,
+        ))
+    }
+
     #[tokio::test]
     async fn test_basic() -> anyhow::Result<()> {
         let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await;
@@ -10543,6 +10863,214 @@ mod tests {
         Ok(())
     }
 
+    // A randomized read path test. Generates a layer map according to a deterministic
+    // specification. Fills the (key, LSN) space in random manner and then performs
+    // random scattered queries validating the results against in-memory storage.
+    //
+    // See this internal Notion page for a diagram of the layer map:
+    // https://www.notion.so/neondatabase/Read-Path-Unit-Testing-Fuzzing-1d1f189e0047806c8e5cd37781b0a350?pvs=4
+    //
+    // A fuzzing mode is also supported. In this mode, the test will use a random
+    // seed instead of a hardcoded one. Use it in conjunction with `cargo stress`
+    // to run multiple instances in parallel:
+    //
+    // $ RUST_BACKTRACE=1 RUST_LOG=INFO \
+    //   cargo stress --package=pageserver --features=testing,fuzz-read-path --release -- test_read_path
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_read_path() -> anyhow::Result<()> {
+        use rand::seq::SliceRandom;
+
+        let seed = if cfg!(feature = "fuzz-read-path") {
+            let seed: u64 = thread_rng().r#gen();
+            seed
+        } else {
+            // Use a hard-coded seed when not in fuzzing mode.
+            // Note that with the current approach results are not reproducible
+            // accross platforms and Rust releases.
+            const SEED: u64 = 0;
+            SEED
+        };
+
+        let mut random = StdRng::seed_from_u64(seed);
+
+        let (queries, will_init_chance, gap_chance) = if cfg!(feature = "fuzz-read-path") {
+            const QUERIES: u64 = 5000;
+            let will_init_chance: u8 = random.gen_range(0..=10);
+            let gap_chance: u8 = random.gen_range(0..=50);
+
+            (QUERIES, will_init_chance, gap_chance)
+        } else {
+            const QUERIES: u64 = 1000;
+            const WILL_INIT_CHANCE: u8 = 1;
+            const GAP_CHANCE: u8 = 5;
+
+            (QUERIES, WILL_INIT_CHANCE, GAP_CHANCE)
+        };
+
+        let harness = TenantHarness::create("test_read_path").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        tracing::info!("Using random seed: {seed}");
+        tracing::info!(%will_init_chance, %gap_chance, "Fill params");
+
+        // Define the layer map shape. Note that this part is not randomized.
+
+        const KEY_DIMENSION_SIZE: u32 = 99;
+        let start_key = Key::from_hex("110000000033333333444444445500000000").unwrap();
+        let end_key = start_key.add(KEY_DIMENSION_SIZE);
+        let total_key_range = start_key..end_key;
+        let total_key_range_size = end_key.to_i128() - start_key.to_i128();
+        let total_start_lsn = Lsn(104);
+        let last_record_lsn = Lsn(504);
+
+        assert!(total_key_range_size % 3 == 0);
+
+        let in_memory_layers_shape = vec![
+            (total_key_range.clone(), Lsn(304)..Lsn(400)),
+            (total_key_range.clone(), Lsn(400)..last_record_lsn),
+        ];
+
+        let delta_layers_shape = vec![
+            (
+                start_key..(start_key.add((total_key_range_size / 3) as u32)),
+                Lsn(200)..Lsn(304),
+            ),
+            (
+                (start_key.add((total_key_range_size / 3) as u32))
+                    ..(start_key.add((total_key_range_size * 2 / 3) as u32)),
+                Lsn(200)..Lsn(304),
+            ),
+            (
+                (start_key.add((total_key_range_size * 2 / 3) as u32))
+                    ..(start_key.add(total_key_range_size as u32)),
+                Lsn(200)..Lsn(304),
+            ),
+        ];
+
+        let image_layers_shape = vec![
+            (
+                start_key.add((total_key_range_size * 2 / 3 - 10) as u32)
+                    ..start_key.add((total_key_range_size * 2 / 3 + 10) as u32),
+                Lsn(456),
+            ),
+            (
+                start_key.add((total_key_range_size / 3 - 10) as u32)
+                    ..start_key.add((total_key_range_size / 3 + 10) as u32),
+                Lsn(256),
+            ),
+            (total_key_range.clone(), total_start_lsn),
+        ];
+
+        let specification = TestTimelineSpecification {
+            start_lsn: total_start_lsn,
+            last_record_lsn,
+            in_memory_layers_shape,
+            delta_layers_shape,
+            image_layers_shape,
+            gap_chance,
+            will_init_chance,
+        };
+
+        // Create and randomly fill in the layers according to the specification
+        let (tline, storage, interesting_lsns) = randomize_timeline(
+            &tenant,
+            TIMELINE_ID,
+            DEFAULT_PG_VERSION,
+            specification,
+            &mut random,
+            &ctx,
+        )
+        .await?;
+
+        // Now generate queries based on the interesting lsns that we've collected.
+        //
+        // While there's still room in the query, pick and interesting LSN and a random
+        // key. Then roll the dice to see if the next key should also be included in
+        // the query. When the roll fails, break the "batch" and pick another point in the
+        // (key, LSN) space.
+
+        const PICK_NEXT_CHANCE: u8 = 50;
+        for _ in 0..queries {
+            let query = {
+                let mut keyspaces_at_lsn: HashMap<Lsn, KeySpaceRandomAccum> = HashMap::default();
+                let mut used_keys: HashSet<Key> = HashSet::default();
+
+                while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
+                    let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty");
+                    let mut selected_key = start_key.add(random.gen_range(0..KEY_DIMENSION_SIZE));
+
+                    while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
+                        if used_keys.contains(&selected_key)
+                            || selected_key >= start_key.add(KEY_DIMENSION_SIZE)
+                        {
+                            break;
+                        }
+
+                        keyspaces_at_lsn
+                            .entry(*selected_lsn)
+                            .or_default()
+                            .add_key(selected_key);
+                        used_keys.insert(selected_key);
+
+                        let pick_next = random.gen_range(0..=100) <= PICK_NEXT_CHANCE;
+                        if pick_next {
+                            selected_key = selected_key.next();
+                        } else {
+                            break;
+                        }
+                    }
+                }
+
+                VersionedKeySpaceQuery::scattered(
+                    keyspaces_at_lsn
+                        .into_iter()
+                        .map(|(lsn, acc)| (lsn, acc.to_keyspace()))
+                        .collect(),
+                )
+            };
+
+            // Run the query and validate the results
+
+            let results = tline
+                .get_vectored(query.clone(), IoConcurrency::Sequential, &ctx)
+                .await;
+
+            let blobs = match results {
+                Ok(ok) => ok,
+                Err(err) => {
+                    panic!("seed={seed} Error returned for query {query}: {err}");
+                }
+            };
+
+            for (key, key_res) in blobs.into_iter() {
+                match key_res {
+                    Ok(blob) => {
+                        let requested_at_lsn = query.map_key_to_lsn(&key);
+                        let expected = storage.get(key, requested_at_lsn);
+
+                        if blob != expected {
+                            tracing::error!(
+                                "seed={seed} Mismatch for {key}@{requested_at_lsn} from query: {query}"
+                            );
+                        }
+
+                        assert_eq!(blob, expected);
+                    }
+                    Err(err) => {
+                        let requested_at_lsn = query.map_key_to_lsn(&key);
+
+                        panic!(
+                            "seed={seed} Error returned for {key}@{requested_at_lsn} from query {query}: {err}"
+                        );
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+
     fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
         (
             k1.is_delta,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 204bdb5eee..c27a4b62da 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4026,7 +4026,7 @@ impl VersionedKeySpaceQuery {
     /// Returns LSN for a specific key.
     ///
     /// Invariant: requested key must be part of [`Self::total_keyspace`]
-    fn map_key_to_lsn(&self, key: &Key) -> Lsn {
+    pub(super) fn map_key_to_lsn(&self, key: &Key) -> Lsn {
         match self {
             Self::Uniform { lsn, .. } => *lsn,
             Self::Scattered { keyspaces_at_lsn } => {

From 028a191040352db678d0bbe80c79d58b5e84cdd3 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 14 Apr 2025 16:18:21 -0500
Subject: [PATCH 132/140] Continue with s/spec/config changes (#11574)

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute_tools/src/bin/compute_ctl.rs |  7 +++----
 compute_tools/src/compute.rs         | 14 +++++---------
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index ea8350e2f5..16fd51d79a 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -139,7 +139,7 @@ fn main() -> Result<()> {
 
     let scenario = failpoint_support::init();
 
-    // For historical reasons, the main thread that processes the spec and launches postgres
+    // For historical reasons, the main thread that processes the config and launches postgres
     // is synchronous, but we always have this tokio runtime available and we "enter" it so
     // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...)
     // from all parts of compute_ctl.
@@ -155,7 +155,7 @@ fn main() -> Result<()> {
 
     let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
 
-    let cli_spec = get_config(&cli)?;
+    let config = get_config(&cli)?;
 
     let compute_node = ComputeNode::new(
         ComputeNodeParams {
@@ -176,8 +176,7 @@ fn main() -> Result<()> {
             #[cfg(target_os = "linux")]
             vm_monitor_addr: cli.vm_monitor_addr,
         },
-        cli_spec.spec,
-        cli_spec.compute_ctl_config,
+        config,
     )?;
 
     let exit_code = compute_node.run()?;
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 06d5bbb9c5..c7b4bdd240 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -11,7 +11,7 @@ use std::{env, fs};
 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
-use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus};
+use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus};
 use compute_api::spec::{
     ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
 };
@@ -303,11 +303,7 @@ struct StartVmMonitorResult {
 }
 
 impl ComputeNode {
-    pub fn new(
-        params: ComputeNodeParams,
-        cli_spec: Option<ComputeSpec>,
-        compute_ctl_config: ComputeCtlConfig,
-    ) -> Result<Self> {
+    pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result<Self> {
         let connstr = params.connstr.as_str();
         let conn_conf = postgres::config::Config::from_str(connstr)
             .context("cannot build postgres config from connstr")?;
@@ -315,8 +311,8 @@ impl ComputeNode {
             .context("cannot build tokio postgres config from connstr")?;
 
         let mut new_state = ComputeState::new();
-        if let Some(cli_spec) = cli_spec {
-            let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?;
+        if let Some(spec) = config.spec {
+            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
             new_state.pspec = Some(pspec);
         }
 
@@ -327,7 +323,7 @@ impl ComputeNode {
             state: Mutex::new(new_state),
             state_changed: Condvar::new(),
             ext_download_progress: RwLock::new(HashMap::new()),
-            compute_ctl_config,
+            compute_ctl_config: config.compute_ctl_config,
         })
     }
 

From cbd2fc2395a6be13c43a62248d607ee6cada6ee5 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 14 Apr 2025 20:21:18 -0500
Subject: [PATCH 133/140] Clean up logs and error messages in compute_ctl
 authorize middleware (#11576)

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute_tools/src/http/middleware/authorize.rs | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs
index ee3a5cb953..f221752c38 100644
--- a/compute_tools/src/http/middleware/authorize.rs
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -11,7 +11,7 @@ use futures::future::BoxFuture;
 use http::{Request, Response, StatusCode};
 use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
 use tower_http::auth::AsyncAuthorizeRequest;
-use tracing::warn;
+use tracing::{debug, warn};
 
 use crate::http::{JsonResponse, extract::RequestId};
 
@@ -92,7 +92,7 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
             if data.claims.compute_id != compute_id {
                 return Err(JsonResponse::error(
                     StatusCode::UNAUTHORIZED,
-                    "invalid claims in authorization token",
+                    "invalid compute ID in authorization token claims",
                 ));
             }
 
@@ -112,12 +112,14 @@ impl Authorize {
         token: &str,
         validation: &Validation,
     ) -> Result<TokenData<ComputeClaims>> {
+        debug!("verifying token {}", token);
+
         for jwk in jwks.keys.iter() {
             let decoding_key = match DecodingKey::from_jwk(jwk) {
                 Ok(key) => key,
                 Err(e) => {
                     warn!(
-                        "Failed to construct decoding key from {}: {}",
+                        "failed to construct decoding key from {}: {}",
                         jwk.common.key_id.as_ref().unwrap(),
                         e
                     );
@@ -130,7 +132,7 @@ impl Authorize {
                 Ok(data) => return Ok(data),
                 Err(e) => {
                     warn!(
-                        "Failed to decode authorization token using {}: {}",
+                        "failed to decode authorization token using {}: {}",
                         jwk.common.key_id.as_ref().unwrap(),
                         e
                     );
@@ -140,6 +142,6 @@ impl Authorize {
             }
         }
 
-        Err(anyhow!("Failed to verify authorization token"))
+        Err(anyhow!("failed to verify authorization token"))
     }
 }

From 8c77ccfc01bf5e634bf2adab9543a2f2e1b9420e Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 15 Apr 2025 09:25:09 +0200
Subject: [PATCH 134/140] pageserver: log total progress during shard ancestor
 compaction (#11565)

## Problem

Shard ancestor compaction doesn't currently log any global progress
information, only for the current batch.

## Summary of changes

Log the number of layers checked for eligibility this iteration, and the
total number of layers to check. This will indicate how far along the
total shard ancestor compaction has gotten for this iteration.
---
 pageserver/src/tenant/layer_map.rs                        | 2 +-
 .../src/tenant/layer_map/historic_layer_coverage.rs       | 2 +-
 pageserver/src/tenant/timeline/compaction.rs              | 8 ++++++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 96cee922ff..23052ccee7 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -714,7 +714,7 @@ impl LayerMap {
         true
     }
 
-    pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
+    pub fn iter_historic_layers(&self) -> impl ExactSizeIterator<Item = Arc<PersistentLayerDesc>> {
         self.historic.iter()
     }
 
diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
index b3dc8e56a3..5ccc75fff6 100644
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -504,7 +504,7 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
     }
 
     /// Iterate all the layers
-    pub fn iter(&self) -> impl '_ + Iterator<Item = Value> {
+    pub fn iter(&self) -> impl ExactSizeIterator<Item = Value> {
         // NOTE we can actually perform this without rebuilding,
         //      but it's not necessary for now.
         if !self.buffer.is_empty() {
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 91cc8ca10c..2a450795fd 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1273,7 +1273,10 @@ impl Timeline {
         let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;
 
         let layers = self.layers.read().await;
-        for layer_desc in layers.layer_map()?.iter_historic_layers() {
+        let layers_iter = layers.layer_map()?.iter_historic_layers();
+        let (layers_total, mut layers_checked) = (layers_iter.len(), 0);
+        for layer_desc in layers_iter {
+            layers_checked += 1;
             let layer = layers.get_from_desc(&layer_desc);
             if layer.metadata().shard.shard_count == self.shard_identity.count {
                 // This layer does not belong to a historic ancestor, no need to re-image it.
@@ -1371,7 +1374,8 @@ impl Timeline {
         }
 
         info!(
-            "starting shard ancestor compaction, rewriting {} layers and dropping {} layers \
+            "starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \
+                checked {layers_checked}/{layers_total} layers \
                 (latest_gc_cutoff={} pitr_cutoff={})",
             layers_to_rewrite.len(),
             drop_layers.len(),

From 9a6ace9bde2b46dd1b46ea2fe28f01c0b62a6745 Mon Sep 17 00:00:00 2001
From: Fedor Dikarev <fedor@neon.tech>
Date: Tue, 15 Apr 2025 10:21:44 +0200
Subject: [PATCH 135/140] introduce new runners: unit-perf and use them for
 benchmark jobs (#11409)

## Problem
Benchmarks results are inconsistent on existing small-metal runners

## Summary of changes
Introduce new `unit-perf` runners, and lets run benchmark on them.

The new hardware has slower, but consistent, CPU frequency - if run with
default governor schedutil.
Thus we needed to adjust some testcases' timeouts and add some retry
steps where hard-coded timeouts couldn't be increased without changing
the system under test.
-
[wait_for_last_record_lsn](https://github.com/neondatabase/neon/blob/6592d69a6700a2bd2e9f60c22af138ea0dafbdd0/test_runner/fixtures/pageserver/utils.py#L193)
1000s -> 2000s
-
[test_branch_creation_many](https://github.com/neondatabase/neon/pull/11409/files#diff-2ebfe76f89004d563c7e53e3ca82462e1d85e92e6d5588e8e8f598bbe119e927)
1000s
-
[test_ingest_insert_bulk](https://github.com/neondatabase/neon/pull/11409/files#diff-e90e685be4a87053bc264a68740969e6a8872c8897b8b748d0e8c5f683a68d9f)
- with back throttling disabled compute becomes unresponsive for more
than 60 seconds (PG hard-coded client authentication connection timeout)
-
[test_sharded_ingest](https://github.com/neondatabase/neon/pull/11409/files#diff-e8d870165bd44acb9a6d8350f8640b301c1385a4108430b8d6d659b697e4a3f1)
600s -> 1200s

Right now there are only 2 runners of that class, and if we decide to go
with them, we have to check how much that type of runners we need, so
jobs not stuck with waiting for that type of runners available.

However we now decided to run those runners with governor performance
instead of schedutil.
This achieves almost same performance as previous runners but still
achieves consistent results for same commit

Related issue to activate performance governor on these runners
https://github.com/neondatabase/runner/pull/138

## Verification that it helps

### analyze runtimes on new runner for same commit

Table of runtimes for the same commit on different runners in
[run](https://github.com/neondatabase/neon/actions/runs/14417589789)

| Run | Benchmarks (1) | Benchmarks (2) |Benchmarks (3) |Benchmarks (4)
| Benchmarks (5) |
|--------|--------|---------|---------|---------|---------|
| 1 | 1950.37s | 6374.55s |  3646.15s |  4149.48s |  2330.22s |
| 2 | - | 6369.27s |  3666.65s |  4162.42s |  2329.23s |
| Delta % |  - |  0,07 %  | 0,5 %   |   0,3 % | 0,04 %   |
| with governor performance | 1519.57s |  4131.62s |  - | -  |  - |
| second run gov. perf. | 1513.62s |  4134.67s |  - | -  |  - |
| Delta % |  0,3 % |  0,07 %  |  -  |  - | -   |
| speedup gov. performance | 22 % |  35 % |  - | -  |  - |
| current desktop class hetzner runners (main) | 1487.10s | 3699.67s | -
| - | - |
| slower than desktop class | 2 % |  12 % |  - | -  |  - |


In summary, the runtimes for the same commit on this hardware varies
less than 1 %.

---------

Co-authored-by: BodoBolero <peterbendel@neon.tech>
---
 .github/actionlint.yml                            |  1 +
 .github/workflows/build_and_test.yml              |  2 +-
 explained_queries.sql                             |  0
 test_runner/fixtures/pageserver/utils.py          |  2 +-
 test_runner/performance/test_branch_creation.py   |  3 ++-
 .../performance/test_ingest_insert_bulk.py        | 15 ++++++++++++++-
 test_runner/performance/test_sharded_ingest.py    |  2 +-
 7 files changed, 20 insertions(+), 5 deletions(-)
 create mode 100644 explained_queries.sql

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index edc456d611..1d1b50e458 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -6,6 +6,7 @@ self-hosted-runner:
     - small
     - small-metal
     - small-arm64
+    - unit-perf
     - us-east-2
 config-variables:
   - AWS_ECR_REGION
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 46c8cd6fc9..0e67a22bfc 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -284,7 +284,7 @@ jobs:
       statuses: write
       contents: write
       pull-requests: write
-    runs-on: [ self-hosted, small-metal ]
+    runs-on: [ self-hosted, unit-perf ]
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
       credentials:
diff --git a/explained_queries.sql b/explained_queries.sql
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index bc5076758d..8f5234a2fa 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -199,7 +199,7 @@ def wait_for_last_record_lsn(
     """waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
 
     current_lsn = Lsn(0)
-    for i in range(1000):
+    for i in range(2000):
         current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
         if current_lsn >= lsn:
             return current_lsn
diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py
index b2bd94fae7..a3ee30cda2 100644
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -97,6 +97,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int)
     _record_branch_creation_durations(neon_compare, branch_creation_durations)
 
 
+@pytest.mark.timeout(1000)
 @pytest.mark.parametrize("n_branches", [500, 1024])
 @pytest.mark.parametrize("shape", ["one_ancestor", "random"])
 def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str):
@@ -205,7 +206,7 @@ def wait_and_record_startup_metrics(
         assert len(matching) == len(expected_labels)
         return matching
 
-    samples = wait_until(metrics_are_filled)
+    samples = wait_until(metrics_are_filled, timeout=60)
 
     for sample in samples:
         phase = sample.labels["phase"]
diff --git a/test_runner/performance/test_ingest_insert_bulk.py b/test_runner/performance/test_ingest_insert_bulk.py
index 01836b82e9..ed0a6c70bd 100644
--- a/test_runner/performance/test_ingest_insert_bulk.py
+++ b/test_runner/performance/test_ingest_insert_bulk.py
@@ -52,6 +52,8 @@ def test_ingest_insert_bulk(
         # would compete with Pageserver for bandwidth.
         # neon_env_builder.enable_safekeeper_remote_storage(s3_storage())
 
+    neon_env_builder.pageserver_config_override = "wait_lsn_timeout='600 s'"
+
     neon_env_builder.disable_scrub_on_exit()  # immediate shutdown may leave stray layers
     env = neon_env_builder.init_start()
 
@@ -92,7 +94,18 @@ def test_ingest_insert_bulk(
                     worker_rows = rows / CONCURRENCY
                     pool.submit(insert_rows, endpoint, f"table{i}", worker_rows, value)
 
-        end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
+        for attempt in range(5):
+            try:
+                end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
+                break
+            except Exception as e:
+                # if we disable backpressure, postgres can become unresponsive for longer than a minute
+                # and new connection attempts time out in postgres after 1 minute
+                # so if this happens we retry new connection
+                log.error(f"Attempt {attempt + 1}/5: Failed to select current wal lsn: {e}")
+            if attempt == 4:
+                log.error("Exceeded maximum retry attempts for selecting current wal lsn")
+                raise
 
         # Wait for pageserver to ingest the WAL.
         client = env.pageserver.http_client()
diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py
index 94fd54bade..293026d40a 100644
--- a/test_runner/performance/test_sharded_ingest.py
+++ b/test_runner/performance/test_sharded_ingest.py
@@ -13,7 +13,7 @@ from fixtures.neon_fixtures import (
 )
 
 
-@pytest.mark.timeout(600)
+@pytest.mark.timeout(1200)
 @pytest.mark.parametrize("shard_count", [1, 8, 32])
 @pytest.mark.parametrize(
     "wal_receiver_protocol",

From 63a106021ad7e40cdebaff1ac7dee85e52e8c092 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 15 Apr 2025 10:29:36 +0100
Subject: [PATCH 136/140] CI(allure-report-generate): Install allure to /tmp
 (#11579)

## Problem

The `/__w/neon/neon` directory is mounted from host to container and
persists between runs.
Sometimes the next workflow run fails to delete it:

```
Deleting the contents of '/__w/neon/neon'
Error: File was unable to be removed Error: EACCES: permission denied, rmdir '/__w/neon/neon/allure-2.32.2/bin'
```

## Summary of changes
- Download and install allure to `/tmp` which exists in container only

Ref https://github.com/neondatabase/cloud/issues/27186
---
 .github/actions/allure-report-generate/action.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml
index b85ca7874d..c27311f24e 100644
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -70,6 +70,7 @@ runs:
 
     - name: Install Allure
       shell: bash -euxo pipefail {0}
+      working-directory: /tmp
       run: |
         if ! which allure; then
           ALLURE_ZIP=allure-${ALLURE_VERSION}.zip

From 5be94e28c4d4822847a22887dbca1b7bbda55c61 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Tue, 15 Apr 2025 13:00:25 +0200
Subject: [PATCH 137/140] Update the documentation of the cloud regress test
 (#11539)

## Problem
The information in the README.md contained errors, and some information
was missing.
## Summary of changes
Found errors are fixed, and new information is added.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 test_runner/cloud_regress/README.md | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/test_runner/cloud_regress/README.md b/test_runner/cloud_regress/README.md
index 9c460e2764..f341f3c818 100644
--- a/test_runner/cloud_regress/README.md
+++ b/test_runner/cloud_regress/README.md
@@ -3,19 +3,35 @@
 * Create a Neon project on staging.
 * Grant the superuser privileges to the DB user.
 * (Optional) create a branch for testing
-* Configure the endpoint by updating the control-plane database with the following settings:
+* Add the following settings to the `pg_settings` section of the default endpoint configuration for the project using the admin interface:
   * `Timeone`: `America/Los_Angeles`
   * `DateStyle`: `Postgres,MDY`
   * `compute_query_id`: `off`
+* Add the following section to the project configuration:
+```json
+"preload_libraries": {
+    "use_defaults": false,
+    "enabled_libraries": []
+  }
+```
 * Checkout the actual `Neon` sources
 * Patch the sql and expected files for the specific PostgreSQL version, e.g. for v17:
 ```bash
 $ cd vendor/postgres-v17
 $ patch -p1 <../../compute/patches/cloud_regress_pg17.patch
 ```
+* Set the environment variables (please modify according your configuration):
+```bash
+$ export DEFAULT_PG_VERSION=17
+$ export BUILD_TYPE=release
+```
+* Build the Neon binaries see [README.md](../../README.md)
 * Set the environment variable `BENCHMARK_CONNSTR` to the connection URI of your project.
-* Set the environment variable `PG_VERSION` to the version of your project.
+* Update poetry, run
+```bash
+$ scripts/pysync
+```
 * Run 
 ```bash
-$ pytest -m remote_cluster -k cloud_regress
+$ scripts/pytest -m remote_cluster -k cloud_regress
 ```
\ No newline at end of file

From 19bea5fd0c048f7867cb41f5a1b705951bc1e08b Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 15 Apr 2025 12:23:41 +0100
Subject: [PATCH 138/140] CI: do not wait for tests to trigger deploy job
 (#11548)

## Problem

There is too much delay between merging a PR into `main` and deploying
the changes to staging

## Summary of changes
- Trigger `deploy` job without waiting for `build-and-test-locally` job
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 0e67a22bfc..80c4511b36 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1271,7 +1271,7 @@ jobs:
           exit 1
 
   deploy:
-    needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
+    needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, trigger-custom-extensions-build-and-wait ]
     # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
     if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
     permissions:

From a4ea7d6194c221e796f89dad563bded89e8bd94f Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 15 Apr 2025 09:58:32 -0400
Subject: [PATCH 139/140] fix(pageserver): gc-compaction verification false
 failure (#11564)

## Problem

https://github.com/neondatabase/neon/pull/11515 introduced a bug that
some key history cannot be verified.

If a key only exists above the horizon, the verification will fail for
its first occurrence because the history does not exist at that point.

As gc-compaction skips a key range whenever an error occurs, it might be
doing some wasted work in staging/prod now. But I'm not planning a
hotfix this week as the bug doesn't affect correctness/performance.

## Summary of changes

Allow keys with only above horizon history in the verification.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/timeline/compaction.rs | 31 +++++++++++++-------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 2a450795fd..3d5f11aeb9 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -819,7 +819,15 @@ impl KeyHistoryRetention {
             base_img: &Option<(Lsn, &Bytes)>,
             history: &[(Lsn, &NeonWalRecord)],
             tline: &Arc<Timeline>,
+            skip_empty: bool,
         ) -> anyhow::Result<()> {
+            if base_img.is_none() && history.is_empty() {
+                if skip_empty {
+                    return Ok(());
+                }
+                anyhow::bail!("verification failed: key {} has no history at {}", key, lsn);
+            };
+
             let mut records = history
                 .iter()
                 .map(|(lsn, val)| (*lsn, (*val).clone()))
@@ -860,17 +868,12 @@ impl KeyHistoryRetention {
             if *retain_lsn >= min_lsn {
                 // Only verify after the key appears in the full history for the first time.
 
-                if base_img.is_none() && history.is_empty() {
-                    anyhow::bail!(
-                        "verificatoin failed: key {} has no history at {}",
-                        key,
-                        retain_lsn
-                    );
-                };
                 // We don't modify history: in theory, we could replace the history with a single
                 // image as in `generate_key_retention` to make redos at later LSNs faster. But we
                 // want to verify everything as if they are read from the real layer map.
-                collect_and_verify(key, *retain_lsn, &base_img, &history, tline).await?;
+                collect_and_verify(key, *retain_lsn, &base_img, &history, tline, false)
+                    .await
+                    .context("below horizon retain_lsn")?;
             }
         }
 
@@ -878,13 +881,17 @@ impl KeyHistoryRetention {
             match val {
                 Value::Image(img) => {
                     // Above the GC horizon, we verify every time we see an image.
-                    collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
+                    collect_and_verify(key, *lsn, &base_img, &history, tline, true)
+                        .await
+                        .context("above horizon full image")?;
                     base_img = Some((*lsn, img));
                     history.clear();
                 }
                 Value::WalRecord(rec) if val.will_init() => {
                     // Above the GC horizon, we verify every time we see an init record.
-                    collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
+                    collect_and_verify(key, *lsn, &base_img, &history, tline, true)
+                        .await
+                        .context("above horizon init record")?;
                     base_img = None;
                     history.clear();
                     history.push((*lsn, rec));
@@ -895,7 +902,9 @@ impl KeyHistoryRetention {
             }
         }
         // Ensure the latest record is readable.
-        collect_and_verify(key, max_lsn, &base_img, &history, tline).await?;
+        collect_and_verify(key, max_lsn, &base_img, &history, tline, false)
+            .await
+            .context("latest record")?;
         Ok(())
     }
 }

From e31455d93694545998aa3a173c866ae965b91190 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Tue, 15 Apr 2025 16:06:01 +0200
Subject: [PATCH 140/140] Add the tests for the extensions `pg_jsonschema` and
 `pg_session_jwt` (#11323)

## Problem
`pg_jsonschema` and `pg_session_jwt` are not yet covered by tests
## Summary of changes
Added the tests for these extensions.
---
 .../ext-src/pg_jsonschema-src/Makefile        |  8 ++
 .../expected/jsonschema_edge_cases.out        | 87 +++++++++++++++++++
 .../expected/jsonschema_valid_api.out         | 65 ++++++++++++++
 .../sql/jsonschema_edge_cases.sql             | 66 ++++++++++++++
 .../sql/jsonschema_valid_api.sql              | 48 ++++++++++
 .../ext-src/pg_session_jwt-src/Makefile       |  9 ++
 .../expected/basic_functions.out              | 35 ++++++++
 .../sql/basic_functions.sql                   | 19 ++++
 8 files changed, 337 insertions(+)
 create mode 100644 docker-compose/ext-src/pg_jsonschema-src/Makefile
 create mode 100644 docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_edge_cases.out
 create mode 100644 docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_valid_api.out
 create mode 100644 docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_edge_cases.sql
 create mode 100644 docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_valid_api.sql
 create mode 100644 docker-compose/ext-src/pg_session_jwt-src/Makefile
 create mode 100644 docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out
 create mode 100644 docker-compose/ext-src/pg_session_jwt-src/sql/basic_functions.sql

diff --git a/docker-compose/ext-src/pg_jsonschema-src/Makefile b/docker-compose/ext-src/pg_jsonschema-src/Makefile
new file mode 100644
index 0000000000..d79364d8b5
--- /dev/null
+++ b/docker-compose/ext-src/pg_jsonschema-src/Makefile
@@ -0,0 +1,8 @@
+EXTENSION = pg_jsonschema
+DATA = pg_jsonschema--1.0.sql
+REGRESS = jsonschema_valid_api  jsonschema_edge_cases
+REGRESS_OPTS = --load-extension=pg_jsonschema
+
+PG_CONFIG ?= pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
diff --git a/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_edge_cases.out b/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_edge_cases.out
new file mode 100644
index 0000000000..f4089bfb13
--- /dev/null
+++ b/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_edge_cases.out
@@ -0,0 +1,87 @@
+-- Schema with enums, nulls, extra properties disallowed
+SELECT jsonschema_is_valid('{
+  "type": "object",
+  "properties": {
+    "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+    "email": { "type": ["string", "null"], "format": "email" }
+  },
+  "required": ["status"],
+  "additionalProperties": false
+}'::json);
+ jsonschema_is_valid 
+---------------------
+ t
+(1 row)
+
+-- Valid enum and null email
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "email": null}'::json
+);
+ jsonschema_validation_errors 
+------------------------------
+ {}
+(1 row)
+
+-- Invalid enum value
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "disabled", "email": null}'::json
+);
+                     jsonschema_validation_errors                     
+----------------------------------------------------------------------
+ {"\"disabled\" is not one of [\"active\",\"inactive\",\"pending\"]"}
+(1 row)
+
+-- Invalid email format (assuming format is validated)
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "email": "not-an-email"}'::json
+);
+      jsonschema_validation_errors       
+-----------------------------------------
+ {"\"not-an-email\" is not a \"email\""}
+(1 row)
+
+-- Extra property not allowed
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "extra": "should not be here"}'::json
+);
+                    jsonschema_validation_errors                    
+--------------------------------------------------------------------
+ {"Additional properties are not allowed ('extra' was unexpected)"}
+(1 row)
+
diff --git a/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_valid_api.out b/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_valid_api.out
new file mode 100644
index 0000000000..73f0a562e7
--- /dev/null
+++ b/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_valid_api.out
@@ -0,0 +1,65 @@
+-- Define schema
+SELECT jsonschema_is_valid('{
+  "type": "object",
+  "properties": {
+    "username": { "type": "string" },
+    "age": { "type": "integer" }
+  },
+  "required": ["username"]
+}'::json);
+ jsonschema_is_valid 
+---------------------
+ t
+(1 row)
+
+-- Valid instance
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"username": "alice", "age": 25}'::json
+);
+ jsonschema_validation_errors 
+------------------------------
+ {}
+(1 row)
+
+-- Invalid instance: missing required "username"
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"age": 25}'::json
+);
+      jsonschema_validation_errors       
+-----------------------------------------
+ {"\"username\" is a required property"}
+(1 row)
+
+-- Invalid instance: wrong type for "age"
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"username": "bob", "age": "twenty"}'::json
+);
+       jsonschema_validation_errors        
+-------------------------------------------
+ {"\"twenty\" is not of type \"integer\""}
+(1 row)
+
diff --git a/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_edge_cases.sql b/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_edge_cases.sql
new file mode 100644
index 0000000000..edad8cca16
--- /dev/null
+++ b/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_edge_cases.sql
@@ -0,0 +1,66 @@
+-- Schema with enums, nulls, extra properties disallowed
+SELECT jsonschema_is_valid('{
+  "type": "object",
+  "properties": {
+    "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+    "email": { "type": ["string", "null"], "format": "email" }
+  },
+  "required": ["status"],
+  "additionalProperties": false
+}'::json);
+
+-- Valid enum and null email
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "email": null}'::json
+);
+
+-- Invalid enum value
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "disabled", "email": null}'::json
+);
+
+-- Invalid email format (assuming format is validated)
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "email": "not-an-email"}'::json
+);
+
+-- Extra property not allowed
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "extra": "should not be here"}'::json
+);
diff --git a/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_valid_api.sql b/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_valid_api.sql
new file mode 100644
index 0000000000..44539ed6ce
--- /dev/null
+++ b/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_valid_api.sql
@@ -0,0 +1,48 @@
+-- Define schema
+SELECT jsonschema_is_valid('{
+  "type": "object",
+  "properties": {
+    "username": { "type": "string" },
+    "age": { "type": "integer" }
+  },
+  "required": ["username"]
+}'::json);
+
+-- Valid instance
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"username": "alice", "age": 25}'::json
+);
+
+-- Invalid instance: missing required "username"
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"age": 25}'::json
+);
+
+-- Invalid instance: wrong type for "age"
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"username": "bob", "age": "twenty"}'::json
+);
diff --git a/docker-compose/ext-src/pg_session_jwt-src/Makefile b/docker-compose/ext-src/pg_session_jwt-src/Makefile
new file mode 100644
index 0000000000..c61c9777ad
--- /dev/null
+++ b/docker-compose/ext-src/pg_session_jwt-src/Makefile
@@ -0,0 +1,9 @@
+EXTENSION = pg_session_jwt
+
+REGRESS = basic_functions
+REGRESS_OPTS = --load-extension=$(EXTENSION)
+export PGOPTIONS = -c pg_session_jwt.jwk={"crv":"Ed25519","kty":"OKP","x":"R_Abz-63zJ00l-IraL5fQhwkhGVZCSooQFV5ntC3C7M"}
+
+PG_CONFIG ?= pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
\ No newline at end of file
diff --git a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out
new file mode 100644
index 0000000000..ca54864ecd
--- /dev/null
+++ b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out
@@ -0,0 +1,35 @@
+-- Basic functionality tests for pg_session_jwt
+-- Test auth.init() function
+SELECT auth.init();
+ init 
+------
+ 
+(1 row)
+
+-- Test an invalid JWT
+SELECT auth.jwt_session_init('INVALID-JWT');
+ERROR:  invalid JWT encoding
+-- Test creating a session with an expired JWT
+SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw');
+ERROR:  Token used after it has expired
+-- Test creating a session with a valid JWT
+SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg');
+ jwt_session_init 
+------------------
+ 
+(1 row)
+
+-- Test auth.session() function
+SELECT auth.session();
+                                 session                                 
+-------------------------------------------------------------------------
+ {"exp": 4896164252, "iat": 1742564252, "jti": 434343, "sub": "user123"}
+(1 row)
+
+-- Test auth.user_id() function
+SELECT auth.user_id() AS user_id;
+ user_id 
+---------
+ user123
+(1 row)
+
diff --git a/docker-compose/ext-src/pg_session_jwt-src/sql/basic_functions.sql b/docker-compose/ext-src/pg_session_jwt-src/sql/basic_functions.sql
new file mode 100644
index 0000000000..6c1ab90c0c
--- /dev/null
+++ b/docker-compose/ext-src/pg_session_jwt-src/sql/basic_functions.sql
@@ -0,0 +1,19 @@
+-- Basic functionality tests for pg_session_jwt
+
+-- Test auth.init() function
+SELECT auth.init();
+
+-- Test an invalid JWT
+SELECT auth.jwt_session_init('INVALID-JWT');
+
+-- Test creating a session with an expired JWT
+SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw');
+
+-- Test creating a session with a valid JWT
+SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg');
+
+-- Test auth.session() function
+SELECT auth.session();
+
+-- Test auth.user_id() function
+SELECT auth.user_id() AS user_id;
\ No newline at end of file