From 1a29f5672a0fe633bfdf81ec1edef805c1a2d86f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 28 Apr 2025 10:03:10 +0100 Subject: [PATCH 01/76] CI(check-macos-build): trigger workflow automatically for PRs (#11706) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem - if-conditions for the `check-macos-build` workflow don't trigger it on PRs with relevant changes (in Rust code or Postgres submodules). - Jobs in the workflow depend on the presence of a cache, which is not guaranteed. ## Summary of changes - Fix if-conditions - Use artifacts on top of cache whenever the workflow depends on it — the cache might not be available --- .github/workflows/build-macos.yml | 133 ++++++++++++------------ .github/workflows/neon_extra_builds.yml | 4 - 2 files changed, 69 insertions(+), 68 deletions(-) diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index ecd135cc3d..0f7fa3e813 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -34,11 +34,10 @@ permissions: jobs: build-pgxn: if: | - (inputs.pg_versions != '[]' || inputs.rebuild_everything) && ( - contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' - ) + inputs.pg_versions != '[]' || inputs.rebuild_everything || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' timeout-minutes: 30 runs-on: macos-15 strategy: @@ -100,13 +99,21 @@ jobs: run: | make postgres-headers-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu) + - name: Upload "pg_install/${{ matrix.postgres-version }}" artifact + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: pg_install--${{ matrix.postgres-version }} + path: pg_install/${{ matrix.postgres-version }} + # The artifact is supposed to be used by the next job in the same workflow, + # so there’s no need to store it for too long. 
+ retention-days: 1 + build-walproposer-lib: if: | - (inputs.pg_versions != '[]' || inputs.rebuild_everything) && ( - contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' - ) + inputs.pg_versions != '[]' || inputs.rebuild_everything || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' timeout-minutes: 30 runs-on: macos-15 needs: [build-pgxn] @@ -127,12 +134,11 @@ jobs: id: pg_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}" - - name: Cache postgres v17 build - id: cache_pg - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + - name: Download "pg_install/v17" artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: + name: pg_install--v17 path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache walproposer-lib id: cache_walproposer_lib @@ -163,13 +169,21 @@ jobs: run: make walproposer-lib -j$(sysctl -n hw.ncpu) + - name: Upload "pg_install/build/walproposer-lib" artifact + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: pg_install--build--walproposer-lib + path: pg_install/build/walproposer-lib + # The artifact is supposed to be used by the next job in the same workflow, + # so there’s no need to store it for too long. + retention-days: 1 + cargo-build: if: | - (inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything) && ( - contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' - ) + inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' timeout-minutes: 30 runs-on: macos-15 needs: [build-pgxn, build-walproposer-lib] @@ -188,45 +202,43 @@ jobs: with: submodules: true - - name: Set pg v14 for caching - id: pg_rev_v14 - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) | tee -a "${GITHUB_OUTPUT}" - - name: Set pg v15 for caching - id: pg_rev_v15 - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) | tee -a "${GITHUB_OUTPUT}" - - name: Set pg v16 for caching - id: pg_rev_v16 - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) | tee -a "${GITHUB_OUTPUT}" - - name: Set pg v17 for caching - id: pg_rev_v17 - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}" - - - name: Cache postgres v14 build - id: cache_pg - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + - name: Download "pg_install/v14" artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: + name: pg_install--v14 path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v14-${{ steps.pg_rev_v14.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - name: Cache postgres v15 build - id: cache_pg_v15 - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 - with: - path: 
pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v15-${{ steps.pg_rev_v15.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - name: Cache postgres v16 build - id: cache_pg_v16 - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v16-${{ steps.pg_rev_v16.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - name: Cache postgres v17 build - id: cache_pg_v17 - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 - with: - path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - name: Cache cargo deps (only for v17) + - name: Download "pg_install/v15" artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + name: pg_install--v15 + path: pg_install/v15 + + - name: Download "pg_install/v16" artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + name: pg_install--v16 + path: pg_install/v16 + + - name: Download "pg_install/v17" artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + name: pg_install--v17 + path: pg_install/v17 + + - name: Download "pg_install/build/walproposer-lib" artifact + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + name: pg_install--build--walproposer-lib + path: pg_install/build/walproposer-lib + + # `actions/download-artifact` doesn't preserve permissions: + # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss + - name: Make pg_install/v*/bin/* executable + run: | + chmod +x pg_install/v*/bin/* + + - name: Cache cargo deps uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 with: path: | @@ -236,13 +248,6 @@ jobs: target key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - - name: Cache walproposer-lib - id: cache_walproposer_lib - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 - with: - path: pg_install/build/walproposer-lib - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - name: Install build dependencies run: | brew install flex bison openssl protobuf icu4c @@ -252,8 +257,8 @@ jobs: echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - - name: Run cargo build (only for v17) + - name: Run cargo build run: cargo build --all --release -j$(sysctl -n hw.ncpu) - - name: Check that no warnings are produced (only for v17) + - name: Check that no warnings are produced run: ./run_clippy.sh diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 11aa4d2c94..79467c8f95 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -69,10 +69,6 @@ jobs: check-macos-build: needs: [ check-permissions, files-changed ] - if: | - contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' uses: ./.github/workflows/build-macos.yml with: pg_versions: ${{ needs.files-changed.outputs.postgres_changes 
}} From 32393b43938db01b8c683314b255897c82082576 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 28 Apr 2025 12:29:44 +0100 Subject: [PATCH 02/76] pg-sni-router: support compute TLS on different port (#11732) ## Problem pg-sni-router isn't aware of compute TLS ## Summary of changes If connections come in on port 4433, we require TLS to compute from pg-sni-router --- proxy/src/binary/pg_sni_router.rs | 103 ++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 11 deletions(-) diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index aef5c9383e..19be058ac3 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -7,13 +7,14 @@ use std::{net::SocketAddr, sync::Arc}; use anyhow::{Context, anyhow, bail, ensure}; use clap::Arg; -use futures::TryFutureExt; use futures::future::Either; +use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use rustls::crypto::ring; -use rustls::pki_types::PrivateKeyDer; -use tokio::io::{AsyncRead, AsyncWrite}; +use rustls::pki_types::{DnsName, PrivateKeyDer}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use tokio::net::TcpListener; +use tokio_rustls::TlsConnector; use tokio_util::sync::CancellationToken; use tracing::{Instrument, error, info}; use utils::project_git_version; @@ -38,6 +39,12 @@ fn cli() -> clap::Command { .help("listen for incoming client connections on ip:port") .default_value("127.0.0.1:4432"), ) + .arg( + Arg::new("listen-tls") + .long("listen-tls") + .help("listen for incoming client connections on ip:port, requiring TLS to compute") + .default_value("127.0.0.1:4433"), + ) .arg( Arg::new("tls-key") .short('k') @@ -122,31 +129,58 @@ pub async fn run() -> anyhow::Result<()> { _ => bail!("tls-key and tls-cert must be specified"), }; + let compute_tls_config = + Arc::new(crate::tls::client_config::compute_client_config_with_root_certs()?); + // Start listening for incoming client connections let proxy_address: SocketAddr = args .get_one::("listen") - .expect("string argument defined") + .expect("listen argument defined") .parse()?; + let proxy_address_compute_tls: SocketAddr = args + .get_one::("listen-tls") + .expect("listen-tls argument defined") + .parse()?; + info!("Starting sni router on {proxy_address}"); + info!("Starting sni router on {proxy_address_compute_tls}"); let proxy_listener = TcpListener::bind(proxy_address).await?; + let proxy_listener_compute_tls = TcpListener::bind(proxy_address_compute_tls).await?; let cancellation_token = CancellationToken::new(); + let dest = Arc::new(destination); let main = tokio::spawn(task_main( - Arc::new(destination), - tls_config, + dest.clone(), + tls_config.clone(), + None, tls_server_end_point, proxy_listener, cancellation_token.clone(), - )); + )) + .map(crate::error::flatten_err); + + let main_tls = tokio::spawn(task_main( + dest, + tls_config, + Some(compute_tls_config), + tls_server_end_point, + proxy_listener_compute_tls, + cancellation_token.clone(), + )) + .map(crate::error::flatten_err); let signals_task = tokio::spawn(crate::signals::handle(cancellation_token, || {})); // the signal task cant ever succeed. // the main task can error, or can succeed on cancellation. 
// we want to immediately exit on either of these cases + let main = futures::future::try_join(main, main_tls); let signal = match futures::future::select(signals_task, main).await { Either::Left((res, _)) => crate::error::flatten_err(res)?, - Either::Right((res, _)) => return crate::error::flatten_err(res), + Either::Right((res, _)) => { + res?; + return Ok(()); + } }; // maintenance tasks return `Infallible` success values, this is an impossible value @@ -157,6 +191,7 @@ pub async fn run() -> anyhow::Result<()> { async fn task_main( dest_suffix: Arc, tls_config: Arc, + compute_tls_config: Option>, tls_server_end_point: TlsServerEndPoint, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, @@ -175,6 +210,7 @@ async fn task_main( let session_id = uuid::Uuid::new_v4(); let tls_config = Arc::clone(&tls_config); let dest_suffix = Arc::clone(&dest_suffix); + let compute_tls_config = compute_tls_config.clone(); connections.spawn( async move { @@ -192,7 +228,15 @@ async fn task_main( crate::metrics::Protocol::SniRouter, "sni", ); - handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await + handle_client( + ctx, + dest_suffix, + tls_config, + compute_tls_config, + tls_server_end_point, + socket, + ) + .await } .unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. @@ -268,6 +312,7 @@ async fn handle_client( ctx: RequestContext, dest_suffix: Arc, tls_config: Arc, + compute_tls_config: Option>, tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { @@ -288,7 +333,33 @@ async fn handle_client( info!("destination: {}", destination); - let mut client = tokio::net::TcpStream::connect(destination).await?; + let mut client = tokio::net::TcpStream::connect(&destination).await?; + + let client = if let Some(compute_tls_config) = compute_tls_config { + info!("upgrading TLS"); + + // send SslRequest + client + .write_all(b"\x00\x00\x00\x08\x04\xd2\x16\x2f") + .await?; + + // wait for S/N respons + let mut resp = b'N'; + client.read_exact(std::slice::from_mut(&mut resp)).await?; + + // error if not S + ensure!(resp == b'S', "compute refused TLS"); + + // upgrade to TLS. + let domain = DnsName::try_from(destination)?; + let domain = rustls::pki_types::ServerName::DnsName(domain); + let client = TlsConnector::from(compute_tls_config) + .connect(domain, client) + .await?; + Connection::Tls(client) + } else { + Connection::Raw(client) + }; // doesn't yet matter as pg-sni-router doesn't report analytics logs ctx.set_success(); @@ -297,9 +368,19 @@ async fn handle_client( // Starting from here we only proxy the client's traffic. 
info!("performing the proxy pass..."); - match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await { + let res = match client { + Connection::Raw(mut c) => copy_bidirectional_client_compute(&mut tls_stream, &mut c).await, + Connection::Tls(mut c) => copy_bidirectional_client_compute(&mut tls_stream, &mut c).await, + }; + + match res { Ok(_) => Ok(()), Err(ErrorSource::Client(err)) => Err(err).context("client"), Err(ErrorSource::Compute(err)) => Err(err).context("compute"), } } + +enum Connection { + Raw(tokio::net::TcpStream), + Tls(tokio_rustls::client::TlsStream), +} From 606f14034ee591c3c7832c39a5628480935f05a6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 28 Apr 2025 13:52:44 +0200 Subject: [PATCH 03/76] pageserver: improve `pageserver_smgr_query_seconds` buckets (#11680) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem The `pageserver_smgr_query_seconds` buckets are too coarse, using powers of 10: 1 µs, 10 µs, 100 µs, 1 ms, 10 ms, 100 ms, 1 s, 10 s, 100 s. This is one of our most crucial latency metrics, and needs better resolution. Touches #11594. ## Summary of changes This patch uses buckets with better resolution around 1 ms (the typical latency): * 0.6 ms * 1 ms * 3 ms * 6 ms * 10 ms * 30 ms * 100 ms * 1 s * 3 s These will be the same as the compute's `compute_getpage_wait_seconds`, to make them comparable across the compute and Pageserver: https://github.com/neondatabase/flux-fleet/pull/579. We sacrifice buckets above 3 s, since these can already be considered "too slow". This does not change the previously used `CRITICAL_OP_BUCKETS`, which is also used for other operations on different timescales (e.g. LSN waits). We should consider replacing this with more appropriate buckets for specific operations, since it covers a large span with low resolution. --- pageserver/src/metrics.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b16970c911..9a6c3f2378 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1774,8 +1774,12 @@ static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy = Lazy::new(| .expect("failed to define a metric") }); -// Alias so all histograms recording per-timeline smgr timings use the same buckets. -static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS; +/// Per-timeline smgr histogram buckets should be the same as the compute buckets, such that the +/// metrics are comparable across compute and Pageserver. See also: +/// +/// +static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = + &[0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.1, 1.0, 3.0]; static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { register_histogram_vec!( From 60b9fb1baf4cba732cff4792b9a97d755794b7e2 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 28 Apr 2025 15:16:29 +0300 Subject: [PATCH 04/76] Ignore unlogged LSNs in set last written LSN (#11743) ## Problem See https://github.com/neondatabase/neon/issues/11718 and https://neondb.slack.com/archives/C033RQ5SPDH/p1745122797538509 GIST other indexes performing "unlogged build" are using so called fake LSNs - not a real LSN, but something like 0/1. Been stored in lwlsn cache they cause incorrect lookup at PS. ## Summary of changes Do not store fake LSNs in LwLSN hash. 
Co-authored-by: Konstantin Knizhnik --- pgxn/neon/neon_lwlsncache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/neon_lwlsncache.c b/pgxn/neon/neon_lwlsncache.c index 59222eb855..6959da55cb 100644 --- a/pgxn/neon/neon_lwlsncache.c +++ b/pgxn/neon/neon_lwlsncache.c @@ -396,7 +396,7 @@ SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn, XLogRecPtr neon_set_lwlsn_block_range(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks) { - if (lsn == InvalidXLogRecPtr || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0) + if (lsn < FirstNormalUnloggedLSN || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0) return lsn; LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); @@ -505,4 +505,5 @@ neon_set_lwlsn_db(XLogRecPtr lsn) { NRelFileInfo dummyNode = {InvalidOid, InvalidOid, InvalidOid}; return neon_set_lwlsn_block(lsn, dummyNode, MAIN_FORKNUM, 0); -} \ No newline at end of file +} + From 2b1d2a55d69c8f3157cf4566ec49b85d503a0c28 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 28 Apr 2025 13:44:28 +0100 Subject: [PATCH 05/76] CI: fix typo oicd -> oidc (#11747) ## Problem It's OIDC (OpenID Connect), not OICD ## Summary of changes - Rename actions input `aws-oicd-role-arn` -> `aws-oidc-role-arn` --- .../actions/allure-report-generate/action.yml | 4 +- .../actions/allure-report-store/action.yml | 4 +- .github/actions/download/action.yml | 4 +- .../actions/run-python-test-set/action.yml | 14 ++--- .github/actions/save-coverage-data/action.yml | 4 +- .github/actions/upload/action.yml | 4 +- .../workflows/_benchmarking_preparation.yml | 2 +- .github/workflows/_build-and-test-locally.yml | 4 +- .github/workflows/benchmarking.yml | 58 +++++++++---------- .../workflows/build_and_run_selected_test.yml | 2 +- .github/workflows/build_and_test.yml | 8 +-- .../build_and_test_with_sanitizers.yml | 2 +- .github/workflows/cloud-regress.yml | 6 +- .github/workflows/ingest_benchmark.yml | 10 ++-- .github/workflows/large_oltp_benchmark.yml | 18 +++--- .github/workflows/periodic_pagebench.yml | 2 +- .github/workflows/pg-clients.yml | 12 ++-- .github/workflows/random-ops-test.yml | 6 +- 18 files changed, 82 insertions(+), 82 deletions(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index c27311f24e..fca6a0cfa5 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -7,7 +7,7 @@ inputs: type: boolean required: false default: false - aws-oicd-role-arn: + aws-oidc-role-arn: description: 'OIDC role arn to interract with S3' required: true @@ -88,7 +88,7 @@ runs: if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws-oicd-role-arn }} + role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 687bfd49af..2844344739 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -8,7 +8,7 @@ inputs: unique-key: description: 'string to distinguish different results in the same run' required: true - aws-oicd-role-arn: + aws-oidc-role-arn: description: 'OIDC role arn to interract with S3' 
required: true @@ -39,7 +39,7 @@ runs: if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws-oicd-role-arn }} + role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report - name: Upload test results diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index 14b2ef8eac..d3829618da 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -15,7 +15,7 @@ inputs: prefix: description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false - aws-oicd-role-arn: + aws-oidc-role-arn: description: 'OIDC role arn to interract with S3' required: true @@ -25,7 +25,7 @@ runs: - uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws-oicd-role-arn }} + role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 - name: Download artifact diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 1c65244ef4..6f2b48444a 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -53,7 +53,7 @@ inputs: description: 'benchmark durations JSON' required: false default: '{}' - aws-oicd-role-arn: + aws-oidc-role-arn: description: 'OIDC role arn to interract with S3' required: true @@ -66,7 +66,7 @@ runs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact path: /tmp/neon - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - name: Download Neon binaries for the previous release if: inputs.build_type != 'remote' @@ -75,7 +75,7 @@ runs: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon-previous prefix: latest - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - name: Download compatibility snapshot if: inputs.build_type != 'remote' @@ -87,7 +87,7 @@ runs: # The lack of compatibility snapshot (for example, for the new Postgres version) # shouldn't fail the whole job. Only relevant test should fail. 
skip-if-does-not-exist: true - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - name: Checkout if: inputs.needs_postgres_source == 'true' @@ -228,13 +228,13 @@ runs: # The lack of compatibility snapshot shouldn't fail the job # (for example if we didn't run the test for non build-and-test workflow) skip-if-does-not-exist: true - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - uses: aws-actions/configure-aws-credentials@v4 if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws-oicd-role-arn }} + role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report - name: Upload test results @@ -243,4 +243,4 @@ runs: with: report-dir: /tmp/test_output/allure/results unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}-${{ runner.arch }} - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} diff --git a/.github/actions/save-coverage-data/action.yml b/.github/actions/save-coverage-data/action.yml index 1bbea5400f..a0e2ab8521 100644 --- a/.github/actions/save-coverage-data/action.yml +++ b/.github/actions/save-coverage-data/action.yml @@ -14,11 +14,11 @@ runs: name: coverage-data-artifact path: /tmp/coverage skip-if-does-not-exist: true # skip if there's no previous coverage to download - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} - name: Upload coverage data uses: ./.github/actions/upload with: name: coverage-data-artifact path: /tmp/coverage - aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} + aws-oidc-role-arn: ${{ inputs.aws-oidc-role-arn }} diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index ac5579ccea..ebb2443476 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -14,7 +14,7 @@ inputs: prefix: description: "S3 prefix. 
Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false - aws-oicd-role-arn: + aws-oidc-role-arn: description: "the OIDC role arn for aws auth" required: false default: "" @@ -61,7 +61,7 @@ runs: uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws-oicd-role-arn }} + role-to-assume: ${{ inputs.aws-oidc-role-arn }} role-duration-seconds: 3600 - name: Upload artifact diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index df107920c1..38d956bda7 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -81,7 +81,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # we create a table that has one row for each database that we want to restore with the status whether the restore is done - name: Create benchmark_restore_status table if it does not exist diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 4f7d6026f2..7cede309f3 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -323,7 +323,7 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact path: /tmp/neon - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Check diesel schema if: inputs.build-type == 'release' && inputs.arch == 'x64' @@ -394,7 +394,7 @@ jobs: rerun_failed: ${{ inputs.test-run-count == 1 }} pg_version: ${{ matrix.pg_version }} sanitizers: ${{ inputs.sanitizers }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds. # Attempt to stop tests gracefully to generate test reports # until they are forcibly stopped by the stricter `timeout-minutes` limit. diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 8af23820f4..5107f457e2 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -114,7 +114,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -132,7 +132,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. 
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests @@ -165,7 +165,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -222,8 +222,8 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + - name: Verify that cumulative statistics are preserved uses: ./.github/actions/run-python-test-set with: @@ -233,7 +233,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 3600 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -282,7 +282,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Run Logical Replication benchmarks uses: ./.github/actions/run-python-test-set @@ -293,7 +293,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -310,7 +310,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -322,7 +322,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -505,7 +505,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project if: contains(fromJSON('["neonvm-captest-new", "neonvm-captest-new-many-tables", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) @@ -557,7 +557,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_perf_many_relations pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -573,7 +573,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k 
test_pgbench_remote_init pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -588,7 +588,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -603,7 +603,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -621,7 +621,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -694,7 +694,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -726,7 +726,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -741,7 +741,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -752,7 +752,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -828,7 +828,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -871,7 +871,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 43200 -k test_clickbench pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -885,7 +885,7 @@ 
jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -954,7 +954,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get Connstring Secret Name run: | @@ -1003,7 +1003,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -1015,7 +1015,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -1078,7 +1078,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -1121,7 +1121,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -1132,7 +1132,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/build_and_run_selected_test.yml b/.github/workflows/build_and_run_selected_test.yml index f22fe310ab..7f1eb991c4 100644 --- a/.github/workflows/build_and_run_selected_test.yml +++ b/.github/workflows/build_and_run_selected_test.yml @@ -93,7 +93,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_DEV }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f14222bb4e..1791cddacc 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -317,7 +317,7 @@ jobs: extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} pg_version: v16 - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -384,7 +384,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: 
store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -451,14 +451,14 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get coverage artifact uses: ./.github/actions/download with: name: coverage-data-artifact path: /tmp/coverage - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml index c31b05fea2..c54448dedc 100644 --- a/.github/workflows/build_and_test_with_sanitizers.yml +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -117,7 +117,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 7ae8d46000..d96c595294 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -89,7 +89,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create a new branch id: create-branch @@ -105,7 +105,7 @@ jobs: test_selection: cloud_regress pg_version: ${{matrix.pg-version}} extra_params: -m remote_cluster - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}} @@ -122,7 +122,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 35e4838a86..a8d2d69807 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -32,7 +32,7 @@ jobs: fail-fast: false # allow other variants to continue even if one fails matrix: include: - - target_project: new_empty_project_stripe_size_2048 + - target_project: new_empty_project_stripe_size_2048 stripe_size: 2048 # 16 MiB postgres_version: 16 disable_sharding: false @@ -98,7 +98,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} @@ -110,10 +110,10 @@ jobs: compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck api_key: ${{ secrets.NEON_STAGING_API_KEY }} shard_split_project: ${{ matrix.stripe_size != null && 'true' || 'false' }} - admin_api_key: ${{ 
secrets.NEON_STAGING_ADMIN_API_KEY }} + admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} shard_count: 8 stripe_size: ${{ matrix.stripe_size }} - disable_sharding: ${{ matrix.disable_sharding }} + disable_sharding: ${{ matrix.disable_sharding }} - name: Initialize Neon project if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} @@ -171,7 +171,7 @@ jobs: extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb pg_version: v${{ matrix.postgres_version }} save_perf_report: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }} TARGET_PROJECT_TYPE: ${{ matrix.target_project }} diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml index 2b63cbd044..42dcc8e918 100644 --- a/.github/workflows/large_oltp_benchmark.yml +++ b/.github/workflows/large_oltp_benchmark.yml @@ -33,9 +33,9 @@ jobs: fail-fast: false # allow other variants to continue even if one fails matrix: include: - - target: new_branch + - target: new_branch custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 - - target: reuse_branch + - target: reuse_branch custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: @@ -43,7 +43,7 @@ jobs: statuses: write id-token: write # aws-actions/configure-aws-credentials env: - TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h + TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install PG_VERSION: 16 # pre-determined by pre-determined project @@ -85,7 +85,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Branch for large tenant if: ${{ matrix.target == 'new_branch' }} @@ -129,7 +129,7 @@ jobs: ${PSQL} "${BENCHMARK_CONNSTR}" -c "SET statement_timeout = 0; DELETE FROM webhook.incoming_webhooks WHERE created_at > '2025-02-27 23:59:59+00';" echo "$(date '+%Y-%m-%d %H:%M:%S') - Finished deleting rows in table webhook.incoming_webhooks from prior runs" - - name: Benchmark pgbench with custom-scripts + - name: Benchmark pgbench with custom-scripts uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} @@ -138,7 +138,7 @@ jobs: save_perf_report: true extra_params: -m remote_cluster --timeout 7200 -k test_perf_oltp_large_tenant_pgbench pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -153,7 +153,7 @@ jobs: save_perf_report: true extra_params: -m remote_cluster --timeout 172800 -k test_perf_oltp_large_tenant_maintenance pg_version: ${{ env.PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ 
steps.set-up-connstr.outputs.connstr_without_pooler }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -179,8 +179,8 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 0fe002bc07..532da435c2 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -147,7 +147,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index fa4fd73b12..6efe0b4c8c 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -103,7 +103,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -122,7 +122,7 @@ jobs: run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} @@ -139,7 +139,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -178,7 +178,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -195,7 +195,7 @@ jobs: run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} @@ -212,7 +212,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git a/.github/workflows/random-ops-test.yml b/.github/workflows/random-ops-test.yml index 7c19537744..6098126e90 100644 --- a/.github/workflows/random-ops-test.yml +++ b/.github/workflows/random-ops-test.yml @@ -66,7 +66,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Run tests uses: ./.github/actions/run-python-test-set @@ -76,7 +76,7 @@ jobs: run_in_parallel: false 
extra_params: -m remote_cluster pg_version: ${{ matrix.pg-version }} - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} RANDOM_SEED: ${{ inputs.random_seed }} @@ -88,6 +88,6 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} From 692c0f3fb87c31e1b995412e37ab8368a48b5a13 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 28 Apr 2025 16:24:18 +0300 Subject: [PATCH 06/76] Prepare to prewarm support (#11740) ## Problem See (original prewarm implementation) https://github.com/neondatabase/neon/pull/9197 (functions for storing/restoring LFC state) https://github.com/neondatabase/neon/pull/9587 (store prefetch results in LFC) https://github.com/neondatabase/neon/pull/10442 ## Summary of changes Preparation for prewarm implementation. --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/communicator.c | 32 +++++--- pgxn/neon/communicator.h | 2 + pgxn/neon/file_cache.c | 142 +++++++++++++++++++++++++---------- pgxn/neon/libpagestore.c | 5 +- pgxn/neon/pagestore_client.h | 3 +- 5 files changed, 133 insertions(+), 51 deletions(-) diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index db3e053321..61bb3206e7 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -88,9 +88,6 @@ typedef PGAlignedBlock PGIOAlignedBlock; page_server_api *page_server; -static uint32 local_request_counter; -#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) - /* * Various settings related to prompt (fast) handling of PageStream responses * at any CHECK_FOR_INTERRUPTS point. @@ -788,6 +785,27 @@ prefetch_read(PrefetchRequest *slot) } } + +/* + * Wait completion of previosly registered prefetch request. + * Prefetch result should be placed in LFC by prefetch_wait_for. 
+ */ +bool +communicator_prefetch_receive(BufferTag tag) +{ + PrfHashEntry *entry; + PrefetchRequest hashkey; + + hashkey.buftag = tag; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index)) + { + prefetch_set_unused(entry->slot->my_ring_index); + return true; + } + return false; +} + /* * Disconnect hook - drop prefetches when the connection drops * @@ -906,7 +924,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns NeonGetPageRequest request = { .hdr.tag = T_NeonGetPageRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), /* lsn and not_modified_since are filled in below */ .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, @@ -915,8 +932,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(mySlotNo == MyPState->ring_unused); - slot->reqid = request.hdr.reqid; - if (force_request_lsns) slot->request_lsns = *force_request_lsns; else @@ -934,6 +949,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(mySlotNo == MyPState->ring_unused); /* loop */ } + slot->reqid = request.hdr.reqid; /* update prefetch state */ MyPState->n_requests_inflight += 1; @@ -1937,7 +1953,6 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r { NeonExistsRequest request = { .hdr.tag = T_NeonExistsRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), .hdr.lsn = request_lsns->request_lsn, .hdr.not_modified_since = request_lsns->not_modified_since, .rinfo = rinfo, @@ -2212,7 +2227,6 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns * { NeonNblocksRequest request = { .hdr.tag = T_NeonNblocksRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), .hdr.lsn = request_lsns->request_lsn, .hdr.not_modified_since = request_lsns->not_modified_since, .rinfo = rinfo, @@ -2285,7 +2299,6 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) { NeonDbSizeRequest request = { .hdr.tag = T_NeonDbSizeRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), .hdr.lsn = request_lsns->request_lsn, .hdr.not_modified_since = request_lsns->not_modified_since, .dbNode = dbNode, @@ -2353,7 +2366,6 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re request = (NeonGetSlruSegmentRequest) { .hdr.tag = T_NeonGetSlruSegmentRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), .hdr.lsn = request_lsns->request_lsn, .hdr.not_modified_since = request_lsns->not_modified_since, .kind = kind, diff --git a/pgxn/neon/communicator.h b/pgxn/neon/communicator.h index 72cba526c1..f55c4b10f1 100644 --- a/pgxn/neon/communicator.h +++ b/pgxn/neon/communicator.h @@ -37,6 +37,8 @@ extern int communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber nblocks, void **buffers, bits8 *mask); extern void communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, BlockNumber nblocks, const bits8 *mask); +extern bool communicator_prefetch_receive(BufferTag tag); + extern int communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *request_lsns, void *buffer); diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 8c2990e57a..e2c1f7682f 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -25,6 +25,7 @@ #include "pgstat.h" #include "port/pg_iovec.h" #include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" #include RELFILEINFO_HDR #include "storage/buf_internals.h" #include "storage/fd.h" @@ -32,6 +33,8 
@@ #include "storage/latch.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" +#include "storage/procsignal.h" +#include "tcop/tcopprot.h" #include "utils/builtins.h" #include "utils/dynahash.h" #include "utils/guc.h" @@ -46,6 +49,8 @@ #include "neon.h" #include "neon_lwlsncache.h" #include "neon_perf_counters.h" +#include "pagestore_client.h" +#include "communicator.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -87,14 +92,14 @@ * 1Mb chunks can reduce hash map size to 320Mb. * 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed */ -#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ -/* - * Smaller chunk seems to be better for OLTP workload - */ -// #define BLOCKS_PER_CHUNK 8 /* 64kb chunk */ +#define MAX_BLOCKS_PER_CHUNK_LOG 7 /* 1Mb chunk */ +#define MAX_BLOCKS_PER_CHUNK (1 << MAX_BLOCKS_PER_CHUNK_LOG) + #define MB ((uint64)1024*1024) -#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) +#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> lfc_chunk_size_log)) + +#define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (lfc_blocks_per_chunk-1)) /* * Blocks are read or written to LFC file outside LFC critical section. @@ -119,10 +124,11 @@ typedef struct FileCacheEntry uint32 hash; uint32 offset; uint32 access_count; - uint32 state[(BLOCKS_PER_CHUNK + 31) / 32 * 2]; /* two bits per block */ dlist_node list_node; /* LRU/holes list node */ + uint32 state[FLEXIBLE_ARRAY_MEMBER]; /* two bits per block */ } FileCacheEntry; +#define FILE_CACHE_ENRTY_SIZE MAXALIGN(offsetof(FileCacheEntry, state) + (lfc_blocks_per_chunk*2+31)/32*4) #define GET_STATE(entry, i) (((entry)->state[(i) / 16] >> ((i) % 16 * 2)) & 3) #define SET_STATE(entry, i, new_state) (entry)->state[(i) / 16] = ((entry)->state[(i) / 16] & ~(3 << ((i) % 16 * 2))) | ((new_state) << ((i) % 16 * 2)) @@ -136,6 +142,7 @@ typedef struct FileCacheControl uint32 size; /* size of cache file in chunks */ uint32 used; /* number of used chunks */ uint32 used_pages; /* number of used pages */ + uint32 pinned; /* number of pinned chunks */ uint32 limit; /* shared copy of lfc_size_limit */ uint64 hits; uint64 misses; @@ -158,6 +165,8 @@ static int lfc_desc = -1; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; +static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG; +static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK; static char *lfc_path; static uint64 lfc_generation; static FileCacheControl *lfc_ctl; @@ -206,7 +215,9 @@ lfc_switch_off(void) } lfc_ctl->generation += 1; lfc_ctl->size = 0; + lfc_ctl->pinned = 0; lfc_ctl->used = 0; + lfc_ctl->used_pages = 0; lfc_ctl->limit = 0; dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); @@ -296,7 +307,7 @@ lfc_shmem_startup(void) lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock"); info.keysize = sizeof(BufferTag); - info.entrysize = sizeof(FileCacheEntry); + info.entrysize = FILE_CACHE_ENRTY_SIZE; /* * n_chunks+1 because we add new element to hash table before eviction @@ -342,7 +353,7 @@ lfc_shmem_request(void) prev_shmem_request_hook(); #endif - RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, sizeof(FileCacheEntry))); + RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE)); RequestNamedLWLockTranche("lfc_lock", 1); } @@ -359,6 +370,24 @@ 
is_normal_backend(void) return lfc_ctl && MyProc && UsedShmemSegAddr && !IsParallelWorker(); } +static bool +lfc_check_chunk_size(int *newval, void **extra, GucSource source) +{ + if (*newval & (*newval - 1)) + { + elog(ERROR, "LFC chunk size should be power of two"); + return false; + } + return true; +} + +static void +lfc_change_chunk_size(int newval, void* extra) +{ + lfc_chunk_size_log = pg_ceil_log2_32(newval); +} + + static bool lfc_check_limit_hook(int *newval, void **extra, GucSource source) { @@ -415,11 +444,11 @@ lfc_change_limit_hook(int newval, void *extra) CriticalAssert(victim->access_count == 0); #ifdef FALLOC_FL_PUNCH_HOLE - if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0) + if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * lfc_blocks_per_chunk * BLCKSZ, lfc_blocks_per_chunk * BLCKSZ) < 0) neon_log(LOG, "Failed to punch hole in file: %m"); #endif /* We remove the old entry, and re-enter a hole to the hash table */ - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + for (int i = 0; i < lfc_blocks_per_chunk; i++) { bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; lfc_ctl->used_pages -= is_page_cached; @@ -508,6 +537,19 @@ lfc_init(void) NULL, NULL); + DefineCustomIntVariable("neon.file_cache_chunk_size", + "LFC chunk size in blocks (should be power of two)", + NULL, + &lfc_blocks_per_chunk, + MAX_BLOCKS_PER_CHUNK, + 1, + MAX_BLOCKS_PER_CHUNK, + PGC_POSTMASTER, + GUC_UNIT_BLOCKS, + lfc_check_chunk_size, + lfc_change_chunk_size, + NULL); + if (lfc_max_size == 0) return; @@ -530,7 +572,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) { BufferTag tag; FileCacheEntry *entry; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); bool found = false; uint32 hash; @@ -539,7 +581,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + tag.blockNum = blkno - chunk_offs; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); @@ -577,9 +619,9 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); + tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); - chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); LWLockAcquire(lfc_lock, LW_SHARED); @@ -590,12 +632,12 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } while (true) { - int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs); + int this_chunk = Min(nblocks - i, lfc_blocks_per_chunk - chunk_offs); entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); if (entry != NULL) { - for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) + for (; chunk_offs < lfc_blocks_per_chunk && i < nblocks; chunk_offs++, i++) { if (GET_STATE(entry, chunk_offs) != UNAVAILABLE) { @@ -619,9 +661,9 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * Prepare for the next iteration. We don't unlock here, as that'd * probably be more expensive than the gains it'd get us. 
*/ - tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + chunk_offs = BLOCK_TO_CHUNK_OFF(blkno + i); + tag.blockNum = (blkno + i) - chunk_offs; hash = get_hash_value(lfc_hash, &tag); - chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); } LWLockRelease(lfc_lock); @@ -696,9 +738,9 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, while (nblocks > 0) { struct iovec iov[PG_IOV_MAX]; - int8 chunk_mask[BLOCKS_PER_CHUNK / 8] = {0}; - int chunk_offs = (blkno & (BLOCKS_PER_CHUNK - 1)); - int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + uint8 chunk_mask[MAX_BLOCKS_PER_CHUNK / 8] = {0}; + int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); + int blocks_in_chunk = Min(nblocks, lfc_blocks_per_chunk - chunk_offs); int iteration_hits = 0; int iteration_misses = 0; uint64 io_time_us = 0; @@ -786,8 +828,10 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* Unlink entry from LRU list to pin it for the duration of IO operation */ if (entry->access_count++ == 0) + { + lfc_ctl->pinned += 1; dlist_delete(&entry->list_node); - + } generation = lfc_ctl->generation; entry_offset = entry->offset; @@ -836,7 +880,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (iteration_hits != 0) { /* chunk offset (# of pages) into the LFC file */ - off_t first_read_offset = (off_t) entry_offset * BLOCKS_PER_CHUNK; + off_t first_read_offset = (off_t) entry_offset * lfc_blocks_per_chunk; int nwrite = iov_last_used - first_block_in_chunk_read; /* offset of first IOV */ first_read_offset += chunk_offs + first_block_in_chunk_read; @@ -884,7 +928,10 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) + { + lfc_ctl->pinned -= 1; dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + } } else { @@ -961,7 +1008,7 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + for (int i = 0; i < lfc_blocks_per_chunk; i++) { bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; lfc_ctl->used_pages -= is_page_cached; @@ -979,14 +1026,14 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) /* Can't add this chunk - we don't have the space for it */ hash_search_with_hash_value(lfc_hash, &entry->key, hash, HASH_REMOVE, NULL); - return false; } entry->access_count = 1; entry->hash = hash; + lfc_ctl->pinned += 1; - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + for (int i = 0; i < lfc_blocks_per_chunk; i++) SET_STATE(entry, i, UNAVAILABLE); return true; @@ -1031,7 +1078,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, FileCacheBlockState state; XLogRecPtr lwlsn; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return false; @@ -1041,7 +1088,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); cv = &lfc_ctl->cv[hash % N_COND_VARS]; @@ -1052,7 +1099,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, LWLockRelease(lfc_lock); return false; } - + lwlsn = neon_get_lwlsn(rinfo, forknum, blkno); if (lwlsn 
> lsn) @@ -1081,7 +1128,10 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * operation */ if (entry->access_count++ == 0) + { + lfc_ctl->pinned += 1; dlist_delete(&entry->list_node); + } } else { @@ -1106,7 +1156,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); INSTR_TIME_SET_CURRENT(io_start); rc = pwrite(lfc_desc, buffer, BLCKSZ, - ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + ((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ); INSTR_TIME_SET_CURRENT(io_end); pgstat_report_wait_end(); @@ -1132,7 +1182,10 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, inc_page_cache_write_wait(time_spent_us); if (--entry->access_count == 0) + { + lfc_ctl->pinned -= 1; dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + } state = GET_STATE(entry, chunk_offs); if (state == REQUESTED) { @@ -1199,8 +1252,8 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, while (nblocks > 0) { struct iovec iov[PG_IOV_MAX]; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); - int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno); + int blocks_in_chunk = Min(nblocks, lfc_blocks_per_chunk - chunk_offs); instr_time io_start, io_end; ConditionVariable* cv; @@ -1212,7 +1265,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, iov[i].iov_len = BLCKSZ; } - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); cv = &lfc_ctl->cv[hash % N_COND_VARS]; @@ -1232,7 +1285,10 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * operation */ if (entry->access_count++ == 0) + { + lfc_ctl->pinned += 1; dlist_delete(&entry->list_node); + } } else { @@ -1285,7 +1341,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); INSTR_TIME_SET_CURRENT(io_start); rc = pwritev(lfc_desc, iov, blocks_in_chunk, - ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + ((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ); INSTR_TIME_SET_CURRENT(io_end); pgstat_report_wait_end(); @@ -1312,7 +1368,10 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, inc_page_cache_write_wait(time_spent_us); if (--entry->access_count == 0) + { + lfc_ctl->pinned -= 1; dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + } for (int i = 0; i < blocks_in_chunk; i++) { @@ -1438,7 +1497,12 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) break; case 8: key = "file_cache_chunk_size_pages"; - value = BLOCKS_PER_CHUNK; + value = lfc_blocks_per_chunk; + break; + case 9: + key = "file_cache_chunks_pinned"; + if (lfc_ctl) + value = lfc_ctl->pinned; break; default: SRF_RETURN_DONE(funcctx); @@ -1566,7 +1630,7 @@ local_cache_pages(PG_FUNCTION_ARGS) /* Skip hole tags */ if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0) { - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + for (int i = 0; i < lfc_blocks_per_chunk; i++) n_pages += GET_STATE(entry, i) == AVAILABLE; } } @@ -1594,13 +1658,13 @@ local_cache_pages(PG_FUNCTION_ARGS) hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + for (int i = 0; i < lfc_blocks_per_chunk; i++) { if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0) { if (GET_STATE(entry, i) == 
AVAILABLE) { - fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i; + fctx->record[n].pageoffs = entry->offset * lfc_blocks_per_chunk + i; fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key)); fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key)); diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 64d38e7913..ccb072d6f9 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -48,7 +48,6 @@ #define MIN_RECONNECT_INTERVAL_USEC 1000 #define MAX_RECONNECT_INTERVAL_USEC 1000000 - enum NeonComputeMode { CP_MODE_PRIMARY = 0, CP_MODE_REPLICA, @@ -167,6 +166,9 @@ typedef struct WaitEventSet *wes_read; } PageServer; +static uint32 local_request_counter; +#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) + static PageServer page_servers[MAX_SHARDS]; static bool pageserver_flush(shardno_t shard_no); @@ -994,6 +996,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) pageserver_conn = NULL; } + request->reqid = GENERATE_REQUEST_ID(); req_buff = nm_pack_request(request); /* diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 0ab539fe56..9df202290d 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -65,7 +65,6 @@ typedef enum { SLRU_MULTIXACT_OFFSETS } SlruKind; - /*-- * supertype of all the Neon*Request structs below. * @@ -129,6 +128,7 @@ typedef struct int segno; } NeonGetSlruSegmentRequest; + /* supertype of all the Neon*Response structs below */ typedef NeonMessage NeonResponse; @@ -187,6 +187,7 @@ typedef struct { /* * Send this request to the PageServer associated with this shard. + * This function assigns request_id to the request which can be extracted by caller from request struct. */ bool (*send) (shardno_t shard_no, NeonRequest * request); /* From 11f6044338b0c3b4596053e39ffd3e745d63b28b Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 28 Apr 2025 09:45:45 -0400 Subject: [PATCH 07/76] fix(pageserver): report synthetic size = 1 if all tls offloaded (2) (#11731) ## Problem https://github.com/neondatabase/neon/pull/11648 did this for resident size instead of synthetic size. ## Summary of changes Report synthetic_size == 1 if all timelines are offloaded. Signed-off-by: Alex Chi Z --- libs/tenant_size_model/src/calculation.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs index d54876ba2c..77e72035ee 100644 --- a/libs/tenant_size_model/src/calculation.rs +++ b/libs/tenant_size_model/src/calculation.rs @@ -77,7 +77,9 @@ impl StorageModel { } SizeResult { - total_size, + // If total_size is 0, it means that the tenant has all timelines offloaded; we need to report 1 + // here so that the data point shows up in the s3 files. 
+ total_size: total_size.max(1), segments: segment_results, } } From 84bc3380cc82a27b06717d2f239ac1fea74880fc Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 28 Apr 2025 17:34:47 +0200 Subject: [PATCH 08/76] Remove SAFEKEEPER_AUTH_TOKEN env var parsing from safekeeper (#11698) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Remove SAFEKEEPER_AUTH_TOKEN env var parsing from safekeeper This PR is a follow-up to #11443 that removes the parsing of the `SAFEKEEPER_AUTH_TOKEN` environment variable from the safekeeper codebase while keeping the `auth_token_path` CLI flag functionality. ## Changes: - Removed code that checks for the `SAFEKEEPER_AUTH_TOKEN` environment variable - Updated comments to reflect that only the `auth_token_path` CLI flag is now used As mentioned in PR #11443, the environment variable approach was planned to be deprecated and removed in favor of the file-based approach, which is more secure since environment variables can be quite public in both procfs and unit files. Link to Devin run: https://app.devin.ai/sessions/d6f56cf1b4164ea9880a9a06358a58ac Requested by: arpad@neon.tech --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: arpad@neon.tech Co-authored-by: Arpad Müller --- control_plane/src/safekeeper.rs | 39 ++++++++++++++++++++------------ safekeeper/src/bin/safekeeper.rs | 31 ++++++------------------- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 948e3c8c93..eec2c997e6 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -112,7 +112,7 @@ impl SafekeeperNode { } /// Initializes a safekeeper node by creating all necessary files, - /// e.g. SSL certificates. + /// e.g. SSL certificates and JWT token file. pub fn initialize(&self) -> anyhow::Result<()> { if self.env.generate_local_ssl_certs { self.env.generate_ssl_cert( @@ -120,6 +120,17 @@ impl SafekeeperNode { &self.datadir_path().join("server.key"), )?; } + + // Generate a token file for authentication with other safekeepers + if self.conf.auth_enabled { + let token = self + .env + .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; + + let token_path = self.datadir_path().join("peer_jwt_token"); + std::fs::write(token_path, token)?; + } + Ok(()) } @@ -218,14 +229,26 @@ impl SafekeeperNode { args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); } + if self.conf.auth_enabled { + let token_path = self.datadir_path().join("peer_jwt_token"); + let token_path_str = token_path + .to_str() + .with_context(|| { + format!("Token path {token_path:?} cannot be represented as a unicode string") + })? 
+                .to_owned();
+            args.extend(["--auth-token-path".to_owned(), token_path_str]);
+        }
+
         args.extend_from_slice(extra_opts);
 
+        let env_variables = Vec::new();
         background_process::start_process(
             &format!("safekeeper-{id}"),
             &datadir,
             &self.env.safekeeper_bin(),
             &args,
-            self.safekeeper_env_variables()?,
+            env_variables,
             background_process::InitialPidFile::Expect(self.pid_file()),
             retry_timeout,
             || async {
@@ -239,18 +262,6 @@ impl SafekeeperNode {
         .await
     }
 
-    fn safekeeper_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
-        // Generate a token to connect from safekeeper to peers
-        if self.conf.auth_enabled {
-            let token = self
-                .env
-                .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
-            Ok(vec![("SAFEKEEPER_AUTH_TOKEN".to_owned(), token)])
-        } else {
-            Ok(Vec::new())
-        }
-    }
-
     ///
     /// Stop the server.
     ///
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index dd71420efb..c267a55cb6 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -1,7 +1,6 @@
 //
 // Main entry point for the safekeeper executable
 //
-use std::env::{VarError, var};
 use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::str::FromStr;
@@ -354,29 +353,13 @@ async fn main() -> anyhow::Result<()> {
     };
 
     // Load JWT auth token to connect to other safekeepers for pull_timeline.
-    // First check if the env var is present, then check the arg with the path.
-    // We want to deprecate and remove the env var method in the future.
-    let sk_auth_token = match var("SAFEKEEPER_AUTH_TOKEN") {
-        Ok(v) => {
-            info!("loaded JWT token for authentication with safekeepers");
-            Some(SecretString::from(v))
-        }
-        Err(VarError::NotPresent) => {
-            if let Some(auth_token_path) = args.auth_token_path.as_ref() {
-                info!(
-                    "loading JWT token for authentication with safekeepers from {auth_token_path}"
-                );
-                let auth_token = tokio::fs::read_to_string(auth_token_path).await?;
-                Some(SecretString::from(auth_token.trim().to_owned()))
-            } else {
-                info!("no JWT token for authentication with safekeepers detected");
-                None
-            }
-        }
-        Err(_) => {
-            warn!("JWT token for authentication with safekeepers is not unicode");
-            None
-        }
+    let sk_auth_token = if let Some(auth_token_path) = args.auth_token_path.as_ref() {
+        info!("loading JWT token for authentication with safekeepers from {auth_token_path}");
+        let auth_token = tokio::fs::read_to_string(auth_token_path).await?;
+        Some(SecretString::from(auth_token.trim().to_owned()))
+    } else {
+        info!("no JWT token for authentication with safekeepers detected");
+        None
     };
 
     let ssl_ca_certs = match args.ssl_ca_file.as_ref() {

From b1fa68f6592672a8e988b85f0de83749fece2dab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?JC=20Gr=C3=BCnhage?=
Date: Mon, 28 Apr 2025 18:37:36 +0200
Subject: [PATCH 09/76] impr(ci): switch release PR creation over to use python based action (#11679)

## Problem

Our different repositories both had code to achieve very similar results in terms of release PR creation, but they were structured differently and had different extensions. This was likely to cause maintainability problems in the long run.

## Summary of changes

Switch to a Python-CLI-based composite action for creating the release PRs; the same action will also be introduced in our other repos later. A manual dispatch sketch is included below.

## To Do

- [ ] Adjust our docs to reflect the changes from this.
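For reference, the new reusable `release.yml` can also be triggered by hand through its `workflow_dispatch` inputs. A minimal sketch of what that might look like with the GitHub CLI (the commit SHAs are placeholders, and this assumes the caller is allowed to dispatch workflows in this repository):

```bash
# One-off release PR for a component, roughly what the scheduled runs do
gh workflow run release.yml -f component=storage

# Hotfix flow: base the PR on the previous release and cherry-pick specific commits
# (SHAs below are placeholders)
gh workflow run release.yml -f component=proxy -f cherry-pick='abc1234 def5678'
```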
--- .github/workflows/_create-release-pr.yml | 103 ----------------------- .github/workflows/release-compute.yml | 12 +++ .github/workflows/release-proxy.yml | 12 +++ .github/workflows/release-storage.yml | 12 +++ .github/workflows/release.yml | 93 ++++++++++---------- 5 files changed, 82 insertions(+), 150 deletions(-) delete mode 100644 .github/workflows/_create-release-pr.yml create mode 100644 .github/workflows/release-compute.yml create mode 100644 .github/workflows/release-proxy.yml create mode 100644 .github/workflows/release-storage.yml diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml deleted file mode 100644 index f96ed7d69b..0000000000 --- a/.github/workflows/_create-release-pr.yml +++ /dev/null @@ -1,103 +0,0 @@ -name: Create Release PR - -on: - workflow_call: - inputs: - component-name: - description: 'Component name' - required: true - type: string - source-branch: - description: 'Source branch' - required: true - type: string - secrets: - ci-access-token: - description: 'CI access token' - required: true - -defaults: - run: - shell: bash -euo pipefail {0} - -permissions: - contents: read - -jobs: - create-release-branch: - runs-on: ubuntu-22.04 - - permissions: - contents: write # for `git push` - - steps: - - name: Harden the runner (Audit all outbound calls) - uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 - with: - egress-policy: audit - - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - ref: ${{ inputs.source-branch }} - fetch-depth: 0 - - - name: Set variables - id: vars - env: - COMPONENT_NAME: ${{ inputs.component-name }} - RELEASE_BRANCH: >- - ${{ - false - || inputs.component-name == 'Storage' && 'release' - || inputs.component-name == 'Proxy' && 'release-proxy' - || inputs.component-name == 'Compute' && 'release-compute' - }} - run: | - now_date=$(date -u +'%Y-%m-%d') - now_time=$(date -u +'%H-%M-%Z') - { - echo "title=${COMPONENT_NAME} release ${now_date}" - echo "rc-branch=rc/${RELEASE_BRANCH}/${now_date}_${now_time}" - echo "release-branch=${RELEASE_BRANCH}" - } | tee -a ${GITHUB_OUTPUT} - - - name: Configure git - run: | - git config user.name "github-actions[bot]" - git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - - - name: Create RC branch - env: - RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} - RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} - TITLE: ${{ steps.vars.outputs.title }} - run: | - git switch -c "${RC_BRANCH}" - - # Manually create a merge commit on the current branch, keeping the - # tree and setting the parents to the current HEAD and the HEAD of the - # release branch. This commit is what we'll fast-forward the release - # branch to when merging the release branch. 
- # For details on why, look at - # https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs - current_tree=$(git rev-parse 'HEAD^{tree}') - release_head=$(git rev-parse "origin/${RELEASE_BRANCH}") - current_head=$(git rev-parse HEAD) - merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}") - - # Fast-forward the current branch to the newly created merge_commit - git merge --ff-only ${merge_commit} - - git push origin "${RC_BRANCH}" - - - name: Create a PR into ${{ steps.vars.outputs.release-branch }} - env: - GH_TOKEN: ${{ secrets.ci-access-token }} - RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} - RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} - TITLE: ${{ steps.vars.outputs.title }} - run: | - gh pr create --title "${TITLE}" \ - --body "" \ - --head "${RC_BRANCH}" \ - --base "${RELEASE_BRANCH}" diff --git a/.github/workflows/release-compute.yml b/.github/workflows/release-compute.yml new file mode 100644 index 0000000000..f123dd2f44 --- /dev/null +++ b/.github/workflows/release-compute.yml @@ -0,0 +1,12 @@ +name: Create compute release PR + +on: + schedule: + - cron: '0 7 * * FRI' + +jobs: + create-release-pr: + uses: ./.github/workflows/release.yml + with: + component: compute + secrets: inherit diff --git a/.github/workflows/release-proxy.yml b/.github/workflows/release-proxy.yml new file mode 100644 index 0000000000..d9055984d2 --- /dev/null +++ b/.github/workflows/release-proxy.yml @@ -0,0 +1,12 @@ +name: Create proxy release PR + +on: + schedule: + - cron: '0 6 * * TUE' + +jobs: + create-release-pr: + uses: ./.github/workflows/release.yml + with: + component: proxy + secrets: inherit diff --git a/.github/workflows/release-storage.yml b/.github/workflows/release-storage.yml new file mode 100644 index 0000000000..91f02fddda --- /dev/null +++ b/.github/workflows/release-storage.yml @@ -0,0 +1,12 @@ +name: Create storage release PR + +on: + schedule: + - cron: '0 6 * * FRI' + +jobs: + create-release-pr: + uses: ./.github/workflows/release.yml + with: + component: storage + secrets: inherit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4068eafb95..4b19d6aa3f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,25 +1,34 @@ -name: Create Release Branch +name: Create release PR on: - schedule: - # It should be kept in sync with if-condition in jobs - - cron: '0 6 * * TUE' # Proxy release - - cron: '0 6 * * FRI' # Storage release - - cron: '0 7 * * FRI' # Compute release workflow_dispatch: inputs: - create-storage-release-branch: - type: boolean - description: 'Create Storage release PR' + component: + description: "Component to release" + required: true + type: choice + options: + - compute + - proxy + - storage + cherry-pick: + description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)" required: false - create-proxy-release-branch: - type: boolean - description: 'Create Proxy release PR' - required: false - create-compute-release-branch: - type: boolean - description: 'Create Compute release PR' + type: string + default: '' + + workflow_call: + inputs: + component: + description: "Component to release" + required: true + type: string + cherry-pick: + description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)" required: false + type: string + default: '' + # No permission for GITHUB_TOKEN by default; the **minimal required** set of 
permissions should be granted in each job. permissions: {} @@ -29,41 +38,31 @@ defaults: shell: bash -euo pipefail {0} jobs: - create-storage-release-branch: - if: ${{ github.event.schedule == '0 6 * * FRI' || inputs.create-storage-release-branch }} + create-release-pr: + runs-on: ubuntu-22.04 permissions: contents: write - uses: ./.github/workflows/_create-release-pr.yml - with: - component-name: 'Storage' - source-branch: ${{ github.ref_name }} - secrets: - ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit - create-proxy-release-branch: - if: ${{ github.event.schedule == '0 6 * * TUE' || inputs.create-proxy-release-branch }} + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 - permissions: - contents: write + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - uses: ./.github/workflows/_create-release-pr.yml - with: - component-name: 'Proxy' - source-branch: ${{ github.ref_name }} - secrets: - ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} - - create-compute-release-branch: - if: ${{ github.event.schedule == '0 7 * * FRI' || inputs.create-compute-release-branch }} - - permissions: - contents: write - - uses: ./.github/workflows/_create-release-pr.yml - with: - component-name: 'Compute' - source-branch: ${{ github.ref_name }} - secrets: - ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} + - name: Create release PR + uses: neondatabase/dev-actions/release-pr@02b41460646b70d12dd33e5f56ebc5af2384c993 + with: + component: ${{ inputs.component }} + cherry-pick: ${{ inputs.cherry-pick }} + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} From 998d2c2ce9315f2ec7b7bc2b5ce0253e6c758d9d Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 28 Apr 2025 17:43:35 +0100 Subject: [PATCH 10/76] storcon: use shard 0's initdb for timeline creation (#11727) ## Problem In princple, pageservers with different postgres binaries might generate different initdbs, resulting in inconsistency between shards. To avoid that, we should have shard 0 generate the initdb and other shards re-use it. Fixes: https://github.com/neondatabase/neon/issues/11340 ## Summary of changes - For shards with index greater than zero, set `existing_initdb_timeline_id` in timeline creation to consume the existing initdb rather than creating a new one --- storage_controller/src/service.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ca9b911c4d..0f71a87f13 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3720,6 +3720,10 @@ impl Service { // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then // use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard // that will get the first creation request, and propagate the LSN to all the >0 shards. + // + // This also enables non-zero shards to use the initdb that shard 0 generated and uploaded to S3, rather than + // independently generating their own initdb. This guarantees that shards cannot end up with different initial + // states if e.g. they have different postgres binary versions. 
let timeline_info = create_one( shard_zero_tid, shard_zero_locations, @@ -3729,11 +3733,16 @@ impl Service { ) .await?; - // Propagate the LSN that shard zero picked, if caller didn't provide one + // Update the create request for shards >= 0 match &mut create_req.mode { models::TimelineCreateRequestMode::Branch { ancestor_start_lsn, .. } if ancestor_start_lsn.is_none() => { + // Propagate the LSN that shard zero picked, if caller didn't provide one *ancestor_start_lsn = timeline_info.ancestor_lsn; }, + models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } => { + // For shards >= 0, do not run initdb: use the one that shard 0 uploaded to S3 + *existing_initdb_timeline_id = Some(create_req.new_timeline_id) + } _ => {} } From a750026c2e69344908464c59ba820fc0639abeb1 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 28 Apr 2025 12:09:48 -0500 Subject: [PATCH 11/76] Fix compiler warning in libpagestore.c when WITH_SANITIZERS=yes (#11755) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Postgres has a nice self-documenting macro called pg_unreachable() when you want to assert that a location in code won't be hit. Warning in question: ``` /home/tristan957/Projects/work/neon//pgxn/neon/libpagestore.c: In function ‘pageserver_connect’: /home/tristan957/Projects/work/neon//pgxn/neon/libpagestore.c:739:1: warning: control reaches end of non-void function [-Wreturn-type] 739 | } | ^ ``` Signed-off-by: Tristan Partin --- pgxn/neon/libpagestore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index ccb072d6f9..5287c12a84 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -736,8 +736,8 @@ pageserver_connect(shardno_t shard_no, int elevel) default: neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state); } - /* This shouldn't be hit */ - Assert(false); + + pg_unreachable(); } static void From 04826905340da9ac80aa96b2892a137eb91b3b7d Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 28 Apr 2025 18:24:55 +0100 Subject: [PATCH 12/76] pageserver: make control_plane_api & generations fully mandatory (#10715) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem We had retained the ability to run in a generation-less mode to support test_generations_upgrade, which was replaced with a cleaner backward compat test in https://github.com/neondatabase/neon/pull/10701 ## Summary of changes - Remove all the special cases for "if no generation" or "if no control plane api" - Make control_plane_api config mandatory --------- Co-authored-by: Arpad Müller --- .../pageserver_config/pageserver.toml | 2 ++ pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/config.rs | 13 +++++++--- pageserver/src/controller_upcall_client.rs | 20 ++++++-------- pageserver/src/deletion_queue.rs | 11 +++----- pageserver/src/deletion_queue/validator.rs | 26 ++++++++----------- pageserver/src/tenant.rs | 4 +-- pageserver/src/tenant/mgr.rs | 19 +++----------- .../src/tenant/timeline/import_pgdata.rs | 3 +-- test_runner/fixtures/neon_fixtures.py | 3 +-- .../regress/test_pageserver_generations.py | 2 +- 11 files changed, 43 insertions(+), 62 deletions(-) diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml index 76935453b6..7d603b6c65 100644 --- a/docker-compose/pageserver_config/pageserver.toml +++ 
b/docker-compose/pageserver_config/pageserver.toml @@ -3,3 +3,5 @@ pg_distrib_dir='/usr/local/' listen_pg_addr='0.0.0.0:6400' listen_http_addr='0.0.0.0:9898' remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' } +control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address +control_plane_emergency_mode=true diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 6cfaec955b..4c2572a577 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -504,7 +504,7 @@ fn start_pageserver( // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( remote_storage.clone(), - StorageControllerUpcallClient::new(conf, &shutdown_pageserver)?, + StorageControllerUpcallClient::new(conf, &shutdown_pageserver), conf, ); deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 95143e58b7..ded2805602 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -150,7 +150,7 @@ pub struct PageServerConf { /// not terrible. pub background_task_maximum_delay: Duration, - pub control_plane_api: Option, + pub control_plane_api: Url, /// JWT token for use with the control plane API. pub control_plane_api_token: Option, @@ -438,7 +438,8 @@ impl PageServerConf { test_remote_failures, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, - control_plane_api, + control_plane_api: control_plane_api + .ok_or_else(|| anyhow::anyhow!("`control_plane_api` must be set"))?, control_plane_emergency_mode, heatmap_upload_concurrency, secondary_download_concurrency, @@ -573,6 +574,7 @@ impl PageServerConf { background_task_maximum_delay: Duration::ZERO, load_previous_heatmap: Some(true), generate_unarchival_heatmap: Some(true), + control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()), ..Default::default() }; PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() @@ -641,9 +643,12 @@ mod tests { use super::PageServerConf; #[test] - fn test_empty_config_toml_is_valid() { - // we use Default impl of everything in this situation + fn test_minimal_config_toml_is_valid() { + // The minimal valid config for running a pageserver: + // - control_plane_api is mandatory, as pageservers cannot run in isolation + // - we use Default impl of everything else in this situation let input = r#" + control_plane_api = "http://localhost:6666" "#; let config_toml = toml_edit::de::from_str::(input) .expect("empty config is valid"); diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 59c94f1549..468e5463b0 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -58,14 +58,8 @@ pub trait StorageControllerUpcallApi { impl StorageControllerUpcallClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. 
- pub fn new( - conf: &'static PageServerConf, - cancel: &CancellationToken, - ) -> Result, reqwest::Error> { - let mut url = match conf.control_plane_api.as_ref() { - Some(u) => u.clone(), - None => return Ok(None), - }; + pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Self { + let mut url = conf.control_plane_api.clone(); if let Ok(mut segs) = url.path_segments_mut() { // This ensures that `url` ends with a slash if it doesn't already. @@ -85,15 +79,17 @@ impl StorageControllerUpcallClient { } for cert in &conf.ssl_ca_certs { - client = client.add_root_certificate(Certificate::from_der(cert.contents())?); + client = client.add_root_certificate( + Certificate::from_der(cert.contents()).expect("Invalid certificate in config"), + ); } - Ok(Some(Self { - http_client: client.build()?, + Self { + http_client: client.build().expect("Failed to construct HTTP client"), base_url: url, node_id: conf.id, cancel: cancel.clone(), - })) + } } #[tracing::instrument(skip_all)] diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 6dd7d741c1..4d62bc4ab5 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -585,7 +585,7 @@ impl DeletionQueue { /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice. pub fn new( remote_storage: GenericRemoteStorage, - controller_upcall_client: Option, + controller_upcall_client: C, conf: &'static PageServerConf, ) -> (Self, DeletionQueueWorkers) where @@ -701,7 +701,7 @@ mod test { async fn restart(&mut self) { let (deletion_queue, workers) = DeletionQueue::new( self.storage.clone(), - Some(self.mock_control_plane.clone()), + self.mock_control_plane.clone(), self.harness.conf, ); @@ -821,11 +821,8 @@ mod test { let mock_control_plane = MockStorageController::new(); - let (deletion_queue, worker) = DeletionQueue::new( - storage.clone(), - Some(mock_control_plane.clone()), - harness.conf, - ); + let (deletion_queue, worker) = + DeletionQueue::new(storage.clone(), mock_control_plane.clone(), harness.conf); let worker_join = worker.spawn_with(&tokio::runtime::Handle::current()); diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index 4e775f15eb..363b1427f5 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -53,7 +53,7 @@ where tx: tokio::sync::mpsc::Sender, // Client for calling into control plane API for validation of deletes - controller_upcall_client: Option, + controller_upcall_client: C, // DeletionLists which are waiting generation validation. Not safe to // execute until [`validate`] has processed them. 
@@ -86,7 +86,7 @@ where conf: &'static PageServerConf, rx: tokio::sync::mpsc::Receiver, tx: tokio::sync::mpsc::Sender, - controller_upcall_client: Option, + controller_upcall_client: C, lsn_table: Arc>, cancel: CancellationToken, ) -> Self { @@ -137,20 +137,16 @@ where return Ok(()); } - let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client { - match controller_upcall_client - .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) - .await - { - Ok(tenants) => tenants, - Err(RetryForeverError::ShuttingDown) => { - // The only way a validation call returns an error is when the cancellation token fires - return Err(DeletionQueueError::ShuttingDown); - } + let tenants_valid = match self + .controller_upcall_client + .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) + .await + { + Ok(tenants) => tenants, + Err(RetryForeverError::ShuttingDown) => { + // The only way a validation call returns an error is when the cancellation token fires + return Err(DeletionQueueError::ShuttingDown); } - } else { - // Control plane API disabled. In legacy mode we consider everything valid. - tenant_generations.keys().map(|k| (*k, true)).collect() }; let mut validated_sequence: Option = None; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 698579e8fb..dcd043c4a1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4254,9 +4254,7 @@ impl TenantShard { deletion_queue_client: DeletionQueueClient, l0_flush_global_state: L0FlushGlobalState, ) -> TenantShard { - debug_assert!( - !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none() - ); + assert!(!attached_conf.location.generation.is_none()); let (state, mut rx) = watch::channel(state); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 2ae7e1e875..86aef9b42c 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -346,7 +346,8 @@ async fn init_load_generations( "Emergency mode! Tenants will be attached unsafely using their last known generation" ); emergency_generations(tenant_confs) - } else if let Some(client) = StorageControllerUpcallClient::new(conf, cancel)? { + } else { + let client = StorageControllerUpcallClient::new(conf, cancel); info!("Calling {} API to re-attach tenants", client.base_url()); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. match client.re_attach(conf).await { @@ -360,9 +361,6 @@ async fn init_load_generations( anyhow::bail!("Shut down while waiting for control plane re-attach response") } } - } else { - info!("Control plane API not configured, tenant generations are disabled"); - return Ok(None); }; // The deletion queue needs to know about the startup attachment state to decide which (if any) stored @@ -1153,17 +1151,8 @@ impl TenantManager { // Testing hack: if we are configured with no control plane, then drop the generation // from upserts. This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API. - let attached_conf = if cfg!(feature = "testing") { - let mut conf = AttachedTenantConf::try_from(new_location_config) - .map_err(UpsertLocationError::BadRequest)?; - if self.conf.control_plane_api.is_none() { - conf.location.generation = Generation::none(); - } - conf - } else { - AttachedTenantConf::try_from(new_location_config) - .map_err(UpsertLocationError::BadRequest)? 
- }; + let attached_conf = AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)?; let tenant = tenant_spawn( self.conf, diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index b917fdbfd8..6ab6b90cb6 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -163,8 +163,7 @@ pub async fn doit( // Ensure at-least-once delivery of the upcall to storage controller // before we mark the task as done and never come here again. // - let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel)? - .expect("storcon configured"); + let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel); storcon_client .put_timeline_import_status( timeline.tenant_shard_id, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1d668d4b2d..b93df4ede4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1194,8 +1194,7 @@ class NeonEnv: else: cfg["broker"]["listen_addr"] = self.broker.listen_addr() - if self.control_plane_api is not None: - cfg["control_plane_api"] = self.control_plane_api + cfg["control_plane_api"] = self.control_plane_api if self.control_plane_hooks_api is not None: cfg["control_plane_hooks_api"] = self.control_plane_hooks_api diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index fa1cd61206..e3f9982486 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -3,7 +3,7 @@ Tests in this module exercise the pageserver's behavior around generation numbers, as defined in docs/rfcs/025-generation-numbers.md. 
Briefly, the behaviors we require of the pageserver are: -- Do not start a tenant without a generation number if control_plane_api is set +- Do not start a tenant without a generation number - Remote objects must be suffixed with generation - Deletions may only be executed after validating generation - Updates to remote_consistent_lsn may only be made visible after validating generation From 6d6b83e737b5b89796f6d84e06a3f7f420bccd6c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 28 Apr 2025 21:17:03 +0300 Subject: [PATCH 13/76] Prewarm implementation (#11741) ## Problem Continue work on prewarm started in PR https://github.com/neondatabase/neon/pull/11740 ## Summary of changes Implement prewarm using prefetch --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 389 ++++++++++++++++++++++++++++++++++++++++- pgxn/neon/file_cache.h | 14 ++ 2 files changed, 397 insertions(+), 6 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index e2c1f7682f..924e0055c1 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -98,7 +98,6 @@ #define MB ((uint64)1024*1024) #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> lfc_chunk_size_log)) - #define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (lfc_blocks_per_chunk-1)) /* @@ -135,6 +134,15 @@ typedef struct FileCacheEntry #define N_COND_VARS 64 #define CV_WAIT_TIMEOUT 10 +#define MAX_PREWARM_WORKERS 8 + +typedef struct PrewarmWorkerState +{ + uint32 prewarmed_pages; + uint32 skipped_pages; + TimestampTz completed; +} PrewarmWorkerState; + typedef struct FileCacheControl { uint64 generation; /* generation is needed to handle correct hash @@ -156,25 +164,43 @@ typedef struct FileCacheControl dlist_head holes; /* double linked list of punched holes */ HyperLogLogState wss_estimation; /* estimation of working set size */ ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */ + PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS]; + size_t n_prewarm_workers; + size_t n_prewarm_entries; + size_t total_prewarm_pages; + size_t prewarm_batch; + bool prewarm_active; + bool prewarm_canceled; + dsm_handle prewarm_lfc_state_handle; } FileCacheControl; -bool lfc_store_prefetch_result; +#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc + +#define FILE_CACHE_STATE_BITMAP(fcs) ((uint8*)&(fcs)->chunks[(fcs)->n_chunks]) +#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * lfc_blocks_per_chunk)+7)/8) +#define FILE_CACHE_STATE_SIZE(fcs) (sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8) static HTAB *lfc_hash; static int lfc_desc = -1; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; +static int lfc_prewarm_limit; +static int lfc_prewarm_batch; static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG; static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK; static char *lfc_path; static uint64 lfc_generation; static FileCacheControl *lfc_ctl; +static bool lfc_do_prewarm; static shmem_startup_hook_type prev_shmem_startup_hook; #if PG_VERSION_NUM>=150000 static shmem_request_hook_type prev_shmem_request_hook; #endif +bool lfc_store_prefetch_result; +bool lfc_prewarm_update_ws_estimation; + #define LFC_ENABLED() (lfc_ctl->limit != 0) /* @@ -500,6 +526,17 @@ lfc_init(void) NULL, NULL); + DefineCustomBoolVariable("neon.prewarm_update_ws_estimation", + "Consider prewarmed pages for working set estimation", + NULL, + 
&lfc_prewarm_update_ws_estimation, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + DefineCustomIntVariable("neon.max_file_cache_size", "Maximal size of Neon local file cache", NULL, @@ -550,6 +587,32 @@ lfc_init(void) lfc_change_chunk_size, NULL); + DefineCustomIntVariable("neon.file_cache_prewarm_limit", + "Maximal number of prewarmed chunks", + NULL, + &lfc_prewarm_limit, + INT_MAX, /* no limit by default */ + 0, + INT_MAX, + PGC_SIGHUP, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("neon.file_cache_prewarm_batch", + "Number of pages retrivied by prewarm from page server", + NULL, + &lfc_prewarm_batch, + 64, + 1, + INT_MAX, + PGC_SIGHUP, + 0, + NULL, + NULL, + NULL); + if (lfc_max_size == 0) return; @@ -563,6 +626,314 @@ lfc_init(void) #endif } +FileCacheState* +lfc_get_state(size_t max_entries) +{ + FileCacheState* fcs = NULL; + + if (lfc_maybe_disabled() || max_entries == 0) /* fast exit if file cache is disabled */ + return NULL; + + LWLockAcquire(lfc_lock, LW_SHARED); + + if (LFC_ENABLED()) + { + dlist_iter iter; + size_t i = 0; + uint8* bitmap; + size_t n_pages = 0; + size_t n_entries = Min(max_entries, lfc_ctl->used - lfc_ctl->pinned); + size_t state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries); + fcs = (FileCacheState*)palloc0(state_size); + SET_VARSIZE(fcs, state_size); + fcs->magic = FILE_CACHE_STATE_MAGIC; + fcs->chunk_size_log = lfc_chunk_size_log; + fcs->n_chunks = n_entries; + bitmap = FILE_CACHE_STATE_BITMAP(fcs); + + dlist_reverse_foreach(iter, &lfc_ctl->lru) + { + FileCacheEntry *entry = dlist_container(FileCacheEntry, list_node, iter.cur); + fcs->chunks[i] = entry->key; + for (int j = 0; j < lfc_blocks_per_chunk; j++) + { + if (GET_STATE(entry, j) != UNAVAILABLE) + { + BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j); + n_pages += 1; + } + } + if (++i == n_entries) + break; + } + Assert(i == n_entries); + fcs->n_pages = n_pages; + Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages); + elog(LOG, "LFC: save state of %d chunks %d pages", (int)n_entries, (int)n_pages); + } + + LWLockRelease(lfc_lock); + + return fcs; +} + +/* + * Prewarm LFC cache to the specified state. It uses lfc_prefetch function to load prewarmed page without hoilding shared buffer lock + * and avoid race conditions with other backends. + */ +void +lfc_prewarm(FileCacheState* fcs, uint32 n_workers) +{ + size_t fcs_chunk_size_log; + size_t n_entries; + size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size); + size_t fcs_size; + dsm_segment *seg; + BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS]; + + + if (!lfc_ensure_opened()) + return; + + if (prewarm_batch == 0 || lfc_prewarm_limit == 0 || n_workers == 0) + { + elog(LOG, "LFC: prewarm is disabled"); + return; + } + + if (n_workers > MAX_PREWARM_WORKERS) + { + elog(ERROR, "LFC: Too much prewarm workers, maximum is %d", MAX_PREWARM_WORKERS); + } + + if (fcs == NULL || fcs->n_chunks == 0) + { + elog(LOG, "LFC: nothing to prewarm"); + return; + } + + if (fcs->magic != FILE_CACHE_STATE_MAGIC) + { + elog(ERROR, "LFC: Invalid file cache state magic: %X", fcs->magic); + } + + fcs_size = VARSIZE(fcs); + if (FILE_CACHE_STATE_SIZE(fcs) != fcs_size) + { + elog(ERROR, "LFC: Invalid file cache state size: %u vs. 
%u", (unsigned)FILE_CACHE_STATE_SIZE(fcs), VARSIZE(fcs)); + } + + fcs_chunk_size_log = fcs->chunk_size_log; + if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG) + { + elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log); + } + + n_entries = Min(fcs->n_chunks, lfc_prewarm_limit); + Assert(n_entries != 0); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + /* Do not prewarm more entries than LFC limit */ + if (lfc_ctl->limit <= lfc_ctl->size) + { + elog(LOG, "LFC: skip prewarm because LFC is already filled"); + LWLockRelease(lfc_lock); + return; + } + + if (lfc_ctl->prewarm_active) + { + LWLockRelease(lfc_lock); + elog(ERROR, "LFC: skip prewarm because another prewarm is still active"); + } + lfc_ctl->n_prewarm_entries = n_entries; + lfc_ctl->n_prewarm_workers = n_workers; + lfc_ctl->prewarm_active = true; + lfc_ctl->prewarm_canceled = false; + lfc_ctl->prewarm_batch = prewarm_batch; + memset(lfc_ctl->prewarm_workers, 0, n_workers*sizeof(PrewarmWorkerState)); + + LWLockRelease(lfc_lock); + + /* Calculate total number of pages to be prewarmed */ + lfc_ctl->total_prewarm_pages = fcs->n_pages; + + seg = dsm_create(fcs_size, 0); + memcpy(dsm_segment_address(seg), fcs, fcs_size); + lfc_ctl->prewarm_lfc_state_handle = dsm_segment_handle(seg); + + /* Spawn background workers */ + for (uint32 i = 0; i < n_workers; i++) + { + BackgroundWorker worker = {0}; + + worker.bgw_flags = BGWORKER_SHMEM_ACCESS; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + worker.bgw_restart_time = BGW_NEVER_RESTART; + strcpy(worker.bgw_library_name, "neon"); + strcpy(worker.bgw_function_name, "lfc_prewarm_main"); + snprintf(worker.bgw_name, BGW_MAXLEN, "LFC prewarm worker %d", i+1); + strcpy(worker.bgw_type, "LFC prewarm worker"); + worker.bgw_main_arg = Int32GetDatum(i); + /* must set notify PID to wait for shutdown */ + worker.bgw_notify_pid = MyProcPid; + + if (!RegisterDynamicBackgroundWorker(&worker, &bgw_handle[i])) + { + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("LFC: registering dynamic bgworker prewarm failed"), + errhint("Consider increasing the configuration parameter \"%s\".", "max_worker_processes"))); + n_workers = i; + lfc_ctl->prewarm_canceled = true; + break; + } + } + + for (uint32 i = 0; i < n_workers; i++) + { + while (true) + { + PG_TRY(); + { + BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]); + if (status != BGWH_STOPPED && status != BGWH_POSTMASTER_DIED) + { + elog(LOG, "LFC: Unexpected status of prewarm worker termination: %d", status); + } + break; + } + PG_CATCH(); + { + elog(LOG, "LFC: cancel prewarm"); + lfc_ctl->prewarm_canceled = true; + } + PG_END_TRY(); + } + if (!lfc_ctl->prewarm_workers[i].completed) + { + /* Background worker doesn't set completion time: it means that it was abnormally terminated */ + elog(LOG, "LFC: prewarm worker %d failed", i+1); + /* Set completion time to prevent get_prewarm_info from considering this worker as active */ + lfc_ctl->prewarm_workers[i].completed = GetCurrentTimestamp(); + } + } + dsm_detach(seg); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + lfc_ctl->prewarm_active = false; + LWLockRelease(lfc_lock); +} + +void +lfc_prewarm_main(Datum main_arg) +{ + size_t snd_idx = 0, rcv_idx = 0; + size_t n_sent = 0, n_received = 0; + size_t fcs_chunk_size_log; + size_t max_prefetch_pages; + size_t prewarm_batch; + size_t n_workers; + dsm_segment *seg; + FileCacheState* fcs; + uint8* bitmap; + BufferTag tag; + PrewarmWorkerState* ws; + uint32 worker_id = DatumGetInt32(main_arg); + + pqsignal(SIGTERM, 
die); + BackgroundWorkerUnblockSignals(); + + seg = dsm_attach(lfc_ctl->prewarm_lfc_state_handle); + if (seg == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not map dynamic shared memory segment"))); + + fcs = (FileCacheState*) dsm_segment_address(seg); + prewarm_batch = lfc_ctl->prewarm_batch; + fcs_chunk_size_log = fcs->chunk_size_log; + n_workers = lfc_ctl->n_prewarm_workers; + max_prefetch_pages = lfc_ctl->n_prewarm_entries << fcs_chunk_size_log; + ws = &lfc_ctl->prewarm_workers[worker_id]; + bitmap = FILE_CACHE_STATE_BITMAP(fcs); + + /* enable prefetch in LFC */ + lfc_store_prefetch_result = true; + lfc_do_prewarm = true; /* Flag for lfc_prefetch preventing replacement of existed entries if LFC cache is full */ + + elog(LOG, "LFC: worker %d start prewarming", worker_id); + while (!lfc_ctl->prewarm_canceled) + { + if (snd_idx < max_prefetch_pages) + { + if ((snd_idx >> fcs_chunk_size_log) % n_workers != worker_id) + { + /* If there are multiple workers, split chunks between them */ + snd_idx += 1 << fcs_chunk_size_log; + } + else + { + if (BITMAP_ISSET(bitmap, snd_idx)) + { + tag = fcs->chunks[snd_idx >> fcs_chunk_size_log]; + tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1); + if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum)) + { + (void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); + n_sent += 1; + } + else + { + ws->skipped_pages += 1; + BITMAP_CLR(bitmap, snd_idx); + } + } + snd_idx += 1; + } + } + if (n_sent >= n_received + prewarm_batch || snd_idx == max_prefetch_pages) + { + if (n_received == n_sent && snd_idx == max_prefetch_pages) + { + break; + } + if ((rcv_idx >> fcs_chunk_size_log) % n_workers != worker_id) + { + /* Skip chunks processed by other workers */ + rcv_idx += 1 << fcs_chunk_size_log; + continue; + } + + /* Locate next block to prefetch */ + while (!BITMAP_ISSET(bitmap, rcv_idx)) + { + rcv_idx += 1; + } + tag = fcs->chunks[rcv_idx >> fcs_chunk_size_log]; + tag.blockNum += rcv_idx & ((1 << fcs_chunk_size_log) - 1); + if (communicator_prefetch_receive(tag)) + { + ws->prewarmed_pages += 1; + } + else + { + ws->skipped_pages += 1; + } + rcv_idx += 1; + n_received += 1; + } + } + /* No need to perform prefetch cleanup here because prewarm worker will be terminated and + * connection to PS dropped just after return from this function. + */ + Assert(n_sent == n_received || lfc_ctl->prewarm_canceled); + elog(LOG, "LFC: worker %d complete prewarming: loaded %ld pages", worker_id, (long)n_received); + lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp(); +} + + /* * Check if page is present in the cache. * Returns true if page is found in local cache. @@ -1001,8 +1372,11 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) * If we can't (e.g. because all other slots are being accessed) * then we will remove this entry from the hash and continue * on to the next chunk, as we may not exceed the limit. + * + * While prewarming LFC we do not want to replace existed entries, + * so we just stop prewarm is LFC cache is full. 
*/ - else if (!dlist_is_empty(&lfc_ctl->lru)) + else if (!dlist_is_empty(&lfc_ctl->lru) && !lfc_do_prewarm) { /* Cache overflow: evict least recently used chunk */ FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, @@ -1026,6 +1400,7 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) /* Can't add this chunk - we don't have the space for it */ hash_search_with_hash_value(lfc_hash, &entry->key, hash, HASH_REMOVE, NULL); + lfc_ctl->prewarm_canceled = true; /* cancel prewarm if LFC limit is reached */ return false; } @@ -1112,9 +1487,11 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - tag.blockNum = blkno; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); - + if (lfc_prewarm_update_ws_estimation) + { + tag.blockNum = blkno; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } if (found) { state = GET_STATE(entry, chunk_offs); diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h index 849558b83d..c7b6b09f72 100644 --- a/pgxn/neon/file_cache.h +++ b/pgxn/neon/file_cache.h @@ -13,6 +13,17 @@ #include "neon_pgversioncompat.h" +typedef struct FileCacheState +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + uint32 magic; + uint32 n_chunks; + uint32 n_pages; + uint16 chunk_size_log; + BufferTag chunks[FLEXIBLE_ARRAY_MEMBER]; + /* followed by bitmap */ +} FileCacheState; + /* GUCs */ extern bool lfc_store_prefetch_result; @@ -32,7 +43,10 @@ extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, extern void lfc_init(void); extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, const void* buffer, XLogRecPtr lsn); +extern FileCacheState* lfc_get_state(size_t max_entries); +extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers); +PGDLLEXPORT void lfc_prewarm_main(Datum main_arg); static inline bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, From c1ff7db1874c6b5ff8ee134b944f1c54616ba37b Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 28 Apr 2025 14:54:26 -0400 Subject: [PATCH 14/76] fix(pageserver): consider tombstones in replorigin (#11752) ## Problem We didn't consider tombstones in replorigin read path in the past. This was fine because tombstones are stored as LSN::Invalid before we universally define what the tombstone is for sparse keyspaces. Now we remove non-inherited keys during detach ancestor and write the universal tombstone "empty image". So we need to consider it across all the read paths. related: https://github.com/neondatabase/neon/pull/11299 ## Summary of changes Empty value gets ignored for replorigin scans. --------- Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 16 +++++- pageserver/src/tenant.rs | 52 ++++++++++++++++++- .../src/tenant/timeline/detach_ancestor.rs | 2 +- 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 81e548a095..ccb48d8bc1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1084,8 +1084,17 @@ impl Timeline { let mut result = HashMap::new(); for (k, v) in kv { let v = v?; + if v.is_empty() { + // This is a tombstone -- we can skip it. + // Originally, the replorigin code uses `Lsn::INVALID` to represent a tombstone. 
However, as it part of + // the sparse keyspace and the sparse keyspace uses an empty image to universally represent a tombstone, + // we also need to consider that. Such tombstones might be written on the detach ancestor code path to + // avoid the value going into the child branch. (See [`crate::tenant::timeline::detach_ancestor::generate_tombstone_image_layer`] for more details.) + continue; + } let origin_id = k.field6 as RepOriginId; - let origin_lsn = Lsn::des(&v).unwrap(); + let origin_lsn = Lsn::des(&v) + .with_context(|| format!("decode replorigin value for {}: {v:?}", origin_id))?; if origin_lsn != Lsn::INVALID { result.insert(origin_id, origin_lsn); } @@ -2578,6 +2587,11 @@ impl DatadirModification<'_> { } } + #[cfg(test)] + pub fn put_for_unit_test(&mut self, key: Key, val: Value) { + self.put(key, val); + } + fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dcd043c4a1..e59db74479 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5947,7 +5947,9 @@ mod tests { use itertools::Itertools; #[cfg(feature = "testing")] use models::CompactLsnRange; - use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; + use pageserver_api::key::{ + AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX, repl_origin_key, + }; use pageserver_api::keyspace::KeySpace; #[cfg(feature = "testing")] use pageserver_api::keyspace::KeySpaceRandomAccum; @@ -8183,6 +8185,54 @@ mod tests { assert_eq!(files.get("pg_logical/mappings/test2"), None); } + #[tokio::test] + async fn test_repl_origin_tombstones() { + let harness = TenantHarness::create("test_repl_origin_tombstones") + .await + .unwrap(); + + let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let repl_lsn = Lsn(0x10); + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification.put_for_unit_test(repl_origin_key(2), Value::Image(Bytes::new())); + modification.set_replorigin(1, repl_lsn).await.unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // we can read everything from the storage + let repl_origins = tline + .get_replorigins(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); + assert_eq!(repl_origins.len(), 1); + assert_eq!(repl_origins[&1], lsn); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification.put_for_unit_test( + repl_origin_key(3), + Value::Image(Bytes::copy_from_slice(b"cannot_decode_this")), + ); + modification.commit(&ctx).await.unwrap(); + } + let result = tline + .get_replorigins(lsn, &ctx, io_concurrency.clone()) + .await; + assert!(result.is_err()); + } + #[tokio::test] async fn test_metadata_image_creation() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_image_creation").await?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 8e95c3a8ff..649b33e294 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -178,7 +178,7 @@ impl Attempt { } } -async fn generate_tombstone_image_layer( +pub(crate) async fn generate_tombstone_image_layer( detached: &Arc, ancestor: &Arc, ancestor_lsn: Lsn, From 
9e8ab2ab4f653d1ca96869a1c52d4aec0deb202c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 28 Apr 2025 14:13:35 -0500 Subject: [PATCH 15/76] Skip remote extensions WITH_LIB test when sanitizers are enabled (#11758) In order for the test to work when sanitizers are enabled, we would need to compile the dummy Postgres extension with the same sanitizer flags that we compile Postgres and the neon extension with. Doing this work would be a little more than trivial, so skipping is the best option, at least for now. Signed-off-by: Tristan Partin --- test_runner/regress/test_download_extensions.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 3b6c94a268..d28240c722 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -14,7 +14,7 @@ from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.paths import BASE_DIR from fixtures.pg_config import PgConfigKey -from fixtures.utils import subprocess_capture +from fixtures.utils import WITH_SANITIZERS, subprocess_capture from werkzeug.wrappers.response import Response if TYPE_CHECKING: @@ -148,6 +148,15 @@ def test_remote_extensions( pg_config: PgConfig, extension: RemoteExtension, ): + if WITH_SANITIZERS and extension is RemoteExtension.WITH_LIB: + pytest.skip( + """ + For this test to work with sanitizers enabled, we would need to + compile the dummy Postgres extension with the same CFLAGS that we + compile Postgres and the neon extension with to link the sanitizers. + """ + ) + # Setup a mock nginx S3 gateway which will return our test extension. (host, port) = httpserver_listen_address extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway" From 3593356c1055f8462011542c561d90b02c84e4eb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 29 Apr 2025 09:44:28 +0300 Subject: [PATCH 16/76] Prewarm sql api (#11742) ## Problem Continue work on prewarm, see https://github.com/neondatabase/neon/pull/11740 https://github.com/neondatabase/neon/pull/11741 ## Summary of changes Add SQL API to prewarm --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/Makefile | 2 + pgxn/neon/file_cache.c | 88 +++++++++++++- pgxn/neon/neon--1.5--1.6.sql | 22 ++++ pgxn/neon/neon--1.6--1.5.sql | 7 ++ test_runner/regress/test_lfc_prewarm.py | 147 ++++++++++++++++++++++++ 5 files changed, 263 insertions(+), 3 deletions(-) create mode 100644 pgxn/neon/neon--1.5--1.6.sql create mode 100644 pgxn/neon/neon--1.6--1.5.sql create mode 100644 test_runner/regress/test_lfc_prewarm.py diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 426b176af9..8bcc6bf924 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -36,6 +36,8 @@ DATA = \ neon--1.2--1.3.sql \ neon--1.3--1.4.sql \ neon--1.4--1.5.sql \ + neon--1.5--1.6.sql \ + neon--1.6--1.5.sql \ neon--1.5--1.4.sql \ neon--1.4--1.3.sql \ neon--1.3--1.2.sql \ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 924e0055c1..ecc55bb540 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -793,8 +793,10 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers) for (uint32 i = 0; i < n_workers; i++) { - while (true) + bool interrupted; + do { + interrupted = false; PG_TRY(); { BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]); @@ -802,15 +804,16 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers) { elog(LOG, "LFC: Unexpected status 
of prewarm worker termination: %d", status); } - break; } PG_CATCH(); { elog(LOG, "LFC: cancel prewarm"); lfc_ctl->prewarm_canceled = true; + interrupted = true; } PG_END_TRY(); - } + } while (interrupted); + if (!lfc_ctl->prewarm_workers[i].completed) { /* Background worker doesn't set completion time: it means that it was abnormally terminated */ @@ -2125,3 +2128,82 @@ approximate_working_set_size(PG_FUNCTION_ARGS) } PG_RETURN_NULL(); } + +PG_FUNCTION_INFO_V1(get_local_cache_state); + +Datum +get_local_cache_state(PG_FUNCTION_ARGS) +{ + size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0); + FileCacheState* fcs = lfc_get_state(max_entries); + if (fcs != NULL) + PG_RETURN_BYTEA_P((bytea*)fcs); + else + PG_RETURN_NULL(); +} + +PG_FUNCTION_INFO_V1(prewarm_local_cache); + +Datum +prewarm_local_cache(PG_FUNCTION_ARGS) +{ + bytea* state = PG_GETARG_BYTEA_PP(0); + uint32 n_workers = PG_GETARG_INT32(1); + FileCacheState* fcs = (FileCacheState*)state; + + lfc_prewarm(fcs, n_workers); + + PG_RETURN_NULL(); +} + +PG_FUNCTION_INFO_V1(get_prewarm_info); + +Datum +get_prewarm_info(PG_FUNCTION_ARGS) +{ + Datum values[4]; + bool nulls[4]; + TupleDesc tupdesc; + uint32 prewarmed_pages = 0; + uint32 skipped_pages = 0; + uint32 active_workers = 0; + uint32 total_pages; + size_t n_workers; + + if (lfc_size_limit == 0) + PG_RETURN_NULL(); + + LWLockAcquire(lfc_lock, LW_SHARED); + if (!lfc_ctl || lfc_ctl->n_prewarm_workers == 0) + { + LWLockRelease(lfc_lock); + PG_RETURN_NULL(); + } + n_workers = lfc_ctl->n_prewarm_workers; + total_pages = lfc_ctl->total_prewarm_pages; + for (size_t i = 0; i < n_workers; i++) + { + PrewarmWorkerState* ws = &lfc_ctl->prewarm_workers[i]; + prewarmed_pages += ws->prewarmed_pages; + skipped_pages += ws->skipped_pages; + active_workers += ws->completed != 0; + } + LWLockRelease(lfc_lock); + + tupdesc = CreateTemplateTupleDesc(4); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_pages", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prewarmed_pages", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "skipped_pages", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "active_workers", INT4OID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(total_pages); + values[1] = Int32GetDatum(prewarmed_pages); + values[2] = Int32GetDatum(skipped_pages); + values[3] = Int32GetDatum(active_workers); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + diff --git a/pgxn/neon/neon--1.5--1.6.sql b/pgxn/neon/neon--1.5--1.6.sql new file mode 100644 index 0000000000..c05f0f87aa --- /dev/null +++ b/pgxn/neon/neon--1.5--1.6.sql @@ -0,0 +1,22 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.6'" to load this file. 
\quit + +CREATE FUNCTION get_prewarm_info(out total_pages integer, out prewarmed_pages integer, out skipped_pages integer, out active_workers integer) +RETURNS record +AS 'MODULE_PATHNAME', 'get_prewarm_info' +LANGUAGE C STRICT +PARALLEL SAFE; + +CREATE FUNCTION get_local_cache_state(max_chunks integer default null) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_local_cache_state' +LANGUAGE C +PARALLEL UNSAFE; + +CREATE FUNCTION prewarm_local_cache(state bytea, n_workers integer default 1) +RETURNS void +AS 'MODULE_PATHNAME', 'prewarm_local_cache' +LANGUAGE C STRICT +PARALLEL UNSAFE; + + + diff --git a/pgxn/neon/neon--1.6--1.5.sql b/pgxn/neon/neon--1.6--1.5.sql new file mode 100644 index 0000000000..57512980f5 --- /dev/null +++ b/pgxn/neon/neon--1.6--1.5.sql @@ -0,0 +1,7 @@ +DROP FUNCTION IF EXISTS get_prewarm_info(out total_pages integer, out prewarmed_pages integer, out skipped_pages integer, out active_workers integer); + +DROP FUNCTION IF EXISTS get_local_cache_state(max_chunks integer); + +DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea, n_workers integer default 1); + + diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py new file mode 100644 index 0000000000..dd0ae1921d --- /dev/null +++ b/test_runner/regress/test_lfc_prewarm.py @@ -0,0 +1,147 @@ +import random +import threading +import time + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import USE_LFC + + +def check_pinned_entries(cur): + # some LFC buffer can be temporary locked by autovacuum or background writer + for _ in range(10): + cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'") + n_pinned = cur.fetchall()[0][0] + if n_pinned == 0: + break + time.sleep(1) + assert n_pinned == 0 + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_lfc_prewarm(neon_simple_env: NeonEnv): + env = neon_simple_env + n_records = 1000000 + + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon version '1.6'") + cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") + cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") + cur.execute("select get_local_cache_state()") + lfc_state = cur.fetchall()[0][0] + + endpoint.stop() + endpoint.start() + + conn = endpoint.connect() + cur = conn.cursor() + time.sleep(1) # wait until compute_ctl complete downgrade of extension to default version + cur.execute("alter extension neon update to '1.6'") + cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + + cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'") + lfc_used_pages = cur.fetchall()[0][0] + log.info(f"Used LFC size: {lfc_used_pages}") + cur.execute("select * from get_prewarm_info()") + prewarm_info = cur.fetchall()[0] + log.info(f"Prewarm info: {prewarm_info}") + log.info(f"Prewarm progress: {(prewarm_info[1] + prewarm_info[2]) * 100 // prewarm_info[0]}%") + + assert lfc_used_pages > 10000 + assert ( + prewarm_info[0] > 0 + and prewarm_info[1] > 0 + and prewarm_info[0] == prewarm_info[1] + prewarm_info[2] + ) + + cur.execute("select sum(pk) from t") + assert cur.fetchall()[0][0] == n_records * 
(n_records + 1) / 2 + + check_pinned_entries(cur) + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv): + env = neon_simple_env + n_records = 10000 + n_threads = 4 + + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000000", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon version '1.6'") + cur.execute( + "create table accounts(id integer primary key, balance bigint default 0, payload text default repeat('?', 1000)) with (fillfactor=10)" + ) + cur.execute(f"insert into accounts(id) values (generate_series(1,{n_records}))") + cur.execute("select get_local_cache_state()") + lfc_state = cur.fetchall()[0][0] + + running = True + + def workload(): + conn = endpoint.connect() + cur = conn.cursor() + n_transfers = 0 + while running: + src = random.randint(1, n_records) + dst = random.randint(1, n_records) + cur.execute("update accounts set balance=balance-100 where id=%s", (src,)) + cur.execute("update accounts set balance=balance+100 where id=%s", (dst,)) + n_transfers += 1 + log.info(f"Number of transfers: {n_transfers}") + + def prewarm(): + conn = endpoint.connect() + cur = conn.cursor() + n_prewarms = 0 + while running: + cur.execute("alter system set neon.file_cache_size_limit='1MB'") + cur.execute("select pg_reload_conf()") + cur.execute("alter system set neon.file_cache_size_limit='1GB'") + cur.execute("select pg_reload_conf()") + cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + n_prewarms += 1 + log.info(f"Number of prewarms: {n_prewarms}") + + workload_threads = [] + for _ in range(n_threads): + t = threading.Thread(target=workload) + workload_threads.append(t) + t.start() + + prewarm_thread = threading.Thread(target=prewarm) + prewarm_thread.start() + + time.sleep(20) + + running = False + for t in workload_threads: + t.join() + prewarm_thread.join() + + cur.execute("select sum(balance) from accounts") + total_balance = cur.fetchall()[0][0] + assert total_balance == 0 + + check_pinned_entries(cur) From d15f2ff57a3a9aefe0a10ea034d977e83b90504f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 29 Apr 2025 10:56:44 +0200 Subject: [PATCH 17/76] fix(lint-release-pr): adjust lint and action to match (#11766) ## Problem The `lint-release-pr` workflow run for https://github.com/neondatabase/neon/pull/11763 failed, because the new action did not match the lint. ## Summary of changes Include time in expected merge message regex. --- .github/scripts/lint-release-pr.sh | 2 +- .github/workflows/release.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/lint-release-pr.sh b/.github/scripts/lint-release-pr.sh index 6dc5b99f0e..d3badf9562 100755 --- a/.github/scripts/lint-release-pr.sh +++ b/.github/scripts/lint-release-pr.sh @@ -41,7 +41,7 @@ echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}" LAST_COMMIT=$(git rev-parse HEAD) MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}") -EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2}$" +EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2} UTC$" if ! 
[[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then report_error "Merge commit message does not match expected pattern: ' release YYYY-MM-DD' diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4b19d6aa3f..0f97cf7c87 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -60,7 +60,7 @@ jobs: git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - name: Create release PR - uses: neondatabase/dev-actions/release-pr@02b41460646b70d12dd33e5f56ebc5af2384c993 + uses: neondatabase/dev-actions/release-pr@290dec821d86fa8a93f019e8c69720f5865b5677 with: component: ${{ inputs.component }} cherry-pick: ${{ inputs.cherry-pick }} From 7f8b1d79c015ba1158883c689d8ee230426194da Mon Sep 17 00:00:00 2001 From: Busra Kugler Date: Tue, 29 Apr 2025 11:02:01 +0200 Subject: [PATCH 18/76] Replace dorny/paths-filter with step-security maintained version (#11663) ## Problem Our CI/CD security tool StepSecurity maintains safer forks of popular GitHub Actions with low security scores. We're replacing dorny/paths-filter with the maintained step-security/paths-filter version to reduce risk of supply chain breaches and potential CVEs. ## Summary of changes replace ```uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 ``` with ```uses: step-security/paths-filter@v3``` This PR will fix: neondatabase/cloud#26141 --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1791cddacc..6c025ad2a9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -69,7 +69,7 @@ jobs: submodules: true - name: Check for file changes - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 + uses: step-security/paths-filter@v3 id: files-changed with: token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 79467c8f95..9c504eb5bf 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -53,7 +53,7 @@ jobs: submodules: true - name: Check for Postgres changes - uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3 + uses: step-security/paths-filter@v3 id: files_changed with: token: ${{ github.token }} From 498d852bde2c1b20761f5ca588bcc64c86fce282 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 29 Apr 2025 11:12:14 +0200 Subject: [PATCH 19/76] Fix the empty region if run on schedule (#11764) ## Problem When the workflow ran on a schedule, the `region_id` input was not set. As a result, an empty region value was used, which caused errors during execution. ## Summary of Changes - Added fallback logic to set a default region (`aws-us-east-2`) when `region_id` is not provided. - Ensures the workflow works correctly both when triggered manually (`workflow_dispatch`) and on schedule (`cron`). 
--- .github/workflows/cloud-extensions.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cloud-extensions.yml b/.github/workflows/cloud-extensions.yml index 7d60469f92..4114f0f9b4 100644 --- a/.github/workflows/cloud-extensions.yml +++ b/.github/workflows/cloud-extensions.yml @@ -68,7 +68,7 @@ jobs: id: create-neon-project uses: ./.github/actions/neon-project-create with: - region_id: ${{ inputs.region_id }} + region_id: ${{ inputs.region_id || 'aws-us-east-2' }} postgres_version: ${{ matrix.pg-version }} project_settings: ${{ steps.project-settings.outputs.settings }} # We need these settings to get the expected output results. From b3db7f66ac39f072502b41fd6723e7753a0c37e6 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Tue, 29 Apr 2025 14:49:16 +0300 Subject: [PATCH 20/76] fix(compute): Change the local_proxy log level (#11770) Related to the INC-496 --- compute/vm-image-spec-bookworm.yaml | 2 +- compute/vm-image-spec-bullseye.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index ec24d73242..057099994a 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -22,7 +22,7 @@ commands: - name: local_proxy user: postgres sysvInitAction: respawn - shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' - name: postgres-exporter user: nobody sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index b40bdecebc..d048e20b2e 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -22,7 +22,7 @@ commands: - name: local_proxy user: postgres sysvInitAction: respawn - shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' - name: postgres-exporter user: nobody sysvInitAction: respawn From 0b3592921179fee85ab28725a6601213d195ba53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 29 Apr 2025 14:46:15 +0200 Subject: [PATCH 21/76] Make SafekeeperReconciler parallel via semaphore (#11757) Right now we only support running one reconciliation per safekeeper. This is of course usually way below of what a safekeeper can do. Therefore, introduce a semaphore and spawn the tasks asynchronously as they come in. 
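For reference, the general shape of this pattern with a tokio semaphore (clone an `Arc<Semaphore>`, take an owned permit per request, let the permit drop when the spawned task finishes) is sketched below. The names and the request payload are illustrative placeholders, not the storage controller's real `ScheduleRequest` type.

```rust
use std::sync::Arc;

use tokio::sync::{Semaphore, mpsc};

type Request = u64; // placeholder payload, not the real ScheduleRequest

async fn run_reconciler(mut rx: mpsc::UnboundedReceiver<Request>, concurrency: usize) {
    let limiter = Arc::new(Semaphore::new(concurrency));
    while let Some(req) = rx.recv().await {
        // Wait here only until a slot frees up; the owned permit moves into
        // the spawned task and is released when that task finishes.
        let permit = limiter
            .clone()
            .acquire_owned()
            .await
            .expect("semaphore closed");
        tokio::spawn(async move {
            let _permit = permit;
            // ... perform the actual reconciliation for `req` here ...
            println!("reconciled request {req}");
        });
    }
    // Channel closed: wait for in-flight tasks by reacquiring every permit.
    let _drain = limiter
        .acquire_many(concurrency as u32)
        .await
        .expect("semaphore closed");
}

#[tokio::main]
async fn main() {
    let (tx, rx) = mpsc::unbounded_channel();
    for i in 0..10 {
        tx.send(i).unwrap();
    }
    drop(tx); // closing the sender lets the dispatch loop drain and return
    run_reconciler(rx, 4).await;
}
```

Using `acquire_owned` ties each permit to its task rather than to the dispatch loop, so the loop keeps pulling requests while at most `concurrency` reconciliations run at once per safekeeper.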
Part of #11670 --- storage_controller/src/main.rs | 10 +++- storage_controller/src/service.rs | 4 ++ .../src/service/safekeeper_reconciler.rs | 59 +++++++++++++------ 3 files changed, 55 insertions(+), 18 deletions(-) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 71dde9e126..2eea2f9d10 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -19,7 +19,8 @@ use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, - PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, Service, + PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT, Service, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -132,6 +133,10 @@ struct Cli { #[arg(long)] priority_reconciler_concurrency: Option, + /// Maximum number of safekeeper reconciliations that may run in parallel (per safekeeper) + #[arg(long)] + safekeeper_reconciler_concurrency: Option, + /// Tenant API rate limit, as requests per second per tenant. #[arg(long, default_value = "10")] tenant_rate_limit: NonZeroU32, @@ -403,6 +408,9 @@ async fn async_main() -> anyhow::Result<()> { priority_reconciler_concurrency: args .priority_reconciler_concurrency .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), + safekeeper_reconciler_concurrency: args + .safekeeper_reconciler_concurrency + .unwrap_or(SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT), tenant_rate_limit: args.tenant_rate_limit, split_threshold: args.split_threshold, max_split_shards: args.max_split_shards, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 0f71a87f13..50ce559cc0 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -194,6 +194,7 @@ pub(crate) enum LeadershipStatus { pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; +pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly @@ -382,6 +383,9 @@ pub struct Config { /// How many high-priority Reconcilers may be spawned concurrently pub priority_reconciler_concurrency: usize, + /// How many safekeeper reconciles may happen concurrently (per safekeeper) + pub safekeeper_reconciler_concurrency: usize, + /// How many API requests per second to allow per tenant, across all /// tenant-scoped API endpoints. Further API requests queue until ready. 
pub tenant_rate_limit: NonZeroU32, diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index b15772a36c..74308cabff 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -3,7 +3,10 @@ use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration}; use clashmap::{ClashMap, Entry}; use safekeeper_api::models::PullTimelineRequest; use safekeeper_client::mgmt_api; -use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender}; +use tokio::sync::{ + Semaphore, + mpsc::{self, UnboundedReceiver, UnboundedSender}, +}; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::{ @@ -206,18 +209,27 @@ impl ReconcilerHandle { } pub(crate) struct SafekeeperReconciler { - service: Arc, + inner: SafekeeperReconcilerInner, + concurrency_limiter: Arc, rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>, cancel: CancellationToken, } +/// Thin wrapper over `Service` to not clutter its inherent functions +#[derive(Clone)] +struct SafekeeperReconcilerInner { + service: Arc, +} + impl SafekeeperReconciler { fn spawn(cancel: CancellationToken, service: Arc) -> ReconcilerHandle { // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking. let (tx, rx) = mpsc::unbounded_channel(); + let concurrency = service.config.safekeeper_reconciler_concurrency; let mut reconciler = SafekeeperReconciler { - service, + inner: SafekeeperReconcilerInner { service }, rx, + concurrency_limiter: Arc::new(Semaphore::new(concurrency)), cancel: cancel.clone(), }; let handle = ReconcilerHandle { @@ -230,31 +242,44 @@ impl SafekeeperReconciler { } async fn run(&mut self) { loop { - // TODO add parallelism with semaphore here let req = tokio::select! { req = self.rx.recv() => req, _ = self.cancel.cancelled() => break, }; let Some((req, req_cancel)) = req else { break }; + + let permit_res = tokio::select! { + req = self.concurrency_limiter.clone().acquire_owned() => req, + _ = self.cancel.cancelled() => break, + }; + let Ok(_permit) = permit_res else { return }; + + let inner = self.inner.clone(); if req_cancel.is_cancelled() { continue; } - let kind = req.kind; - let tenant_id = req.tenant_id; - let timeline_id = req.timeline_id; - let node_id = req.safekeeper.skp.id; - self.reconcile_one(req, req_cancel) - .instrument(tracing::info_span!( - "reconcile_one", - ?kind, - %tenant_id, - ?timeline_id, - %node_id, - )) - .await; + tokio::task::spawn(async move { + let kind = req.kind; + let tenant_id = req.tenant_id; + let timeline_id = req.timeline_id; + let node_id = req.safekeeper.skp.id; + inner + .reconcile_one(req, req_cancel) + .instrument(tracing::info_span!( + "reconcile_one", + ?kind, + %tenant_id, + ?timeline_id, + %node_id, + )) + .await; + }); } } +} + +impl SafekeeperReconcilerInner { async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) { let req_host = req.safekeeper.skp.host.clone(); match req.kind { From 09247de8d508be4d65d74d1879ccdfa930ec4794 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 29 Apr 2025 13:11:24 +0000 Subject: [PATCH 22/76] proxy: Enable JSON logging by default (#11772) This does not affect local_proxy. 
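A minimal sketch of what the new default means in practice (illustrative only, not the proxy's actual logging setup): the `LOGFMT` variable shown in the README examples below selects the format, and in this sketch anything other than `text`, including an unset variable, falls back to JSON.

```rust
// Minimal sketch: choose a log format from the LOGFMT environment variable,
// with JSON as the default. Handling of unknown values is simplified here;
// only the default flip is being illustrated.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
enum LogFormat {
    Text,
    #[default]
    Json,
}

fn log_format_from_env() -> LogFormat {
    match std::env::var("LOGFMT").as_deref() {
        Ok("text") => LogFormat::Text,
        // Unset or unrecognized values use the default (JSON) in this sketch.
        _ => LogFormat::default(),
    }
}

fn main() {
    println!("selected log format: {:?}", log_format_from_env());
}
```

With this behaviour, omitting `LOGFMT` yields JSON logs, and `LOGFMT=text` (as used in the README's local development commands) restores the human-readable output.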
--- proxy/README.md | 4 ++-- proxy/src/logging.rs | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/proxy/README.md b/proxy/README.md index 1156bfd352..583db36f28 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -32,7 +32,7 @@ To play with it locally one may start proxy over a local postgres installation (see end of this page on how to generate certs with openssl): ``` -./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444 +LOGFMT=text ./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444 ``` If both postgres and proxy are running you may send a SQL query: @@ -130,7 +130,7 @@ openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key Then we need to build proxy with 'testing' feature and run, e.g.: ```sh -RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key +RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key ``` Now from client you can start a new session: diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index b83b03bc4f..efa3c0b514 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -132,11 +132,10 @@ impl Drop for LoggingGuard { } } -// TODO: make JSON the default #[derive(Copy, Clone, PartialEq, Eq, Default, Debug)] enum LogFormat { + Text, #[default] - Text = 1, Json, } From 768a580373f1a60fff77a8e385fef040b9c261ef Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 29 Apr 2025 15:07:23 +0100 Subject: [PATCH 23/76] pageserver: add not modified since lsn to get page span (#11774) It's useful when debugging. --- pageserver/src/page_service.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d1a210a786..0ce1a99681 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1035,10 +1035,25 @@ impl PageServerHandler { // avoid a somewhat costly Span::record() by constructing the entire span in one go. macro_rules! 
mkspan { (before shard routing) => {{ - tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn) + tracing::info_span!( + parent: &parent_span, + "handle_get_page_request", + rel = %req.rel, + blkno = %req.blkno, + req_lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since + ) }}; ($shard_id:expr) => {{ - tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id) + tracing::info_span!( + parent: &parent_span, + "handle_get_page_request", + rel = %req.rel, + blkno = %req.blkno, + req_lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since, + shard_id = %$shard_id + ) }}; } @@ -1102,6 +1117,7 @@ impl PageServerHandler { shard_id = %shard.get_shard_identity().shard_slug(), timeline_id = %timeline_id, lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since, request_id = %req.hdr.reqid, key = %key, ) From a2adc7dbd380fadaa60255eac2c070243a01a7fe Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Apr 2025 16:31:52 +0100 Subject: [PATCH 24/76] storcon: avoid multiple initdbs when shard 0 has stale locations (#11760) ## Problem In #11727 I overlooked the case of multiple attached locations for shard 0. I misread the code and thought `create_one` acts on one location, but it actually acts on one _shard_, which is potentially multiple locations. This was not a regression, but it meant that the fix was incomplete. ## Summary of changes - In `create_one`, when updating shard zero, have any "other" locations use the initdb from shard 0 --- storage_controller/src/service.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 50ce559cc0..72379f0810 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3663,7 +3663,7 @@ impl Service { locations: ShardMutationLocations, http_client: reqwest::Client, jwt: Option, - create_req: TimelineCreateRequest, + mut create_req: TimelineCreateRequest, ) -> Result { let latest = locations.latest.node; @@ -3682,6 +3682,15 @@ impl Service { .await .map_err(|e| passthrough_api_error(&latest, e))?; + // If we are going to create the timeline on some stale locations for shard 0, then ask them to re-use + // the initdb generated by the latest location, rather than generating their own. This avoids racing uploads + // of initdb to S3 which might not be binary-identical if different pageservers have different postgres binaries. + if tenant_shard_id.is_shard_zero() { + if let models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } = &mut create_req.mode { + *existing_initdb_timeline_id = Some(create_req.new_timeline_id); + } + } + // We propagate timeline creations to all attached locations such that a compute // for the new timeline is able to start regardless of the current state of the // tenant shard reconciliation. From a08c1a23eb378be1b5b22a19737b0c7f855d30a2 Mon Sep 17 00:00:00 2001 From: Elizabeth Murray <52375559+bizwark@users.noreply.github.com> Date: Tue, 29 Apr 2025 09:50:18 -0700 Subject: [PATCH 25/76] Upgrade the pgrag version in the compute Dockerfile. (#11687) Update the compute Dockerfile to use a new version of pgrag. The new version of pgrag uses the latest pgrx, and has a fix that terminates background workers on postmaster exit. 
--- compute/compute-node.Dockerfile | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index b9299eee90..267940a405 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1084,7 +1084,18 @@ RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root +######################################################################################### +# +# Layer "rust extensions pgrx14" +# +######################################################################################### +FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14 +ARG PG_VERSION +RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ + /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + +USER root ######################################################################################### # # Layers "pg-onnx-build" and "pgrag-build" @@ -1100,11 +1111,11 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar. mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ echo "#nothing to test here" > neon-test.sh -RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ - echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz && \ + echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \ mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . -FROM rust-extensions-build-pgrx12 AS pgrag-build +FROM rust-extensions-build-pgrx14 AS pgrag-build COPY --from=pgrag-src /ext-src/ /ext-src/ # Install build-time dependencies @@ -1124,19 +1135,19 @@ RUN . 
venv/bin/activate && \ WORKDIR /ext-src/pgrag-src RUN cd exts/rag && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control RUN cd exts/rag_bge_small_en_v15 && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ cargo pgrx install --release --features remote_onnx && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control RUN cd exts/rag_jina_reranker_v1_tiny_en && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ cargo pgrx install --release --features remote_onnx && \ From 1d06172d59caa81e36dce0a17610d80067be4c54 Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 29 Apr 2025 19:34:56 +0100 Subject: [PATCH 26/76] pageserver: remove resident size from billing metrics (#11699) This is a rebase of PR #10739 by @henryliu2014 on the current main branch. ## Problem pageserver: remove resident size from billing metrics Fixes #10388 ## Summary of changes The following changes have been made to remove resident size from billing metrics: * removed the metric "resident_size" and related codes in consumption_metrics/metrics.rs * removed the item of the description of metric "resident_size" in consumption_metrics.md * refactored the metric "resident_size" related test case Requested by: John Spray (john@neon.tech) --------- Co-authored-by: liuheqing Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: John Spray --- docs/consumption_metrics.md | 5 --- pageserver/src/consumption_metrics/metrics.rs | 42 ++----------------- .../src/consumption_metrics/metrics/tests.rs | 10 +---- pageserver/src/consumption_metrics/upload.rs | 8 +--- .../test_pageserver_metric_collection.py | 1 - 5 files changed, 7 insertions(+), 59 deletions(-) diff --git a/docs/consumption_metrics.md b/docs/consumption_metrics.md index 6bcd28ab10..eb211af646 100644 --- a/docs/consumption_metrics.md +++ b/docs/consumption_metrics.md @@ -38,11 +38,6 @@ Currently, the following metrics are collected: Amount of WAL produced , by a timeline, i.e. last_record_lsn This is an absolute, per-timeline metric. -- `resident_size` - -Size of all the layer files in the tenant's directory on disk on the pageserver. -This is an absolute, per-tenant metric. - - `remote_storage_size` Size of the remote storage (S3) directory. 
diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 08ab69f349..acdf514101 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -30,9 +30,6 @@ pub(super) enum Name { /// Tenant remote size #[serde(rename = "remote_storage_size")] RemoteSize, - /// Tenant resident size - #[serde(rename = "resident_size")] - ResidentSize, /// Tenant synthetic size #[serde(rename = "synthetic_storage_size")] SyntheticSize, @@ -187,18 +184,6 @@ impl MetricsKey { .absolute_values() } - /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`. - /// - /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size - const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory { - MetricsKey { - tenant_id, - timeline_id: None, - metric: Name::ResidentSize, - } - .absolute_values() - } - /// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`]. /// /// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size @@ -261,10 +246,7 @@ where let mut tenants = std::pin::pin!(tenants); while let Some((tenant_id, tenant)) = tenants.next().await { - let mut tenant_resident_size = 0; - let timelines = tenant.list_timelines(); - let timelines_len = timelines.len(); for timeline in timelines { let timeline_id = timeline.timeline_id; @@ -287,16 +269,9 @@ where continue; } } - - tenant_resident_size += timeline.resident_physical_size(); } - if timelines_len == 0 { - // Force set it to 1 byte to avoid not being reported -- all timelines are offloaded. - tenant_resident_size = 1; - } - - let snap = TenantSnapshot::collect(&tenant, tenant_resident_size); + let snap = TenantSnapshot::collect(&tenant); snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics); } @@ -305,19 +280,14 @@ where /// In-between abstraction to allow testing metrics without actual Tenants. struct TenantSnapshot { - resident_size: u64, remote_size: u64, synthetic_size: u64, } impl TenantSnapshot { /// Collect tenant status to have metrics created out of it. - /// - /// `resident_size` is calculated of the timelines we had access to for other metrics, so we - /// cannot just list timelines here. 
- fn collect(t: &Arc, resident_size: u64) -> Self { + fn collect(t: &Arc) -> Self { TenantSnapshot { - resident_size, remote_size: t.remote_size(), // Note that this metric is calculated in a separate bgworker // Here we only use cached value, which may lag behind the real latest one @@ -334,8 +304,6 @@ impl TenantSnapshot { ) { let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size); - let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size); - let synthetic_size = { let factory = MetricsKey::synthetic_size(tenant_id); let mut synthetic_size = self.synthetic_size; @@ -355,11 +323,7 @@ impl TenantSnapshot { } }; - metrics.extend( - [Some(remote_size), Some(resident_size), synthetic_size] - .into_iter() - .flatten(), - ); + metrics.extend([Some(remote_size), synthetic_size].into_iter().flatten()); } } diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 52b4fb8680..5cfb361e40 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -224,7 +224,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() { let tenant_id = TenantId::generate(); let ts = TenantSnapshot { - resident_size: 1000, remote_size: 1000, // not yet calculated synthetic_size: 0, @@ -245,7 +244,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() { metrics, &[ MetricsKey::remote_storage_size(tenant_id).at(now, 1000), - MetricsKey::resident_size(tenant_id).at(now, 1000), MetricsKey::synthetic_size(tenant_id).at(now, 1000), ] ); @@ -256,7 +254,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() { let tenant_id = TenantId::generate(); let ts = TenantSnapshot { - resident_size: 1000, remote_size: 1000, // not yet calculated synthetic_size: 0, @@ -274,7 +271,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() { metrics, &[ MetricsKey::remote_storage_size(tenant_id).at(now, 1000), - MetricsKey::resident_size(tenant_id).at(now, 1000), // no synthetic size here ] ); @@ -295,14 +291,13 @@ pub(crate) const fn metric_examples_old( timeline_id: TimelineId, now: DateTime, before: DateTime, -) -> [RawMetric; 6] { +) -> [RawMetric; 5] { [ MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id) .from_until_old_format(before, now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0), - MetricsKey::resident_size(tenant_id).at_old_format(now, 0), MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1), ] } @@ -312,13 +307,12 @@ pub(crate) const fn metric_examples( timeline_id: TimelineId, now: DateTime, before: DateTime, -) -> [NewRawMetric; 6] { +) -> [NewRawMetric; 5] { [ MetricsKey::written_size(tenant_id, timeline_id).at(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0), MetricsKey::remote_storage_size(tenant_id).at(now, 0), - MetricsKey::resident_size(tenant_id).at(now, 0), MetricsKey::synthetic_size(tenant_id).at(now, 1), ] } diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 59e0145a5b..19c5aec5b3 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -521,10 +521,6 @@ mod tests { line!(), 
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#, ), - ( - line!(), - r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#, - ), ( line!(), r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#, @@ -564,7 +560,7 @@ mod tests { assert_eq!(upgraded_samples, new_samples); } - fn metric_samples_old() -> [RawMetric; 6] { + fn metric_samples_old() -> [RawMetric; 5] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); @@ -576,7 +572,7 @@ mod tests { super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before) } - fn metric_samples() -> [NewRawMetric; 6] { + fn metric_samples() -> [NewRawMetric; 5] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index acec0ba44a..ffde08a73f 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -506,7 +506,6 @@ class SyntheticSizeVerifier: PER_METRIC_VERIFIERS = { "remote_storage_size": CannotVerifyAnything, - "resident_size": CannotVerifyAnything, "written_size": WrittenDataVerifier, "written_data_bytes_delta": WrittenDataDeltaVerifier, "timeline_logical_size": CannotVerifyAnything, From b48404952d9658f24f88441b43fb4df7d222b159 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Wed, 30 Apr 2025 04:32:25 -0700 Subject: [PATCH 27/76] Bump vm-builder: v0.42.2 -> v0.46.0 (#11782) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumped to pick up the changes from neondatabase/autoscaling#1366 — specifically including `uname` in the logs. 
Other changes included: * neondatabase/autoscaling#1301 * neondatabase/autoscaling#1296 --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6c025ad2a9..18bec1b461 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -824,7 +824,7 @@ jobs: - pg: v17 debian: bookworm env: - VM_BUILDER_VERSION: v0.42.2 + VM_BUILDER_VERSION: v0.46.0 steps: - name: Harden the runner (Audit all outbound calls) From 8da4ec9740c8f292170babecfdc07148a60cd9f9 Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Wed, 30 Apr 2025 13:01:41 +0100 Subject: [PATCH 28/76] Postgres metrics for stuck getpage requests (#11710) https://github.com/neondatabase/neon/issues/10327 Resolves: #11720 New metrics: - `compute_getpage_stuck_requests_total` - `compute_getpage_max_inflight_stuck_time_ms` --- compute/etc/neon_collector.jsonnet | 2 ++ ...pute_getpage_max_inflight_stuck_time_ms.libsonnet | 9 +++++++++ .../compute_getpage_stuck_requests_total.libsonnet | 9 +++++++++ compute/etc/sql_exporter/neon_perf_counters.sql | 2 ++ pgxn/neon/libpagestore.c | 6 ++++++ pgxn/neon/neon_perf_counters.c | 9 ++++++++- pgxn/neon/neon_perf_counters.h | 12 ++++++++++++ 7 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet create mode 100644 compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index 449e1199d0..e64d907fe4 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -23,6 +23,8 @@ import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', import 'sql_exporter/getpage_prefetches_buffered.libsonnet', import 'sql_exporter/getpage_sync_requests_total.libsonnet', + import 'sql_exporter/compute_getpage_stuck_requests_total.libsonnet', + import 'sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet', import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', import 'sql_exporter/getpage_wait_seconds_count.libsonnet', import 'sql_exporter/getpage_wait_seconds_sum.libsonnet', diff --git a/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet b/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet new file mode 100644 index 0000000000..bc1100c832 --- /dev/null +++ b/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'compute_getpage_max_inflight_stuck_time_ms', + type: 'gauge', + help: 'Max wait time for stuck requests among all backends. 
Includes only active stuck requests, terminated or disconnected ones are not accounted for', + values: [ + 'compute_getpage_max_inflight_stuck_time_ms', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet b/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet new file mode 100644 index 0000000000..5f72f43254 --- /dev/null +++ b/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'compute_getpage_stuck_requests_total', + type: 'counter', + help: 'Total number of Getpage requests left without an answer for more than pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout', + values: [ + 'compute_getpage_stuck_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql index 4a36f3bf2f..39a9d03412 100644 --- a/compute/etc/sql_exporter/neon_perf_counters.sql +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -9,6 +9,8 @@ SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( getpage_wait_seconds_sum numeric, getpage_prefetch_requests_total numeric, getpage_sync_requests_total numeric, + compute_getpage_stuck_requests_total numeric, + compute_getpage_max_inflight_stuck_time_ms numeric, getpage_prefetch_misses_total numeric, getpage_prefetch_discards_total numeric, getpage_prefetches_buffered numeric, diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 5287c12a84..e758841beb 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -877,6 +877,7 @@ retry: int port; int sndbuf; int recvbuf; + uint64* max_wait; get_local_port(PQsocket(pageserver_conn), &port); get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf); @@ -887,7 +888,10 @@ retry: shard->nrequests_sent, shard->nresponses_received, port, sndbuf, recvbuf, pageserver_conn->inStart, pageserver_conn->inEnd); shard->receive_last_log_time = now; + MyNeonCounters->compute_getpage_stuck_requests_total += !shard->receive_logged; shard->receive_logged = true; + max_wait = &MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms; + *max_wait = Max(*max_wait, INSTR_TIME_GET_MILLISEC(since_start)); } /* @@ -910,6 +914,7 @@ retry: get_local_port(PQsocket(pageserver_conn), &port); neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting (socket port=%d)", INSTR_TIME_GET_DOUBLE(since_start), port); + MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0; pageserver_disconnect(shard_no); return -1; } @@ -933,6 +938,7 @@ retry: INSTR_TIME_SET_ZERO(shard->receive_start_time); INSTR_TIME_SET_ZERO(shard->receive_last_log_time); shard->receive_logged = false; + MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0; return ret; } diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index 05db187076..c77d99d636 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -148,7 +148,7 @@ histogram_to_metrics(IOHistogram histogram, static metric_t * neon_perf_counters_to_metrics(neon_per_backend_counters *counters) { -#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10) +#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 12) metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); int i = 0; @@ -166,6 +166,8 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters) 
APPEND_METRIC(getpage_prefetch_requests_total); APPEND_METRIC(getpage_sync_requests_total); + APPEND_METRIC(compute_getpage_stuck_requests_total); + APPEND_METRIC(compute_getpage_max_inflight_stuck_time_ms); APPEND_METRIC(getpage_prefetch_misses_total); APPEND_METRIC(getpage_prefetch_discards_total); APPEND_METRIC(pageserver_requests_sent_total); @@ -294,6 +296,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) totals.file_cache_hits_total += counters->file_cache_hits_total; histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); + + totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total; + totals.compute_getpage_max_inflight_stuck_time_ms = Max( + totals.compute_getpage_max_inflight_stuck_time_ms, + counters->compute_getpage_max_inflight_stuck_time_ms); } metrics = neon_perf_counters_to_metrics(&totals); diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 5f5330bb69..10cf094d4a 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -57,6 +57,18 @@ typedef struct uint64 getpage_prefetch_requests_total; uint64 getpage_sync_requests_total; + /* + * Total number of Getpage requests left without an answer for more than + * pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout + */ + uint64 compute_getpage_stuck_requests_total; + + /* + * Longest waiting time for active stuck requests. If a stuck request gets a + * response or disconnects, this metric is updated + */ + uint64 compute_getpage_max_inflight_stuck_time_ms; + /* * Total number of readahead misses; consisting of either prefetches that * don't satisfy the LSN bounds, or cases where no readahead was issued From 60f63c076f9b6b362600d65e01f3f1f8a0f4a5dd Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 30 Apr 2025 15:23:20 +0300 Subject: [PATCH 29/76] Make safekeeper proto version 3 default (#11518) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem We have been running compute <-> sk protocol version 3 for a while on staging with no issues observed, and want to fully migrate to it eventually. ## Summary of changes Let's make v3 the default. ref https://github.com/neondatabase/neon/issues/10326 --------- Co-authored-by: Arpad Müller --- pgxn/neon/walproposer_pg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index a061639815..17582405db 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,7 +63,7 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; -int safekeeper_proto_version = 2; +int safekeeper_proto_version = 3; /* Set to true in the walproposer bgw. 
*/ static bool am_walproposer; @@ -228,7 +228,7 @@ nwp_register_gucs(void) "Version of compute <-> safekeeper protocol.", "Used while migrating from 2 to 3.", &safekeeper_proto_version, - 2, 0, INT_MAX, + 3, 0, INT_MAX, PGC_POSTMASTER, 0, NULL, NULL, NULL); From 1d68577fbd3c08496dbe7dd716dd5562bc51ca7a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 30 Apr 2025 15:44:59 +0300 Subject: [PATCH 30/76] Check target slot state in prefetch_wait_for (#11779) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1745599814030679 Assume the following scenario: prefetch_wait_for is doing `CHECK_FOR_INTERRUPTS` which tries to load prefetch responses. In case of error it calls pageserver_disconnect which aborts all in-flight requests. But such a failure is not detected by `prefetch_wait_for`, which returns true. As a result `communicator_read_at_lsnv` assumes that the slot is received, but since asserts are disabled in prod, it is not actually checked. Then it tries to interpret the response and ... *SIGSEGV* ## Summary of changes Check target slot state in `prefetch_wait_for`. Resolves https://github.com/neondatabase/cloud/issues/28258 Co-authored-by: Konstantin Knizhnik --- pgxn/neon/communicator.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index 61bb3206e7..818a149499 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -687,8 +687,14 @@ prefetch_wait_for(uint64 ring_index) END_PREFETCH_RECEIVE_WORK(); CHECK_FOR_INTERRUPTS(); } - - return result; + if (result) + { + /* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */ + PrefetchRequest *slot = GetPrfSlot(ring_index); + return slot->status == PRFS_RECEIVED; + } + return false; +; } /* From 6b4b8e0d8be55c7224cbf113fb1717179b53f0e9 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 30 Apr 2025 11:50:12 -0400 Subject: [PATCH 31/76] fix(pageserver): do not increase basebackup err counter when shutdown (#11778) ## Problem We occasionally see basebackup error alerts, but there were no errors logged. Looking at the code, the only codepath that will cause this is shutting down. ## Summary of changes Do not increase any counter (ok/err) when a basebackup request gets cancelled due to shutdown. Signed-off-by: Alex Chi Z --- pageserver/src/metrics.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9a6c3f2378..a68b6acca1 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2180,6 +2180,10 @@ impl BasebackupQueryTimeOngoingRecording<'_> { // If you want to change categorize of a specific error, also change it in `log_query_error`. let metric = match res { Ok(_) => &self.parent.ok, + Err(QueryError::Shutdown) => { + // Do not observe ok/err for shutdown + return; + } Err(QueryError::Disconnected(ConnectionError::Io(io_error))) if is_expected_io_error(io_error) => { From e2db76b9be5fc0de8f953dc4ba9f039ce05cdd95 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 30 Apr 2025 12:04:00 -0400 Subject: [PATCH 32/76] feat(pageserver): ondemand download reason observability (#11780) ## Problem Part of https://github.com/neondatabase/neon/issues/11615 ## Summary of changes We don't understand the root cause of why we get resident size surge every now and then.
This patch adds observability for that, and in the next week, we might have a better understanding of what's going on. --------- Signed-off-by: Alex Chi Z --- pageserver/src/metrics.rs | 18 ++++++++++++++++++ pageserver/src/tenant/storage_layer/layer.rs | 9 +++++++++ 2 files changed, 27 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index a68b6acca1..8e4dbd6c3e 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -497,6 +497,24 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy = Lazy::n .expect("failed to define a metric") }); +pub(crate) static ONDEMAND_DOWNLOAD_BYTES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_ondemand_download_bytes_total", + "Total bytes of layers on-demand downloaded", + &["task_kind"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static ONDEMAND_DOWNLOAD_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_ondemand_download_count", + "Total count of layers on-demand downloaded", + &["task_kind"] + ) + .expect("failed to define a metric") +}); + pub(crate) mod wait_ondemand_download_time { use super::*; const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[ diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b7f6e5dc77..50810cb154 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -4,6 +4,7 @@ use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime}; use crate::PERF_TRACE_TARGET; +use crate::metrics::{ONDEMAND_DOWNLOAD_BYTES, ONDEMAND_DOWNLOAD_COUNT}; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; @@ -1255,6 +1256,14 @@ impl LayerInner { self.access_stats.record_residence_event(); + let task_kind: &'static str = ctx.task_kind().into(); + ONDEMAND_DOWNLOAD_BYTES + .with_label_values(&[task_kind]) + .inc_by(self.desc.file_size); + ONDEMAND_DOWNLOAD_COUNT + .with_label_values(&[task_kind]) + .inc(); + Ok(self.initialize_after_layer_is_on_disk(permit)) } Err(e) => { From bec7427d9e4d84b6bcb6a74338cfb711e1748e8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 30 Apr 2025 18:24:01 +0200 Subject: [PATCH 33/76] pull_timeline and sk logging fixes (#11786) This patch contains some fixes of issues I ran into for #11712: * make `pull_timeline` return success for timeline that already exists. This follows general API design of storage components: API endpoints are retryable and converge to a status code, instead of starting to error. We change the `pull_timeline`'s return type a little bit, because we might not actually have a source sk to pull from. Note that the fix is not enough, there is still a race when two `pull_timeline` instances happen in parallel: we might try to enter both pulled timelines at the same time. That can be fixed later. * make `pull_timeline` support one safekeeper being down. In general, if one safekeeper is down, that's not a problem. the added comment explains a potential situation (found in the `test_lagging_sk` test for example) * don't log very long errors when computes try to connect to safekeepers that don't have the timeline yet, if `allow_timeline_creation` is false. That flag is enabled when a sk connection string with generation numbers is passed to the compute, so we'll hit this code path more often. E.g. 
when a safekeeper missed a timeline creation, but the compute connects to it first before the `pull_timeline` gets requested by the storcon reconciler: this is a perfectly normal situation. So don't log the whole error backtrace, and don't log it on the error log level, but only on info. part of #11670 --- libs/postgres_backend/src/lib.rs | 6 ++++ libs/safekeeper_api/src/models.rs | 5 ++-- safekeeper/src/pull_timeline.rs | 28 ++++++++++++++++--- safekeeper/src/receive_wal.rs | 13 ++++++--- .../src/service/safekeeper_reconciler.rs | 9 +++--- 5 files changed, 47 insertions(+), 14 deletions(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 654dde8da6..714d8ac403 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -841,6 +841,10 @@ impl PostgresBackend { let expected_end = match &end { ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true, + // The timeline doesn't exist and we have been requested to not auto-create it. + // Compute requests for timelines that haven't been created yet + // might reach us before the storcon request to create those timelines. + TimelineNoCreate => true, CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error)) if is_expected_io_error(io_error) => { @@ -1059,6 +1063,8 @@ pub enum CopyStreamHandlerEnd { Terminate, #[error("EOF on COPY stream")] EOF, + #[error("timeline not found, and allow_timeline_creation is false")] + TimelineNoCreate, /// The connection was lost #[error("connection error: {0}")] Disconnected(#[from] ConnectionError), diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 51f88625da..cc31b38fe7 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -303,7 +303,8 @@ pub struct PullTimelineRequest { #[derive(Debug, Serialize, Deserialize)] pub struct PullTimelineResponse { - // Donor safekeeper host - pub safekeeper_host: String, + /// Donor safekeeper host. + /// None if no pull happened because the timeline already exists. + pub safekeeper_host: Option, // TODO: add more fields? } diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 653b084ad8..1510a51019 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -401,7 +401,10 @@ pub async fn handle_request( request.timeline_id, )); if existing_tli.is_ok() { - bail!("Timeline {} already exists", request.timeline_id); + info!("Timeline {} already exists", request.timeline_id); + return Ok(PullTimelineResponse { + safekeeper_host: None, + }); } let mut http_client = reqwest::Client::builder(); @@ -425,8 +428,25 @@ pub async fn handle_request( let mut statuses = Vec::new(); for (i, response) in responses.into_iter().enumerate() { - let status = response.context(format!("fetching status from {}", http_hosts[i]))?; - statuses.push((status, i)); + match response { + Ok(status) => { + statuses.push((status, i)); + } + Err(e) => { + info!("error fetching status from {}: {e}", http_hosts[i]); + } + } + } + + // Allow missing responses from up to one safekeeper (say due to downtime) + // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes + // offline and C comes online. Then we want a pull on C with A and B as hosts to work. + let min_required_successful = (http_hosts.len() - 1).max(1); + if statuses.len() < min_required_successful { + bail!( + "only got {} successful status responses. 
required: {min_required_successful}", + statuses.len() + ) } // Find the most advanced safekeeper @@ -536,6 +556,6 @@ async fn pull_timeline( .await?; Ok(PullTimelineResponse { - safekeeper_host: host, + safekeeper_host: Some(host), }) } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 9975153f6c..eb8eee6ab8 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -32,7 +32,7 @@ use crate::metrics::{ WAL_RECEIVERS, }; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; -use crate::timeline::WalResidentTimeline; +use crate::timeline::{TimelineError, WalResidentTimeline}; const DEFAULT_FEEDBACK_CAPACITY: usize = 8; @@ -357,9 +357,14 @@ impl NetworkReader<'_, IO> { .await .context("create timeline")? } else { - self.global_timelines - .get(self.ttid) - .context("get timeline")? + let timeline_res = self.global_timelines.get(self.ttid); + match timeline_res { + Ok(tl) => tl, + Err(TimelineError::NotFound(_)) => { + return Err(CopyStreamHandlerEnd::TimelineNoCreate); + } + other => other.context("get_timeline")?, + } }; tli.wal_residence_guard().await? } diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index 74308cabff..71c73a0112 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -306,10 +306,11 @@ impl SafekeeperReconcilerInner { req, async |client| client.pull_timeline(&pull_req).await, |resp| { - tracing::info!( - "pulled timeline from {} onto {req_host}", - resp.safekeeper_host, - ); + if let Some(host) = resp.safekeeper_host { + tracing::info!("pulled timeline from {host} onto {req_host}"); + } else { + tracing::info!("timeline already present on safekeeper on {req_host}"); + } }, req_cancel, ) From 1b789e8d7c917898d4069eda8c999f96c5ac0eeb Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Wed, 30 Apr 2025 20:50:21 +0400 Subject: [PATCH 34/76] fix(pgxn/neon): Use proper member size in TermsCollectedMset and VotesCollectedMset (#11785) ## Problem `TermsCollectedMset` and `VotesCollectedMset` accept a MemberSet argument to find a quorum in. It may be either `wp->mconf.members` or `wp->mconf.new_members`. But the loops inside always use `wp->mconf.members.len`. If the sizes of member sets are different, it may lead to these functions not scanning all the safekeepers from `mset`. We are not planning to change the member set size dynamically now, but it's worth fixing anyway. 
- Part of https://github.com/neondatabase/neon/issues/11669 ## Summary of changes - Use proper size of member set in `TermsCollectedMset` and `VotesCollectedMset` --- pgxn/neon/walproposer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index b95b1451e4..f4f1398375 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -836,7 +836,7 @@ TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf { uint32 n_greeted = 0; - for (uint32 i = 0; i < wp->mconf.members.len; i++) + for (uint32 i = 0; i < mset->len; i++) { Safekeeper *sk = msk[i]; @@ -1106,7 +1106,7 @@ VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf { uint32 n_votes = 0; - for (uint32 i = 0; i < wp->mconf.members.len; i++) + for (uint32 i = 0; i < mset->len; i++) { Safekeeper *sk = msk[i]; From 5bd850d15a3aa1034c580521b15b16ab71d961a0 Mon Sep 17 00:00:00 2001 From: Shockingly Good Date: Thu, 1 May 2025 11:09:10 +0200 Subject: [PATCH 35/76] Fix the leaked tracing context for the "compute_monitor:run". (#11791) Removes the leaked tracing context for the "compute_monitor:run" log, which either inherited the "start_compute" span or also the HTTP request context. ## Problem The problem is that the context of the monitor's trace is unnecessarily populated with the span data inherited from previously within the same thread. ## Summary of changes The context is completely reset by moving the span from the thread spawning the monitor into the thread where the monitor will actually start working. Addresses https://github.com/neondatabase/cloud/issues/28145 ## Examples ### Before ``` 2025-04-30T16:39:05.840298Z INFO start_compute:compute_monitor:run: compute is not running, waiting before monitoring activity ``` ### After ``` 2025-04-30T16:39:05.840298Z INFO compute_monitor:run: compute is not running, waiting before monitoring activity ``` --- compute_tools/src/monitor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 5a07eec833..3311ee47b3 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -424,10 +424,10 @@ pub fn launch_monitor(compute: &Arc) -> thread::JoinHandle<()> { experimental, }; - let span = span!(Level::INFO, "compute_monitor"); thread::Builder::new() .name("compute-monitor".into()) .spawn(move || { + let span = span!(Level::INFO, "compute_monitor"); let _enter = span.enter(); monitor.run(); }) From f999632327f108fc497d4c3f467cdf4349f4027a Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Thu, 1 May 2025 11:22:01 -0400 Subject: [PATCH 36/76] Adding `anon` v2 support to the dockerfile (#11313) ## Problem Removed `anon` v1 support as described here: https://github.com/neondatabase/cloud/issues/22663 Adding `anon` v2 support to re-introduce the `pg_anon` extension. 
Related Issues: https://github.com/neondatabase/cloud/issues/20456 ## Summary of changes Adding `anon` v2 support by building it in the dockerfile --- compute/compute-node.Dockerfile | 51 +++++++++++++ compute/patches/anon_v2.patch | 129 ++++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 compute/patches/anon_v2.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 267940a405..cc338cec6a 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1096,6 +1096,23 @@ RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root +######################################################################################### +# +# Layer "rust extensions pgrx14" +# +# Version 14 is now required by a few +# This layer should be used as a base for new pgrx extensions, +# and eventually get merged with `rust-extensions-build` +# +######################################################################################### +FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14 +ARG PG_VERSION + +RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ + /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + +USER root + ######################################################################################### # # Layers "pg-onnx-build" and "pgrag-build" @@ -1330,6 +1347,39 @@ COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ WORKDIR /ext-src/pg_session_jwt-src RUN cargo pgrx install --release +######################################################################################### +# +# Layer "pg-anon-pg-build" +# compile anon extension +# +######################################################################################### +FROM pg-build AS pg_anon-src +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +WORKDIR /ext-src +COPY compute/patches/anon_v2.patch . + +# This is an experimental extension, never got to real production. +# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. +ENV PATH="/usr/local/pgsql/bin/:$PATH" +RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/latest/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \ + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . 
&& \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "=0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + patch -p1 < /ext-src/anon_v2.patch + +FROM rust-extensions-build-pgrx14 AS pg-anon-pg-build +ARG PG_VERSION +COPY --from=pg_anon-src /ext-src/ /ext-src/ +WORKDIR /ext-src +RUN cd pg_anon-src && \ + make -j $(getconf _NPROCESSORS_ONLN) extension PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \ + chmod -R a+r ../pg_anon-src && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; + +######################################################################################## + ######################################################################################### # # Layer "wal2json-build" @@ -1626,6 +1676,7 @@ COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ diff --git a/compute/patches/anon_v2.patch b/compute/patches/anon_v2.patch new file mode 100644 index 0000000000..e833a6dfd3 --- /dev/null +++ b/compute/patches/anon_v2.patch @@ -0,0 +1,129 @@ +diff --git a/sql/anon.sql b/sql/anon.sql +index 0cdc769..f6cc950 100644 +--- a/sql/anon.sql ++++ b/sql/anon.sql +@@ -1141,3 +1141,8 @@ $$ + -- TODO : https://en.wikipedia.org/wiki/L-diversity + + -- TODO : https://en.wikipedia.org/wiki/T-closeness ++ ++-- NEON Patches ++ ++GRANT ALL ON SCHEMA anon to neon_superuser; ++GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser; +diff --git a/sql/init.sql b/sql/init.sql +index 7da6553..9b6164b 100644 +--- a/sql/init.sql ++++ b/sql/init.sql +@@ -74,50 +74,49 @@ $$ + + SECURITY LABEL FOR anon ON FUNCTION anon.load_csv IS 'UNTRUSTED'; + +--- load fake data from a given path +-CREATE OR REPLACE FUNCTION anon.init( +- datapath TEXT +-) ++CREATE OR REPLACE FUNCTION anon.load_fake_data() + RETURNS BOOLEAN + AS $$ + DECLARE +- datapath_check TEXT; + success BOOLEAN; ++ sharedir TEXT; ++ datapath TEXT; + BEGIN + +- IF anon.is_initialized() THEN +- RAISE NOTICE 'The anon extension is already initialized.'; +- RETURN TRUE; +- END IF; ++ datapath := '/extension/anon/'; ++ -- find the local extension directory ++ SELECT setting INTO sharedir ++ FROM pg_catalog.pg_config ++ WHERE name = 'SHAREDIR'; + + SELECT bool_or(results) INTO success + FROM unnest(array[ +- anon.load_csv('anon.identifiers_category',datapath||'/identifiers_category.csv'), +- anon.load_csv('anon.identifier',datapath ||'/identifier.csv'), +- anon.load_csv('anon.address',datapath ||'/address.csv'), +- anon.load_csv('anon.city',datapath ||'/city.csv'), +- anon.load_csv('anon.company',datapath ||'/company.csv'), +- anon.load_csv('anon.country',datapath ||'/country.csv'), +- anon.load_csv('anon.email', datapath ||'/email.csv'), +- anon.load_csv('anon.first_name',datapath ||'/first_name.csv'), +- anon.load_csv('anon.iban',datapath ||'/iban.csv'), +- anon.load_csv('anon.last_name',datapath 
||'/last_name.csv'), +- anon.load_csv('anon.postcode',datapath ||'/postcode.csv'), +- anon.load_csv('anon.siret',datapath ||'/siret.csv'), +- anon.load_csv('anon.lorem_ipsum',datapath ||'/lorem_ipsum.csv') ++ anon.load_csv('anon.identifiers_category',sharedir || datapath || '/identifiers_category.csv'), ++ anon.load_csv('anon.identifier',sharedir || datapath || '/identifier.csv'), ++ anon.load_csv('anon.address',sharedir || datapath || '/address.csv'), ++ anon.load_csv('anon.city',sharedir || datapath || '/city.csv'), ++ anon.load_csv('anon.company',sharedir || datapath || '/company.csv'), ++ anon.load_csv('anon.country',sharedir || datapath || '/country.csv'), ++ anon.load_csv('anon.email', sharedir || datapath || '/email.csv'), ++ anon.load_csv('anon.first_name',sharedir || datapath || '/first_name.csv'), ++ anon.load_csv('anon.iban',sharedir || datapath || '/iban.csv'), ++ anon.load_csv('anon.last_name',sharedir || datapath || '/last_name.csv'), ++ anon.load_csv('anon.postcode',sharedir || datapath || '/postcode.csv'), ++ anon.load_csv('anon.siret',sharedir || datapath || '/siret.csv'), ++ anon.load_csv('anon.lorem_ipsum',sharedir || datapath || '/lorem_ipsum.csv') + ]) results; + RETURN success; +- + END; + $$ +- LANGUAGE PLPGSQL ++ LANGUAGE plpgsql + VOLATILE + RETURNS NULL ON NULL INPUT +- PARALLEL UNSAFE -- because load_csv is unsafe +- SECURITY INVOKER ++ PARALLEL UNSAFE -- because of the EXCEPTION ++ SECURITY DEFINER + SET search_path='' + ; +-SECURITY LABEL FOR anon ON FUNCTION anon.init(TEXT) IS 'UNTRUSTED'; ++ ++SECURITY LABEL FOR anon ON FUNCTION anon.load_fake_data IS 'UNTRUSTED'; + + -- People tend to forget the anon.init() step + -- This is a friendly notice for them +@@ -144,7 +143,7 @@ SECURITY LABEL FOR anon ON FUNCTION anon.notice_if_not_init IS 'UNTRUSTED'; + CREATE OR REPLACE FUNCTION anon.load(TEXT) + RETURNS BOOLEAN AS + $$ +- SELECT anon.init($1); ++ SELECT anon.init(); + $$ + LANGUAGE SQL + VOLATILE +@@ -159,16 +158,16 @@ SECURITY LABEL FOR anon ON FUNCTION anon.load(TEXT) IS 'UNTRUSTED'; + CREATE OR REPLACE FUNCTION anon.init() + RETURNS BOOLEAN + AS $$ +- WITH conf AS ( +- -- find the local extension directory +- SELECT setting AS sharedir +- FROM pg_catalog.pg_config +- WHERE name = 'SHAREDIR' +- ) +- SELECT anon.init(conf.sharedir || '/extension/anon/') +- FROM conf; ++BEGIN ++ IF anon.is_initialized() THEN ++ RAISE NOTICE 'The anon extension is already initialized.'; ++ RETURN TRUE; ++ END IF; ++ ++ RETURN anon.load_fake_data(); ++END; + $$ +- LANGUAGE SQL ++ LANGUAGE plpgsql + VOLATILE + PARALLEL UNSAFE -- because init is unsafe + SECURITY INVOKER From 16d594b7b37733fd56d3cc4767b119f1dd391fb7 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 1 May 2025 16:56:43 +0100 Subject: [PATCH 37/76] pagectl: list layers for given key in decreasing LSN order (#11799) Adds an extra key CLI arg to `pagectl layer list-layer`. When provided, only layers with key ranges containing the key will be listed in decreasing LSN order (indices are preserved for `dump-layer`). 
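For illustration, a possible invocation of the extended subcommand (the data directory path, tenant/timeline IDs, and key value below are placeholders, not taken from this change; the key is the new optional positional argument after the existing ones):

```shell
# List only the layers whose key range contains the given key,
# printed in decreasing LSN order; indices stay consistent with `dump-layer`.
pagectl layer list-layer /path/to/pageserver_workdir <TENANT_ID> <TIMELINE_ID> <KEY_IN_HEX>

# Without the key argument the behaviour is unchanged: all layers are listed.
pagectl layer list-layer /path/to/pageserver_workdir <TENANT_ID> <TIMELINE_ID>
```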
--- pageserver/ctl/src/layers.rs | 37 +++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 293c01eff0..79f56a5a51 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -10,6 +10,7 @@ use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer, delta_layer, ima use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; +use pageserver_api::key::Key; use utils::id::{TenantId, TimelineId}; use crate::layer_map_analyzer::parse_filename; @@ -27,6 +28,7 @@ pub(crate) enum LayerCmd { path: PathBuf, tenant: String, timeline: String, + key: Option, }, /// Dump all information of a layer file DumpLayer { @@ -100,6 +102,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { path, tenant, timeline, + key, } => { let timeline_path = path .join(TENANTS_SEGMENT_NAME) @@ -107,21 +110,37 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { .join(TIMELINES_SEGMENT_NAME) .join(timeline); let mut idx = 0; + let mut to_print = Vec::default(); for layer in fs::read_dir(timeline_path)? { let layer = layer?; if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) { - println!( - "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", - idx, - layer_file.key_range.start, - layer_file.key_range.end, - layer_file.lsn_range.start, - layer_file.lsn_range.end, - layer_file.is_delta, - ); + if let Some(key) = key { + if layer_file.key_range.start <= *key && *key < layer_file.key_range.end { + to_print.push((idx, layer_file)); + } + } else { + to_print.push((idx, layer_file)); + } idx += 1; } } + + if key.is_some() { + to_print + .sort_by_key(|(_idx, layer_file)| std::cmp::Reverse(layer_file.lsn_range.end)); + } + + for (idx, layer_file) in to_print { + println!( + "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", + idx, + layer_file.key_range.start, + layer_file.key_range.end, + layer_file.lsn_range.start, + layer_file.lsn_range.end, + layer_file.is_delta, + ); + } Ok(()) } LayerCmd::DumpLayer { From ae2c3ac12ff1b21f92a0d81b8988ff255c3dd8cf Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 1 May 2025 13:51:10 -0400 Subject: [PATCH 38/76] test: revert relsizev2 config (#11759) ## Problem part of https://github.com/neondatabase/neon/issues/9516 One thing I realized in the past few months is that "no-way-back" things like this are scary to roll out without a fine-grained rollout infra. The plan was to flip the flag in the repo and roll it out soon, but I don't think rolling out would happen in the near future. So I'd rather revert the flag to avoid creating a discrepancy between staging and the regress tests. ## Summary of changes Not using rel_size_v2 by default in unit tests; we still have a few tests to explicitly test the new format so we still get some test coverages. 
--------- Signed-off-by: Alex Chi Z --- test_runner/fixtures/neon_fixtures.py | 3 ++- test_runner/regress/test_attach_tenant_config.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b93df4ede4..47d1228c61 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1279,7 +1279,8 @@ class NeonEnv: ) tenant_config = ps_cfg.setdefault("tenant_config", {}) - tenant_config["rel_size_v2_enabled"] = True # Enable relsize_v2 by default in tests + # This feature is pending rollout. + # tenant_config["rel_size_v2_enabled"] = True if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index ee408e3c65..3616467c00 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -186,7 +186,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "type": "interpreted", "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, }, - "rel_size_v2_enabled": False, # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it + "rel_size_v2_enabled": True, "gc_compaction_enabled": True, "gc_compaction_verification": False, "gc_compaction_initial_threshold_kb": 1024000, From bbc35e10b877f7bbb260595c90375f4e5a3bfbdb Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 1 May 2025 14:36:26 -0400 Subject: [PATCH 39/76] fix(test): increase timeouts for some tests (#11781) ## Problem Those tests are timing out more frequently after https://github.com/neondatabase/neon/pull/11585 ## Summary of changes Increase timeout for `test_pageserver_gc_compaction_smoke` Increase rollback wait timeout for `test_tx_abort_with_many_relations` Signed-off-by: Alex Chi Z --- test_runner/regress/test_compaction.py | 2 +- test_runner/regress/test_pg_regress.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 53edf9f79e..0dfc665a1d 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -229,7 +229,7 @@ def test_pageserver_gc_compaction_preempt( @skip_in_debug_build("only run with release build") -@pytest.mark.timeout(600) # This test is slow with sanitizers enabled, especially on ARM +@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM @pytest.mark.parametrize( "with_branches", ["with_branches", "no_branches"], diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 0fea706888..474002353b 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -471,7 +471,7 @@ def test_tx_abort_with_many_relations( try: # Rollback phase should be fast: this is one WAL record that we should process efficiently fut = exec.submit(rollback_and_wait) - fut.result(timeout=15) + fut.result(timeout=15 if reldir_type == "v1" else 30) except: exec.shutdown(wait=False, cancel_futures=True) raise From 22290eb7ba14b1bd6efd0f6999f6b292684207a0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 2 May 2025 13:46:21 +0100 Subject: [PATCH 40/76] CI: notify relevant team about release deploy failures (#11797) ## 
Problem We notify only Storage team about failed deploys, but Compute and Proxy teams can also benefit from that ## Summary of changes - Adjust `notify-storage-release-deploy-failure` to notify the relevant team about failed deploy --- .github/actionlint.yml | 5 ++++ .github/workflows/build_and_test.yml | 37 +++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 1d1b50e458..b7e0be761a 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -33,9 +33,14 @@ config-variables: - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_CICD_CHANNEL_ID + - SLACK_COMPUTE_CHANNEL_ID - SLACK_ON_CALL_DEVPROD_STREAM - SLACK_ON_CALL_QA_STAGING_STREAM - SLACK_ON_CALL_STORAGE_STAGING_STREAM + - SLACK_ONCALL_COMPUTE_GROUP + - SLACK_ONCALL_PROXY_GROUP + - SLACK_ONCALL_STORAGE_GROUP + - SLACK_PROXY_CHANNEL_ID - SLACK_RUST_CHANNEL_ID - SLACK_STORAGE_CHANNEL_ID - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 18bec1b461..e0995218f9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1434,10 +1434,10 @@ jobs: ;; esac - notify-storage-release-deploy-failure: - needs: [ deploy ] + notify-release-deploy-failure: + needs: [ meta, deploy ] # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs. - if: github.ref_name == 'release' && needs.deploy.result != 'success' && always() + if: contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) && needs.deploy.result != 'success' && always() runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) @@ -1445,15 +1445,40 @@ jobs: with: egress-policy: audit - - name: Post release-deploy failure to team-storage slack channel + - name: Post release-deploy failure to team slack channel uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + env: + TEAM_ONCALL: >- + ${{ + fromJSON(format('{ + "storage-release": "", + "compute-release": "", + "proxy-release": "" + }', + vars.SLACK_ONCALL_STORAGE_GROUP, + vars.SLACK_ONCALL_COMPUTE_GROUP, + vars.SLACK_ONCALL_PROXY_GROUP + ))[needs.meta.outputs.run-kind] + }} + CHANNEL: >- + ${{ + fromJSON(format('{ + "storage-release": "{0}", + "compute-release": "{1}", + "proxy-release": "{2}" + }', + vars.SLACK_STORAGE_CHANNEL_ID, + vars.SLACK_COMPUTE_CHANNEL_ID, + vars.SLACK_PROXY_CHANNEL_ID + ))[needs.meta.outputs.run-kind] + }} with: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} payload: | - channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }} + channel: ${{ env.CHANNEL }} text: | - 🔴 : deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. + 🔴 ${{ env.TEAM_ONCALL }}: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. 
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: From 79699aebc8ac886afea2ec45320d7129232007cb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 2 May 2025 17:36:10 +0300 Subject: [PATCH 41/76] Reserve in file descriptor pool sockets used for connections to page servers (#11798) ## Problem See https://github.com/neondatabase/neon/issues/11790 The neon extension opens extensions to the pageservers, which consumes file descriptors. Postgres has a mechanism to count how many FDs are in use, but it doesn't know about those FDs. We should call ReserveExternalFD() or AcquireExternalFD() to account for them. ## Summary of changes Call `ReserveExternalFD()` for each shard --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Mikhail Kot --- pgxn/neon/libpagestore.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index e758841beb..ee4e6ccc5b 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -26,6 +26,7 @@ #include "portability/instr_time.h" #include "postmaster/interrupt.h" #include "storage/buf_internals.h" +#include "storage/fd.h" #include "storage/ipc.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" @@ -79,6 +80,7 @@ int neon_protocol_version = 3; static int neon_compute_mode = 0; static int max_reconnect_attempts = 60; static int stripe_size; +static int max_sockets; static int pageserver_response_log_timeout = 10000; /* 2.5 minutes. A bit higher than highest default TCP retransmission timeout */ @@ -336,6 +338,13 @@ load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p) pageserver_disconnect(i); } pagestore_local_counter = end_update_counter; + + /* Reserve file descriptors for sockets */ + while (max_sockets < num_shards) + { + max_sockets += 1; + ReserveExternalFD(); + } } if (num_shards_p) From 4b9087651c2ef0d7888d76e8786066d00381388f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 2 May 2025 22:27:59 +0300 Subject: [PATCH 42/76] Checked that stored LwLSN >= FirstNormalUnloggedLSN (#11750) ## Problem Undo unintended change 60b9fb1baf4cba732cff4792b9a97d755794b7e2 ## Summary of changes Add assert that we are not storing fake LSN in LwLSN. 
--------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/neon_lwlsncache.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/neon_lwlsncache.c b/pgxn/neon/neon_lwlsncache.c index 6959da55cb..a8cfa0f825 100644 --- a/pgxn/neon/neon_lwlsncache.c +++ b/pgxn/neon/neon_lwlsncache.c @@ -4,6 +4,7 @@ #include "miscadmin.h" #include "access/xlog.h" +#include "access/xlog_internal.h" #include "storage/ipc.h" #include "storage/shmem.h" #include "storage/buf_internals.h" @@ -396,9 +397,10 @@ SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn, XLogRecPtr neon_set_lwlsn_block_range(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks) { - if (lsn < FirstNormalUnloggedLSN || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0) + if (lsn == InvalidXLogRecPtr || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0) return lsn; + Assert(lsn >= WalSegMinSize); LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); lsn = SetLastWrittenLSNForBlockRangeInternal(lsn, rlocator, forknum, from, n_blocks); LWLockRelease(LastWrittenLsnLock); @@ -435,7 +437,6 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, NInfoGetRelNumber(relfilenode) == InvalidOid) return InvalidXLogRecPtr; - BufTagInit(key, relNumber, forknum, blockno, spcOid, dbOid); LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); @@ -444,6 +445,10 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, { XLogRecPtr lsn = lsns[i]; + if (lsn == InvalidXLogRecPtr) + continue; + + Assert(lsn >= WalSegMinSize); key.blockNum = blockno + i; entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); if (found) From 6131d86ec972f08febecc7b29a4273e84a408905 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 5 May 2025 12:18:55 +0100 Subject: [PATCH 43/76] proxy: allow invalid SNI (#11792) ## Problem Some PrivateLink customers are unable to use Private DNS. As such they use an invalid domain name to address Neon. We currently are rejecting those connections because we cannot resolve the correct certificate. ## Summary of changes 1. Ensure a certificate is always returned. 2. If there is an SNI field, use endpoint fallback if it doesn't match. I suggest reviewing each commit separately. 
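As a hedged illustration of the intended effect (the hostname and endpoint ID below are made up, and the URL encoding is an assumption, not something this patch prescribes): a PrivateLink customer addressing Neon through a custom domain is no longer rejected for the unrecognized SNI. They can pass the endpoint explicitly via the `options` startup parameter (the `endpoint=...` form exercised by the tests in this patch) and stay at `sslmode=require`, since the default certificate returned for an unknown name cannot satisfy stricter hostname verification:

```shell
# '%3D' is the percent-encoded '=' inside the libpq URI parameter value.
psql "postgresql://john_doe@db.internal.example.corp/neondb?sslmode=require&options=endpoint%3Dep-foo-bar-123456"
```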
--- proxy/src/auth/credentials.rs | 65 ++++----- proxy/src/proxy/handshake.rs | 15 +-- proxy/src/proxy/tests/mod.rs | 3 +- proxy/src/serverless/sql_over_http.rs | 3 +- proxy/src/tls/server_config.rs | 187 ++++++++++++++------------ 5 files changed, 138 insertions(+), 135 deletions(-) diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index c55af325e3..183976374a 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -32,12 +32,6 @@ pub(crate) enum ComputeUserInfoParseError { option: EndpointId, }, - #[error( - "Common name inferred from SNI ('{}') is not known", - .cn, - )] - UnknownCommonName { cn: String }, - #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] MalformedProjectName(EndpointId), } @@ -66,22 +60,15 @@ impl ComputeUserInfoMaybeEndpoint { } } -pub(crate) fn endpoint_sni( - sni: &str, - common_names: &HashSet, -) -> Result, ComputeUserInfoParseError> { - let Some((subdomain, common_name)) = sni.split_once('.') else { - return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() }); - }; +pub(crate) fn endpoint_sni(sni: &str, common_names: &HashSet) -> Option { + let (subdomain, common_name) = sni.split_once('.')?; if !common_names.contains(common_name) { - return Err(ComputeUserInfoParseError::UnknownCommonName { - cn: common_name.into(), - }); + return None; } if subdomain == SERVERLESS_DRIVER_SNI { - return Ok(None); + return None; } - Ok(Some(EndpointId::from(subdomain))) + Some(EndpointId::from(subdomain)) } impl ComputeUserInfoMaybeEndpoint { @@ -113,15 +100,8 @@ impl ComputeUserInfoMaybeEndpoint { }) .map(|name| name.into()); - let endpoint_from_domain = if let Some(sni_str) = sni { - if let Some(cn) = common_names { - endpoint_sni(sni_str, cn)? - } else { - None - } - } else { - None - }; + let endpoint_from_domain = + sni.and_then(|sni_str| common_names.and_then(|cn| endpoint_sni(sni_str, cn))); let endpoint = match (endpoint_option, endpoint_from_domain) { // Invariant: if we have both project name variants, they should match. 
@@ -424,21 +404,34 @@ mod tests { } #[test] - fn parse_inconsistent_sni() { + fn parse_unknown_sni() { let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); let ctx = RequestContext::test(); - let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); - match err { - UnknownCommonName { cn } => { - assert_eq!(cn, "localhost"); - } - _ => panic!("bad error: {err:?}"), - } + let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .unwrap(); + + assert!(info.endpoint_id.is_none()); + } + + #[test] + fn parse_unknown_sni_with_options() { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("options", "endpoint=foo-bar-baz-1234"), + ]); + + let sni = Some("project.localhost"); + let common_names = Some(["example.com".into()].into()); + + let ctx = RequestContext::test(); + let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .unwrap(); + + assert_eq!(info.endpoint_id.as_deref(), Some("foo-bar-baz-1234")); } #[test] diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index c05031ad97..54c02f2c15 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -24,9 +24,6 @@ pub(crate) enum HandshakeError { #[error("protocol violation")] ProtocolViolation, - #[error("missing certificate")] - MissingCertificate, - #[error("{0}")] StreamUpgradeError(#[from] StreamUpgradeError), @@ -42,10 +39,6 @@ impl ReportableError for HandshakeError { match self { HandshakeError::EarlyData => crate::error::ErrorKind::User, HandshakeError::ProtocolViolation => crate::error::ErrorKind::User, - // This error should not happen, but will if we have no default certificate and - // the client sends no SNI extension. - // If they provide SNI then we can be sure there is a certificate that matches. 
- HandshakeError::MissingCertificate => crate::error::ErrorKind::Service, HandshakeError::StreamUpgradeError(upgrade) => match upgrade { StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service, StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect, @@ -146,7 +139,7 @@ pub(crate) async fn handshake( // try parse endpoint let ep = conn_info .server_name() - .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten()); + .and_then(|sni| endpoint_sni(sni, &tls.common_names)); if let Some(ep) = ep { ctx.set_endpoint_id(ep); } @@ -161,10 +154,8 @@ pub(crate) async fn handshake( } } - let (_, tls_server_end_point) = tls - .cert_resolver - .resolve(conn_info.server_name()) - .ok_or(HandshakeError::MissingCertificate)?; + let (_, tls_server_end_point) = + tls.cert_resolver.resolve(conn_info.server_name()); stream = PqStream { framed: Framed { diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 9a6864c33e..f47636cd71 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -98,8 +98,7 @@ fn generate_tls_config<'a>( .with_no_client_auth() .with_single_cert(vec![cert.clone()], key.clone_key())?; - let mut cert_resolver = CertResolver::new(); - cert_resolver.add_cert(key, vec![cert], true)?; + let cert_resolver = CertResolver::new(key, vec![cert])?; let common_names = cert_resolver.get_common_names(); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7fb39553f9..fee5942b7e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -199,8 +199,7 @@ fn get_conn_info( let endpoint = match connection_url.host() { Some(url::Host::Domain(hostname)) => { if let Some(tls) = tls { - endpoint_sni(hostname, &tls.common_names)? - .ok_or(ConnInfoError::MalformedEndpoint)? + endpoint_sni(hostname, &tls.common_names).ok_or(ConnInfoError::MalformedEndpoint)? 
} else { hostname .split_once('.') diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 5a95e69fde..8f8917ef62 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -5,6 +5,7 @@ use anyhow::{Context, bail}; use itertools::Itertools; use rustls::crypto::ring::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; +use rustls::sign::CertifiedKey; use x509_cert::der::{Reader, SliceReader}; use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint}; @@ -25,10 +26,8 @@ pub fn configure_tls( certs_dir: Option<&String>, allow_tls_keylogfile: bool, ) -> anyhow::Result { - let mut cert_resolver = CertResolver::new(); - // add default certificate - cert_resolver.add_cert_path(key_path, cert_path, true)?; + let mut cert_resolver = CertResolver::parse_new(key_path, cert_path)?; // add extra certificates if let Some(certs_dir) = certs_dir { @@ -40,11 +39,8 @@ pub fn configure_tls( let key_path = path.join("tls.key"); let cert_path = path.join("tls.crt"); if key_path.exists() && cert_path.exists() { - cert_resolver.add_cert_path( - &key_path.to_string_lossy(), - &cert_path.to_string_lossy(), - false, - )?; + cert_resolver + .add_cert_path(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?; } } } @@ -83,92 +79,42 @@ pub fn configure_tls( }) } -#[derive(Default, Debug)] +#[derive(Debug)] pub struct CertResolver { certs: HashMap, TlsServerEndPoint)>, - default: Option<(Arc, TlsServerEndPoint)>, + default: (Arc, TlsServerEndPoint), } impl CertResolver { - pub fn new() -> Self { - Self::default() + fn parse_new(key_path: &str, cert_path: &str) -> anyhow::Result { + let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; + Self::new(priv_key, cert_chain) } - fn add_cert_path( - &mut self, - key_path: &str, - cert_path: &str, - is_default: bool, - ) -> anyhow::Result<()> { - let priv_key = { - let key_bytes = std::fs::read(key_path) - .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; - rustls_pemfile::private_key(&mut &key_bytes[..]) - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - }; + pub fn new( + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, + ) -> anyhow::Result { + let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?; - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? 
- }; - - self.add_cert(priv_key, cert_chain, is_default) + let mut certs = HashMap::new(); + let default = (cert.clone(), tls_server_end_point); + certs.insert(common_name, (cert, tls_server_end_point)); + Ok(Self { certs, default }) } - pub fn add_cert( + fn add_cert_path(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> { + let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; + self.add_cert(priv_key, cert_chain) + } + + fn add_cert( &mut self, priv_key: PrivateKeyDer<'static>, cert_chain: Vec>, - is_default: bool, ) -> anyhow::Result<()> { - let key = sign::any_supported_type(&priv_key).context("invalid private key")?; - - let first_cert = &cert_chain[0]; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - - let certificate = SliceReader::new(first_cert) - .context("Failed to parse cerficiate")? - .decode::() - .context("Failed to parse cerficiate")?; - - let common_name = certificate.tbs_certificate.subject.to_string(); - - // We need to get the canonical name for this certificate so we can match them against any domain names - // seen within the proxy codebase. - // - // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. - // We need to remove the wildcard prefix for the purposes of certificate selection. - // - // auth-broker does not use SNI and instead uses the Neon-Connection-String header. - // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. - // - // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string - // validation, so let's we can continue with any common-name - let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=") { - s.to_string() - } else { - bail!("Failed to parse common name from certificate") - }; - - let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); - - if is_default { - self.default = Some((cert.clone(), tls_server_end_point)); - } - + let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?; self.certs.insert(common_name, (cert, tls_server_end_point)); - Ok(()) } @@ -177,12 +123,82 @@ impl CertResolver { } } +fn parse_key_cert( + key_path: &str, + cert_path: &str, +) -> anyhow::Result<(PrivateKeyDer<'static>, Vec>)> { + let priv_key = { + let key_bytes = std::fs::read(key_path) + .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; + rustls_pemfile::private_key(&mut &key_bytes[..]) + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + }; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + format!( + "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + ) + })? 
+ }; + + Ok((priv_key, cert_chain)) +} + +fn process_key_cert( + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, +) -> anyhow::Result<(String, Arc, TlsServerEndPoint)> { + let key = sign::any_supported_type(&priv_key).context("invalid private key")?; + + let first_cert = &cert_chain[0]; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let certificate = SliceReader::new(first_cert) + .context("Failed to parse cerficiate")? + .decode::() + .context("Failed to parse cerficiate")?; + + let common_name = certificate.tbs_certificate.subject.to_string(); + + // We need to get the canonical name for this certificate so we can match them against any domain names + // seen within the proxy codebase. + // + // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. + // We need to remove the wildcard prefix for the purposes of certificate selection. + // + // auth-broker does not use SNI and instead uses the Neon-Connection-String header. + // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. + // + // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string + // validation, so let's we can continue with any common-name + let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=") { + s.to_string() + } else { + bail!("Failed to parse common name from certificate") + }; + + let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); + + Ok((common_name, cert, tls_server_end_point)) +} + impl rustls::server::ResolvesServerCert for CertResolver { fn resolve( &self, client_hello: rustls::server::ClientHello<'_>, ) -> Option> { - self.resolve(client_hello.server_name()).map(|x| x.0) + Some(self.resolve(client_hello.server_name()).0) } } @@ -190,7 +206,7 @@ impl CertResolver { pub fn resolve( &self, server_name: Option<&str>, - ) -> Option<(Arc, TlsServerEndPoint)> { + ) -> (Arc, TlsServerEndPoint) { // loop here and cut off more and more subdomains until we find // a match to get a proper wildcard support. OTOH, we now do not // use nested domains, so keep this simple for now. @@ -200,12 +216,17 @@ impl CertResolver { if let Some(mut sni_name) = server_name { loop { if let Some(cert) = self.certs.get(sni_name) { - return Some(cert.clone()); + return cert.clone(); } if let Some((_, rest)) = sni_name.split_once('.') { sni_name = rest; } else { - return None; + // The customer has some custom DNS mapping - just return + // a default certificate. + // + // This will error if the customer uses anything stronger + // than sslmode=require. That's a choice they can make. + return self.default.clone(); } } } else { From 0b243242df493cf3a60de6d0ec1a0d1491c3cd4c Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 5 May 2025 20:15:22 +0800 Subject: [PATCH 44/76] fix(test): allow flush error in gc-compaction tests (#11822) ## Problem Part of https://github.com/neondatabase/neon/issues/11762 ## Summary of changes While #11762 needs some work to refactor the error propagating thing, we can do a hacky fix for the gc-compaction tests to allow flush error during shutdown. It does not affect correctness. 
Signed-off-by: Alex Chi Z --- test_runner/regress/test_compaction.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 0dfc665a1d..370f57b19d 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -202,6 +202,8 @@ def test_pageserver_gc_compaction_preempt( env = neon_env_builder.init_start(initial_tenant_conf=conf) env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*") + env.pageserver.allowed_errors.append(".*flush task cancelled.*") + env.pageserver.allowed_errors.append(".*failed to pipe.*") tenant_id = env.initial_tenant timeline_id = env.initial_timeline From baf425a2cde7a92873b548115847445f3a43f6b0 Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 5 May 2025 15:06:37 +0200 Subject: [PATCH 45/76] [pageserver/virtual_file] impr: Improve OpenOptions API ergonomics (#11789) # Improve OpenOptions API ergonomics Closes #11787 This PR improves the OpenOptions API ergonomics by: 1. Making OpenOptions methods take and return owned Self instead of &mut self 2. Changing VirtualFile::open_with_options_v2 to take an owned OpenOptions 3. Removing unnecessary .clone() and .to_owned() calls These changes make the API more idiomatic Rust by leveraging the builder pattern with owned values, which is cleaner and more ergonomic than the previous approach. Link to Devin run: https://app.devin.ai/sessions/c2a4b24f7aca40a3b3777f4259bf8ee1 Requested by: christian@neon.tech --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: christian@neon.tech --- pageserver/src/virtual_file.rs | 42 ++++++++------------- pageserver/src/virtual_file/open_options.rs | 17 ++++----- 2 files changed, 23 insertions(+), 36 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 58953407b1..f429e59ef3 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -14,8 +14,6 @@ use std::fs::File; use std::io::{Error, ErrorKind}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -#[cfg(target_os = "linux")] -use std::os::unix::fs::OpenOptionsExt; use std::sync::LazyLock; use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; @@ -99,7 +97,7 @@ impl VirtualFile { pub async fn open_with_options_v2>( path: P, - open_options: &OpenOptions, + #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut open_options: OpenOptions, ctx: &RequestContext, ) -> Result { let mode = get_io_mode(); @@ -112,21 +110,16 @@ impl VirtualFile { #[cfg(target_os = "linux")] (IoMode::DirectRw, _) => true, }; - let open_options = open_options.clone(); - let open_options = if set_o_direct { + if set_o_direct { #[cfg(target_os = "linux")] { - let mut open_options = open_options; - open_options.custom_flags(nix::libc::O_DIRECT); - open_options + open_options = open_options.custom_flags(nix::libc::O_DIRECT); } #[cfg(not(target_os = "linux"))] unreachable!( "O_DIRECT is not supported on this platform, IoMode's that result in set_o_direct=true shouldn't even be defined" ); - } else { - open_options - }; + } let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; Ok(VirtualFile { inner, _mode: mode }) } @@ -530,7 +523,7 @@ impl VirtualFileInner { path: P, ctx: &RequestContext, ) -> Result { - Self::open_with_options(path.as_ref(), 
OpenOptions::new().read(true).clone(), ctx).await + Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await } /// Open a file with given options. @@ -558,10 +551,11 @@ impl VirtualFileInner { // It would perhaps be nicer to check just for the read and write flags // explicitly, but OpenOptions doesn't contain any functions to read flags, // only to set them. - let mut reopen_options = open_options.clone(); - reopen_options.create(false); - reopen_options.create_new(false); - reopen_options.truncate(false); + let reopen_options = open_options + .clone() + .create(false) + .create_new(false) + .truncate(false); let vfile = VirtualFileInner { handle: RwLock::new(handle), @@ -1307,7 +1301,7 @@ mod tests { opts: OpenOptions, ctx: &RequestContext, ) -> Result { - let vf = VirtualFile::open_with_options_v2(&path, &opts, ctx).await?; + let vf = VirtualFile::open_with_options_v2(&path, opts, ctx).await?; Ok(MaybeVirtualFile::VirtualFile(vf)) } } @@ -1374,7 +1368,7 @@ mod tests { let _ = file_a.read_string_at(0, 1, &ctx).await.unwrap_err(); // Close the file and re-open for reading - let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?; + let mut file_a = A::open(path_a, OpenOptions::new().read(true), &ctx).await?; // cannot write to a file opened in read-only mode let _ = file_a @@ -1393,8 +1387,7 @@ mod tests { .read(true) .write(true) .create(true) - .truncate(true) - .to_owned(), + .truncate(true), &ctx, ) .await?; @@ -1412,12 +1405,7 @@ mod tests { let mut vfiles = Vec::new(); for _ in 0..100 { - let mut vfile = A::open( - path_b.clone(), - OpenOptions::new().read(true).to_owned(), - &ctx, - ) - .await?; + let mut vfile = A::open(path_b.clone(), OpenOptions::new().read(true), &ctx).await?; assert_eq!("FOOBAR", vfile.read_string_at(0, 6, &ctx).await?); vfiles.push(vfile); } @@ -1466,7 +1454,7 @@ mod tests { for _ in 0..VIRTUAL_FILES { let f = VirtualFileInner::open_with_options( &test_file_path, - OpenOptions::new().read(true).clone(), + OpenOptions::new().read(true), &ctx, ) .await?; diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 7d323f3d8f..2a7bb693f2 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -1,6 +1,7 @@ //! 
Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; use std::os::fd::OwnedFd; +use std::os::unix::fs::OpenOptionsExt; use std::path::Path; use super::io_engine::IoEngine; @@ -43,7 +44,7 @@ impl OpenOptions { self.write } - pub fn read(&mut self, read: bool) -> &mut OpenOptions { + pub fn read(mut self, read: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.read(read); @@ -56,7 +57,7 @@ impl OpenOptions { self } - pub fn write(&mut self, write: bool) -> &mut OpenOptions { + pub fn write(mut self, write: bool) -> Self { self.write = write; match &mut self.inner { Inner::StdFs(x) => { @@ -70,7 +71,7 @@ impl OpenOptions { self } - pub fn create(&mut self, create: bool) -> &mut OpenOptions { + pub fn create(mut self, create: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.create(create); @@ -83,7 +84,7 @@ impl OpenOptions { self } - pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions { + pub fn create_new(mut self, create_new: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.create_new(create_new); @@ -96,7 +97,7 @@ impl OpenOptions { self } - pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions { + pub fn truncate(mut self, truncate: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.truncate(truncate); @@ -124,10 +125,8 @@ impl OpenOptions { } } } -} -impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { - fn mode(&mut self, mode: u32) -> &mut OpenOptions { + pub fn mode(mut self, mode: u32) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.mode(mode); @@ -140,7 +139,7 @@ impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { self } - fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions { + pub fn custom_flags(mut self, flags: i32) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.custom_flags(flags); From cb67f9a6517652d21917a77f61b3e466f992d901 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 5 May 2025 16:30:13 +0200 Subject: [PATCH 46/76] delete orphan left over projects (#11826) ## Problem sometimes our benchmarking GitHub workflow is terminated by side-effects beyond our control (e.g. 
GitHub runner looses connection to server) and then we have left-over Neon projects created during the workflow [Example where GitHub runner lost connection and project was not deleted](https://github.com/neondatabase/neon/actions/runs/14017400543/job/39244816485) Fixes https://github.com/neondatabase/cloud/issues/28546 ## Summary of changes - Add a cleanup step that cleans up left-over projects - also give each project created during workflows a name that references the testcase and GitHub runid ## Example run (test of new job steps) https://github.com/neondatabase/neon/actions/runs/14837092399/job/41650741922#step:6:63 --------- Co-authored-by: a-masterov <72613290+a-masterov@users.noreply.github.com> --- .github/workflows/benchmarking.yml | 71 +++++++++++++++++++ .../test_cumulative_statistics_persistence.py | 6 +- .../performance/test_physical_replication.py | 8 ++- 3 files changed, 82 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 5107f457e2..220d7905b1 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -53,6 +53,77 @@ concurrency: cancel-in-progress: true jobs: + cleanup: + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: ghcr.io/neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --init + env: + ORG_ID: org-solitary-dew-09443886 + LIMIT: 100 + SEARCH: "Created by actions/neon-project-create; GITHUB_RUN_ID" + BASE_URL: https://console-stage.neon.build/api/v2 + DRY_RUN: "false" # Set to "true" to just test out the workflow + + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Cleanup inactive Neon projects left over from prior runs + env: + API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + run: | + set -euo pipefail + + NOW=$(date -u +%s) + DAYS_AGO=$((NOW - 5 * 86400)) + + REQUEST_URL="$BASE_URL/projects?limit=$LIMIT&search=$(printf '%s' "$SEARCH" | jq -sRr @uri)&org_id=$ORG_ID" + + echo "Requesting project list from:" + echo "$REQUEST_URL" + + response=$(curl -s -X GET "$REQUEST_URL" \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" ) + + echo "Response:" + echo "$response" | jq . + + projects_to_delete=$(echo "$response" | jq --argjson cutoff "$DAYS_AGO" ' + .projects[] + | select(.compute_last_active_at != null) + | select((.compute_last_active_at | fromdateiso8601) < $cutoff) + | {id, name, compute_last_active_at} + ') + + if [ -z "$projects_to_delete" ]; then + echo "No projects eligible for deletion." + exit 0 + fi + + echo "Projects that will be deleted:" + echo "$projects_to_delete" | jq -r '.id' + + if [ "$DRY_RUN" = "false" ]; then + echo "$projects_to_delete" | jq -r '.id' | while read -r project_id; do + echo "Deleting project: $project_id" + curl -s -X DELETE "$BASE_URL/projects/$project_id" \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" + done + else + echo "Dry run enabled — no projects were deleted." 
+ fi bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} permissions: diff --git a/test_runner/performance/test_cumulative_statistics_persistence.py b/test_runner/performance/test_cumulative_statistics_persistence.py index 061467bbad..5e9e55cb0f 100644 --- a/test_runner/performance/test_cumulative_statistics_persistence.py +++ b/test_runner/performance/test_cumulative_statistics_persistence.py @@ -1,4 +1,5 @@ import math # Add this import +import os import time import traceback from pathlib import Path @@ -87,7 +88,10 @@ def test_cumulative_statistics_persistence( - insert additional tuples that by itself are not enough to trigger auto-vacuum but in combination with the previous tuples are - verify that autovacuum is triggered by the combination of tuples inserted before and after endpoint suspension """ - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, + f"Test cumulative statistics persistence, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}", + ) project_id = project["project"]["id"] neon_api.wait_for_operation_to_finish(project_id) endpoint_id = project["endpoints"][0]["id"] diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index bdafa2d657..c580bfcc14 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -62,7 +62,9 @@ def test_ro_replica_lag( pgbench_duration = f"-T{test_duration_min * 60 * 2}" - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, f"Test readonly replica lag, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" + ) project_id = project["project"]["id"] log.info("Project ID: %s", project_id) log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"]) @@ -195,7 +197,9 @@ def test_replication_start_stop( pgbench_duration = f"-T{2**num_replicas * configuration_test_time_sec}" error_occurred = False - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, f"Test replication start stop, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" + ) project_id = project["project"]["id"] log.info("Project ID: %s", project_id) log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"]) From 16ca74a3f41789a1136e1256a8b9e5ec59b33417 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 6 May 2025 09:49:23 +0300 Subject: [PATCH 47/76] Add SAFETY comment on libc::sysconf() call (#11581) I got an 'undocumented_unsafe_blocks' clippy warning about it. Not sure why I got the warning now and not before, but in any case a comment is a good idea. --- libs/metrics/src/more_process_metrics.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/metrics/src/more_process_metrics.rs b/libs/metrics/src/more_process_metrics.rs index 13a745e031..f91800685f 100644 --- a/libs/metrics/src/more_process_metrics.rs +++ b/libs/metrics/src/more_process_metrics.rs @@ -16,6 +16,7 @@ pub struct Collector { const NMETRICS: usize = 2; static CLK_TCK_F64: Lazy = Lazy::new(|| { + // SAFETY: libc::sysconf is safe, it merely returns a value. 
let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) }; if long == -1 { panic!("sysconf(_SC_CLK_TCK) failed"); From c6ff18affccc73372078c393163f06a88b871820 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Tue, 6 May 2025 10:51:51 +0400 Subject: [PATCH 48/76] cosmetics(pgxn/neon): WP code small clean up (#11824) ## Problem Some small cosmetic changes I made while reading the code. Should not affect anything. ## Summary of changes - Remove `n_votes` field because it's not used anymore - Explicitly initialize `safekeepers_generation` with `INVALID_GENERATION` if the generation is not present (the struct is zero-initialized anyway, but the explicit initialization is better IMHO) - Access SafekeeperId via pointer `sk_id` created above --- pgxn/neon/neon_walreader.c | 2 +- pgxn/neon/walproposer.c | 6 +++--- pgxn/neon/walproposer.h | 3 --- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index be2c4ddf79..d5e3a38dbb 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -150,7 +150,7 @@ NeonWALReaderFree(NeonWALReader *state) * fetched from timeline 'tli'. * * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error - * occurs, in which case 'err' has the desciption. Error always closes remote + * occurs, in which case 'err' has the description. Error always closes remote * connection, if there was any, so socket subscription should be removed. * * NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index f4f1398375..3befb42030 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -124,6 +124,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } else { + wp->safekeepers_generation = INVALID_GENERATION; host = wp->config->safekeepers_list; } wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation); @@ -756,7 +757,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) { SafekeeperId *sk_id = &wp->mconf.members.m[i]; - if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId) + if (sk_id->node_id == sk->greetResponse.nodeId) { /* * If mconf or list of safekeepers to connect to changed (the @@ -781,7 +782,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) { SafekeeperId *sk_id = &wp->mconf.new_members.m[i]; - if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId) + if (sk_id->node_id == sk->greetResponse.nodeId) { if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk) { @@ -1071,7 +1072,6 @@ RecvVoteResponse(Safekeeper *sk) /* ready for elected message */ sk->state = SS_WAIT_ELECTED; - wp->n_votes++; /* Are we already elected? 
*/ if (wp->state == WPS_CAMPAIGN) { diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 648b0015ad..83ef72d3d7 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -845,9 +845,6 @@ typedef struct WalProposer /* timeline globally starts at this LSN */ XLogRecPtr timelineStartLsn; - /* number of votes collected from safekeepers */ - int n_votes; - /* number of successful connections over the lifetime of walproposer */ int n_connected; From f0e7b3e0efc964539c9561b5976922e9c1cccbcb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 6 May 2025 10:24:27 +0300 Subject: [PATCH 49/76] Use unlogged build for gist_indexsortbuild_flush_ready_pages (#11753) ## Problem See https://github.com/neondatabase/neon/issues/11718 GIST index can be constructed in two ways: GIST_SORTED_BUILD and GIST_BUFFERING. We used unlogged build in the second case but not in the first. ## Summary of changes Use unlogged build in `gist_indexsortbuild_flush_ready_pages` Correspondent Postgres PRsL: https://github.com/neondatabase/postgres/pull/624 https://github.com/neondatabase/postgres/pull/625 https://github.com/neondatabase/postgres/pull/626 --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- pgxn/neon/pagestore_smgr.c | 6 ++++++ test_runner/regress/test_gist.py | 28 ++++++++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 6 files changed, 40 insertions(+), 6 deletions(-) create mode 100644 test_runner/regress/test_gist.py diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 3bf0bedf99..87eb420717 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1989,8 +1989,14 @@ neon_start_unlogged_build(SMgrRelation reln) neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } +#if PG_MAJORVERSION_NUM >= 17 + /* + * We have to disable this check for pg14-16 because sorted build of GIST index requires + * to perform unlogged build several times + */ if (smgrnblocks(reln, MAIN_FORKNUM) != 0) neon_log(ERROR, "cannot perform unlogged index build, index is not empty "); +#endif unlogged_build_rel = reln; unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; diff --git a/test_runner/regress/test_gist.py b/test_runner/regress/test_gist.py new file mode 100644 index 0000000000..89e3b9b2b1 --- /dev/null +++ b/test_runner/regress/test_gist.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +# +# Test unlogged build for GIST index +# +def test_gist(neon_simple_env: NeonEnv): + env = neon_simple_env + endpoint = env.endpoints.create_start("main") + con = endpoint.connect() + cur = con.cursor() + iterations = 100 + + for _ in range(iterations): + cur.execute( + "CREATE TABLE pvactst (i INT, a INT[], p POINT) with (autovacuum_enabled = off)" + ) + cur.execute( + "INSERT INTO pvactst SELECT i, array[1,2,3], point(i, i+1) FROM generate_series(1,1000) i" + ) + cur.execute("CREATE INDEX gist_pvactst ON pvactst USING gist (p)") + cur.execute("VACUUM pvactst") + cur.execute("DROP TABLE pvactst") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index d3c9d61fb7..c8dab02bfc 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit d3c9d61fb7a362a165dac7060819dd9d6ad68c28 +Subproject commit c8dab02bfc003ae7bd59096919042d7840f3c194 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 
index 8ecb12f21d..b838c8969b 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3 +Subproject commit b838c8969b7c63f3e637a769656f5f36793b797c diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 37496f87b5..05ddf212e2 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 37496f87b5324af53c56127e278ee5b1e8435253 +Subproject commit 05ddf212e2e07b788b5c8b88bdcf98630941f6ae diff --git a/vendor/revisions.json b/vendor/revisions.json index 90d878d0f7..74a6ff33d7 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -5,14 +5,14 @@ ], "v16": [ "16.8", - "37496f87b5324af53c56127e278ee5b1e8435253" + "05ddf212e2e07b788b5c8b88bdcf98630941f6ae" ], "v15": [ "15.12", - "8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3" + "b838c8969b7c63f3e637a769656f5f36793b797c" ], "v14": [ "14.17", - "d3c9d61fb7a362a165dac7060819dd9d6ad68c28" + "c8dab02bfc003ae7bd59096919042d7840f3c194" ] } From 62ac5b94b3842322f267df28ff90ad3c9d4060a7 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 6 May 2025 09:28:25 +0000 Subject: [PATCH 50/76] proxy: Include the exp/nbf timestamps in the errors (#11828) ## Problem It's difficult to tell when the JWT expired from current logs and error messages. ## Summary of changes Add exp/nbf timestamps to the respective error variants. Also use checked_add when deserializing a SystemTime from JWT. Related to INC-509 --- proxy/src/auth/backend/jwt.rs | 41 ++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 44a6a42665..a48f67199a 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -409,14 +409,22 @@ impl JwkCacheEntryLock { if let Some(exp) = payload.expiration { if now >= exp + CLOCK_SKEW_LEEWAY { - return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired)); + return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired( + exp.duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ))); } } if let Some(nbf) = payload.not_before { if nbf >= now + CLOCK_SKEW_LEEWAY { return Err(JwtError::InvalidClaims( - JwtClaimsError::JwtTokenNotYetReadyToUse, + JwtClaimsError::JwtTokenNotYetReadyToUse( + nbf.duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ), )); } } @@ -534,10 +542,10 @@ struct JwtPayload<'a> { #[serde(rename = "aud", default)] audience: OneOrMany, /// Expiration - Time after which the JWT expires - #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)] + #[serde(rename = "exp", deserialize_with = "numeric_date_opt", default)] expiration: Option, - /// Not before - Time after which the JWT expires - #[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)] + /// Not before - Time before which the JWT is not valid + #[serde(rename = "nbf", deserialize_with = "numeric_date_opt", default)] not_before: Option, // the following entries are only extracted for the sake of debug logging. @@ -609,8 +617,15 @@ impl<'de> Deserialize<'de> for OneOrMany { } fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { - let d = >::deserialize(d)?; - Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n))) + >::deserialize(d)? 
+ .map(|t| { + SystemTime::UNIX_EPOCH + .checked_add(Duration::from_secs(t)) + .ok_or_else(|| { + serde::de::Error::custom(format_args!("timestamp out of bounds: {t}")) + }) + }) + .transpose() } struct JwkRenewalPermit<'a> { @@ -746,11 +761,11 @@ pub enum JwtClaimsError { #[error("invalid JWT token audience")] InvalidJwtTokenAudience, - #[error("JWT token has expired")] - JwtTokenHasExpired, + #[error("JWT token has expired (exp={0})")] + JwtTokenHasExpired(u64), - #[error("JWT token is not yet ready to use")] - JwtTokenNotYetReadyToUse, + #[error("JWT token is not yet ready to use (nbf={0})")] + JwtTokenNotYetReadyToUse(u64), } #[allow(dead_code, reason = "Debug use only")] @@ -1233,14 +1248,14 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL "nbf": now + 60, "aud": "neon", }}, - error: JwtClaimsError::JwtTokenNotYetReadyToUse, + error: JwtClaimsError::JwtTokenNotYetReadyToUse(now + 60), }, Test { body: json! {{ "exp": now - 60, "aud": ["neon"], }}, - error: JwtClaimsError::JwtTokenHasExpired, + error: JwtClaimsError::JwtTokenHasExpired(now - 60), }, Test { body: json! {{ From 50dc2fae771c6720b77eeb431e90a2cb300b9b5c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 6 May 2025 11:52:21 +0100 Subject: [PATCH 51/76] compute-node.Dockerfile: remove layer with duplicated name (#11807) ## Problem Two `rust-extensions-build-pgrx14` layers were added independently in two different PRs, and the layers are exactly the same ## Summary of changes - Remove one of `rust-extensions-build-pgrx14` layers --- compute/compute-node.Dockerfile | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index cc338cec6a..8766eb519e 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1084,23 +1084,12 @@ RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root + ######################################################################################### # # Layer "rust extensions pgrx14" # -######################################################################################### -FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14 -ARG PG_VERSION - -RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ - /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' - -USER root -######################################################################################### -# -# Layer "rust extensions pgrx14" -# -# Version 14 is now required by a few +# Version 14 is now required by a few # This layer should be used as a base for new pgrx extensions, # and eventually get merged with `rust-extensions-build` # From c82e363ed90742214de1bf5efb7a721019e89d42 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 6 May 2025 14:26:13 +0200 Subject: [PATCH 52/76] cleanup orphan projects created by python tests, too (#11836) ## Problem - some projects are created during GitHub workflows but not by action project_create but by python test scripts. 
If the python test fails the project is not deleted ## Summary of changes - make sure we cleanup those python created projects a few days after they are no longer used, too --- .github/workflows/benchmarking.yml | 2 +- test_runner/random_ops/test_random_ops.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 220d7905b1..79371ec704 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -64,7 +64,7 @@ jobs: env: ORG_ID: org-solitary-dew-09443886 LIMIT: 100 - SEARCH: "Created by actions/neon-project-create; GITHUB_RUN_ID" + SEARCH: "GITHUB_RUN_ID=" BASE_URL: https://console-stage.neon.build/api/v2 DRY_RUN: "false" # Set to "true" to just test out the workflow diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py index 643151fa11..645c9b7b9d 100644 --- a/test_runner/random_ops/test_random_ops.py +++ b/test_runner/random_ops/test_random_ops.py @@ -206,7 +206,7 @@ class NeonProject: self.neon_api = neon_api self.pg_bin = pg_bin proj = self.neon_api.create_project( - pg_version, f"Automatic random API test {os.getenv('GITHUB_RUN_ID')}" + pg_version, f"Automatic random API test GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" ) self.id: str = proj["project"]["id"] self.name: str = proj["project"]["name"] From 6827f2f58ccb8dec0922d0a2c7a413998cf2f539 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 6 May 2025 20:27:16 +0800 Subject: [PATCH 53/76] fix(pageserver): only keep `iter_with_options` API, improve docs in gc-compact (#11804) ## Problem Address comments in https://github.com/neondatabase/neon/pull/11709 ## Summary of changes - remove `iter` API, users always need to specify buffer size depending on the expected memory usage. - several doc improvements --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- .../src/tenant/storage_layer/delta_layer.rs | 15 +----- .../tenant/storage_layer/filter_iterator.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 15 +----- .../tenant/storage_layer/merge_iterator.rs | 46 +++++++++++-------- pageserver/src/tenant/timeline/compaction.rs | 13 ++++-- 5 files changed, 42 insertions(+), 51 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 607b0d513c..11875ac653 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1441,14 +1441,6 @@ impl DeltaLayerInner { offset } - pub fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { - self.iter_with_options( - ctx, - 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. - 1024, // The default value. 
Unit tests might use a different value - ) - } - pub fn iter_with_options<'a>( &'a self, ctx: &'a RequestContext, @@ -1634,7 +1626,6 @@ pub(crate) mod test { use crate::tenant::disk_btree::tests::TestDisk; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; - use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{TenantShard, Timeline}; /// Construct an index for a fictional delta layer and and then @@ -2311,8 +2302,7 @@ pub(crate) mod test { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined - let mut iter = delta_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size); let mut num_items = 0; for _ in 0..3 { iter.next_batch().await.unwrap(); @@ -2329,8 +2319,7 @@ pub(crate) mod test { iter.key_values_batch.clear(); } // Test if the result is correct - let mut iter = delta_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size); assert_delta_iter_equal(&mut iter, &test_deltas).await; } } diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index 8d172a1c19..1a330ecfc2 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -157,7 +157,7 @@ mod tests { .await .unwrap(); - let merge_iter = MergeIterator::create( + let merge_iter = MergeIterator::create_for_testing( &[resident_layer_1.get_as_delta(&ctx).await.unwrap()], &[], &ctx, @@ -182,7 +182,7 @@ mod tests { result.extend(test_deltas1[90..100].iter().cloned()); assert_filter_iter_equal(&mut filter_iter, &result).await; - let merge_iter = MergeIterator::create( + let merge_iter = MergeIterator::create_for_testing( &[resident_layer_1.get_as_delta(&ctx).await.unwrap()], &[], &ctx, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 2f7c5715bb..d684230572 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -684,14 +684,6 @@ impl ImageLayerInner { } } - pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { - self.iter_with_options( - ctx, - 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. - 1024, // The default value. 
Unit tests might use a different value - ) - } - pub(crate) fn iter_with_options<'a>( &'a self, ctx: &'a RequestContext, @@ -1240,7 +1232,6 @@ mod test { use crate::context::RequestContext; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; - use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{TenantShard, Timeline}; #[tokio::test] @@ -1507,8 +1498,7 @@ mod test { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined - let mut iter = img_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size); let mut num_items = 0; for _ in 0..3 { iter.next_batch().await.unwrap(); @@ -1525,8 +1515,7 @@ mod test { iter.key_values_batch.clear(); } // Test if the result is correct - let mut iter = img_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size); assert_img_iter_equal(&mut iter, &test_imgs, Lsn(0x10)).await; } } diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index e084e3d567..ea3dea50c3 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -19,14 +19,6 @@ pub(crate) enum LayerRef<'a> { } impl<'a> LayerRef<'a> { - #[allow(dead_code)] - fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> { - match self { - Self::Image(x) => LayerIterRef::Image(x.iter(ctx)), - Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)), - } - } - fn iter_with_options( self, ctx: &'a RequestContext, @@ -322,6 +314,28 @@ impl MergeIteratorItem for ((Key, Lsn, Value), Arc) { } impl<'a> MergeIterator<'a> { + #[cfg(test)] + pub(crate) fn create_for_testing( + deltas: &[&'a DeltaLayerInner], + images: &[&'a ImageLayerInner], + ctx: &'a RequestContext, + ) -> Self { + Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024) + } + + /// Create a new merge iterator with custom options. + /// + /// Adjust `max_read_size` and `max_batch_size` to trade memory usage for performance. The size should scale + /// with the number of layers to compact. If there are a lot of layers, consider reducing the values, so that + /// the buffer does not take too much memory. 
+ /// + /// The default options for L0 compactions are: + /// - max_read_size: 1024 * 8192 (8MB) + /// - max_batch_size: 1024 + /// + /// The default options for gc-compaction are: + /// - max_read_size: 128 * 8192 (1MB) + /// - max_batch_size: 128 pub fn create_with_options( deltas: &[&'a DeltaLayerInner], images: &[&'a ImageLayerInner], @@ -351,14 +365,6 @@ impl<'a> MergeIterator<'a> { } } - pub fn create( - deltas: &[&'a DeltaLayerInner], - images: &[&'a ImageLayerInner], - ctx: &'a RequestContext, - ) -> Self { - Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024) - } - pub(crate) async fn next_inner(&mut self) -> anyhow::Result> { while let Some(mut iter) = self.heap.peek_mut() { if !iter.is_loaded() { @@ -477,7 +483,7 @@ mod tests { let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) .await .unwrap(); - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_2.get_as_delta(&ctx).await.unwrap(), resident_layer_1.get_as_delta(&ctx).await.unwrap(), @@ -549,7 +555,7 @@ mod tests { let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) .await .unwrap(); - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_1.get_as_delta(&ctx).await.unwrap(), resident_layer_2.get_as_delta(&ctx).await.unwrap(), @@ -670,7 +676,7 @@ mod tests { // Test with different layer order for MergeIterator::create to ensure the order // is stable. - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_4.get_as_delta(&ctx).await.unwrap(), resident_layer_1.get_as_delta(&ctx).await.unwrap(), @@ -682,7 +688,7 @@ mod tests { ); assert_merge_iter_equal(&mut merge_iter, &expect).await; - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_1.get_as_delta(&ctx).await.unwrap(), resident_layer_4.get_as_delta(&ctx).await.unwrap(), diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9086d29d50..d0c13d86ce 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1994,7 +1994,13 @@ impl Timeline { let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; deltas.push(l); } - MergeIterator::create(&deltas, &[], ctx) + MergeIterator::create_with_options( + &deltas, + &[], + ctx, + 1024 * 8192, /* 8 MiB buffer per layer iterator */ + 1024, + ) }; // This iterator walks through all keys and is needed to calculate size used by each key @@ -2828,7 +2834,7 @@ impl Timeline { Ok(()) } - /// Check if the memory usage is within the limit. + /// Check to bail out of gc compaction early if it would use too much memory. async fn check_memory_usage( self: &Arc, layer_selection: &[Layer], @@ -2841,7 +2847,8 @@ impl Timeline { let layer_desc = layer.layer_desc(); if layer_desc.is_delta() { // Delta layers at most have 1MB buffer; 3x to make it safe (there're deltas as large as 16KB). - // Multiply the layer size so that tests can pass. + // Scale it by target_layer_size_bytes so that tests can pass (some tests, e.g., `test_pageserver_gc_compaction_preempt + // use 3MB layer size and we need to account for that). 
estimated_memory_usage_mb += 3.0 * (layer_desc.file_size / target_layer_size_bytes) as f64; num_delta_layers += 1; From 0e0ad073bf609fbc38e86f4030f1902c2632c5f7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 6 May 2025 15:57:34 +0200 Subject: [PATCH 54/76] storcon: fix split aborts removing other tenants (#11837) ## Problem When aborting a split, the code accidentally removes all other tenant shards from the in-memory map that have the same shard count as the aborted split, causing "tenant not found" errors. It will recover on a storcon restart, when it loads the persisted state. This issue has been present for at least a year. Resolves https://github.com/neondatabase/cloud/issues/28589. ## Summary of changes Only remove shards belonging to the relevant tenant when aborting a split. Also adds a regression test. --- storage_controller/src/service.rs | 3 ++- test_runner/regress/test_sharding.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 72379f0810..21c693af97 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5181,7 +5181,8 @@ impl Service { } // We don't expect any new_shard_count shards to exist here, but drop them just in case - tenants.retain(|_id, s| s.shard.count != *new_shard_count); + tenants + .retain(|id, s| !(id.tenant_id == *tenant_id && s.shard.count == *new_shard_count)); detach_locations }; diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 0bfc4b1d8c..4c9887fb92 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1334,6 +1334,13 @@ def test_sharding_split_failures( tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}' ) + # Create bystander tenants with various shard counts. They should not be affected by the aborted + # splits. Regression test for https://github.com/neondatabase/cloud/issues/28589. + bystanders = {} # id → shard_count + for bystander_shard_count in [1, 2, 4, 8]: + id, _ = env.create_tenant(shard_count=bystander_shard_count) + bystanders[id] = bystander_shard_count + env.storage_controller.allowed_errors.extend( [ # All split failures log a warning when then enqueue the abort operation @@ -1394,6 +1401,8 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) + if tenant_shard_id.tenant_id != tenant_id: + continue # skip bystanders log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == initial_shard_count if loc[1]["mode"] == "Secondary": @@ -1414,6 +1423,8 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) + if tenant_shard_id.tenant_id != tenant_id: + continue # skip bystanders log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == split_shard_count if loc[1]["mode"] == "Secondary": @@ -1496,6 +1507,12 @@ def test_sharding_split_failures( # the scheduler reaches an idle state env.storage_controller.reconcile_until_idle(timeout_secs=30) + # Check that all bystanders are still around. 
+ for bystander_id, bystander_shard_count in bystanders.items(): + response = env.storage_controller.tenant_describe(bystander_id) + assert TenantId(response["tenant_id"]) == bystander_id + assert len(response["shards"]) == bystander_shard_count + env.storage_controller.consistency_check() From 79ee78ea32754ed2b1cfa360c55a6a2aa11ad871 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ko=C5=82odziejczak?= <31549762+mrl5@users.noreply.github.com> Date: Tue, 6 May 2025 17:18:50 +0200 Subject: [PATCH 55/76] feat(compute): enable audit logs for pg_session_jwt extension (#11829) related to https://github.com/neondatabase/cloud/issues/28480 related to https://github.com/neondatabase/pg_session_jwt/pull/36 cc @MihaiBojin @conradludgate @lneves12 --- compute/compute-node.Dockerfile | 4 ++-- compute_tools/src/config.rs | 3 +++ .../ext-src/pg_session_jwt-src/expected/basic_functions.out | 1 + proxy/src/serverless/local_conn_pool.rs | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 8766eb519e..8bdf5cb7d1 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1322,8 +1322,8 @@ ARG PG_VERSION # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs WORKDIR /ext-src -RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \ - echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.1.tar.gz -O pg_session_jwt.tar.gz && \ + echo "62fec9e472cb805c53ba24a0765afdb8ea2720cfc03ae7813e61687b36d1b0ad pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 71c6123c3b..42d245f55a 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -223,6 +223,9 @@ pub fn write_postgres_conf( // TODO: tune this after performance testing writeln!(file, "pgaudit.log_rotation_age=5")?; + // Enable audit logs for pg_session_jwt extension + writeln!(file, "pg_session_jwt.audit_log=on")?; + // Add audit shared_preload_libraries, if they are not present. 
// // The caller who sets the flag is responsible for ensuring that the necessary diff --git a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out index ca54864ecd..ff6a7404cb 100644 --- a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out +++ b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out @@ -12,6 +12,7 @@ ERROR: invalid JWT encoding -- Test creating a session with an expired JWT SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw'); ERROR: Token used after it has expired +DETAIL: exp=1742564432 -- Test creating a session with a valid JWT SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg'); jwt_session_init diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 1d9b35f41d..bb5637cd5f 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -41,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; -pub(crate) const EXT_VERSION: &str = "0.3.0"; +pub(crate) const EXT_VERSION: &str = "0.3.1"; pub(crate) const EXT_SCHEMA: &str = "auth"; #[derive(Clone)] From f9b3a2e059c4c1fd764da011e7a5364d86e60491 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 6 May 2025 14:51:10 -0500 Subject: [PATCH 56/76] Add scoping to compute_ctl JWT claims (#11639) Currently we only have an admin scope which allows a user to bypass the compute_id check. When the admin scope is provided, validate the audience of the JWT to be "compute". 
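As a rough illustration (not part of this patch), the two token shapes that the new claims distinguish can be produced with PyJWT in the same way the added regression test does; the compute ID value and the throwaway key below are placeholders:

```python
# Sketch only: claim layouts for a compute-bound token vs. an admin-scoped token.
# The compute_id value is a hypothetical placeholder; the key is throwaway.
import jwt  # PyJWT, as used by the regression test added in this patch
from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey

private_key = Ed25519PrivateKey.generate()

# Regular token: bound to the compute that will validate it.
compute_token = jwt.encode(
    {"compute_id": "compute-12345"}, private_key, algorithm="EdDSA"
)

# Admin-scoped token: compute_id may be omitted, but the audience must be "compute".
admin_token = jwt.encode(
    {"scope": "admin", "aud": "compute"}, private_key, algorithm="EdDSA"
)
```

With this layout a compute-bound token is rejected by any other compute, while an admin-scoped token is accepted as long as its audience checks out.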
Closes: https://github.com/neondatabase/cloud/issues/27614 Signed-off-by: Tristan Partin --- .../src/http/middleware/authorize.rs | 59 +++++++++++--- control_plane/src/bin/neon_local.rs | 20 +++-- control_plane/src/endpoint.rs | 20 +++-- libs/compute_api/src/requests.rs | 41 +++++++++- test_runner/fixtures/endpoint/http.py | 12 +++ test_runner/fixtures/neon_cli.py | 7 +- test_runner/fixtures/neon_fixtures.py | 12 ++- test_runner/regress/test_compute_http.py | 78 +++++++++++++++++++ 8 files changed, 222 insertions(+), 27 deletions(-) create mode 100644 test_runner/regress/test_compute_http.py diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index 2d0f411d7a..2afc57ad9c 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -1,12 +1,10 @@ -use std::collections::HashSet; - use anyhow::{Result, anyhow}; use axum::{RequestExt, body::Body}; use axum_extra::{ TypedHeader, headers::{Authorization, authorization::Bearer}, }; -use compute_api::requests::ComputeClaims; +use compute_api::requests::{COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope}; use futures::future::BoxFuture; use http::{Request, Response, StatusCode}; use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet}; @@ -25,13 +23,14 @@ pub(in crate::http) struct Authorize { impl Authorize { pub fn new(compute_id: String, jwks: JwkSet) -> Self { let mut validation = Validation::new(Algorithm::EdDSA); - // Nothing is currently required - validation.required_spec_claims = HashSet::new(); validation.validate_exp = true; // Unused by the control plane - validation.validate_aud = false; - // Unused by the control plane validation.validate_nbf = false; + // Unused by the control plane + validation.validate_aud = false; + validation.set_audience(&[COMPUTE_AUDIENCE]); + // Nothing is currently required + validation.set_required_spec_claims(&[] as &[&str; 0]); Self { compute_id, @@ -64,11 +63,47 @@ impl AsyncAuthorizeRequest for Authorize { Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)), }; - if data.claims.compute_id != compute_id { - return Err(JsonResponse::error( - StatusCode::UNAUTHORIZED, - "invalid compute ID in authorization token claims", - )); + match data.claims.scope { + // TODO: We should validate audience for every token, but + // instead of this ad-hoc validation, we should turn + // [`Validation::validate_aud`] on. This is merely a stopgap + // while we roll out `aud` deployment. We return a 401 + // Unauthorized because when we eventually do use + // [`Validation`], we will hit the above `Err` match arm which + // returns 401 Unauthorized. 
+ Some(ComputeClaimsScope::Admin) => { + let Some(ref audience) = data.claims.audience else { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "missing audience in authorization token claims", + )); + }; + + if audience != COMPUTE_AUDIENCE { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "invalid audience in authorization token claims", + )); + } + } + + // If the scope is not [`ComputeClaimsScope::Admin`], then we + // must validate the compute_id + _ => { + let Some(ref claimed_compute_id) = data.claims.compute_id else { + return Err(JsonResponse::error( + StatusCode::FORBIDDEN, + "missing compute_id in authorization token claims", + )); + }; + + if *claimed_compute_id != compute_id { + return Err(JsonResponse::error( + StatusCode::FORBIDDEN, + "invalid compute ID in authorization token claims", + )); + } + } } // Make claims available to any subsequent middleware or request diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 6f55c0310f..44698f7b23 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -16,6 +16,7 @@ use std::time::Duration; use anyhow::{Context, Result, anyhow, bail}; use clap::Parser; +use compute_api::requests::ComputeClaimsScope; use compute_api::spec::ComputeMode; use control_plane::broker::StorageBroker; use control_plane::endpoint::ComputeControlPlane; @@ -705,6 +706,9 @@ struct EndpointStopCmdArgs { struct EndpointGenerateJwtCmdArgs { #[clap(help = "Postgres endpoint id")] endpoint_id: String, + + #[clap(short = 's', long, help = "Scope to generate the JWT with", value_parser = ComputeClaimsScope::from_str)] + scope: Option, } #[derive(clap::Subcommand)] @@ -1540,12 +1544,16 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res endpoint.stop(&args.mode, args.destroy)?; } EndpointCmd::GenerateJwt(args) => { - let endpoint_id = &args.endpoint_id; - let endpoint = cplane - .endpoints - .get(endpoint_id) - .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - let jwt = endpoint.generate_jwt()?; + let endpoint = { + let endpoint_id = &args.endpoint_id; + + cplane + .endpoints + .get(endpoint_id) + .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))? + }; + + let jwt = endpoint.generate_jwt(args.scope)?; print!("{jwt}"); } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 4071b620d6..0b16339a6f 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -45,7 +45,9 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use anyhow::{Context, Result, anyhow, bail}; -use compute_api::requests::{ComputeClaims, ConfigurationRequest}; +use compute_api::requests::{ + COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest, +}; use compute_api::responses::{ ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig, }; @@ -630,9 +632,17 @@ impl Endpoint { } /// Generate a JWT with the correct claims. 
- pub fn generate_jwt(&self) -> Result { + pub fn generate_jwt(&self, scope: Option) -> Result { self.env.generate_auth_token(&ComputeClaims { - compute_id: self.endpoint_id.clone(), + audience: match scope { + Some(ComputeClaimsScope::Admin) => Some(COMPUTE_AUDIENCE.to_owned()), + _ => Some(self.endpoint_id.clone()), + }, + compute_id: match scope { + Some(ComputeClaimsScope::Admin) => None, + _ => Some(self.endpoint_id.clone()), + }, + scope, }) } @@ -903,7 +913,7 @@ impl Endpoint { self.external_http_address.port() ), ) - .bearer_auth(self.generate_jwt()?) + .bearer_auth(self.generate_jwt(None::)?) .send() .await?; @@ -980,7 +990,7 @@ impl Endpoint { self.external_http_address.port() )) .header(CONTENT_TYPE.as_str(), "application/json") - .bearer_auth(self.generate_jwt()?) + .bearer_auth(self.generate_jwt(None::)?) .body( serde_json::to_string(&ConfigurationRequest { spec, diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 98f2fc297c..40d34eccea 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,16 +1,55 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. +use std::str::FromStr; + use serde::{Deserialize, Serialize}; use crate::privilege::Privilege; use crate::responses::ComputeCtlConfig; use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; +/// The value to place in the [`ComputeClaims::audience`] claim. +pub static COMPUTE_AUDIENCE: &str = "compute"; + +#[derive(Copy, Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +/// Available scopes for a compute's JWT. +pub enum ComputeClaimsScope { + /// An admin-scoped token allows access to all of `compute_ctl`'s authorized + /// facilities. + Admin, +} + +impl FromStr for ComputeClaimsScope { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "admin" => Ok(ComputeClaimsScope::Admin), + _ => Err(anyhow::anyhow!("invalid compute claims scope \"{s}\"")), + } + } +} + /// When making requests to the `compute_ctl` external HTTP server, the client /// must specify a set of claims in `Authorization` header JWTs such that /// `compute_ctl` can authorize the request. #[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename = "snake_case")] pub struct ComputeClaims { - pub compute_id: String, + /// The compute ID that will validate the token. The only case in which this + /// can be [`None`] is if [`Self::scope`] is + /// [`ComputeClaimsScope::Admin`]. + pub compute_id: Option, + + /// The scope of what the token authorizes. + pub scope: Option, + + /// The recipient the token is intended for. + /// + /// See [RFC 7519](https://www.rfc-editor.org/rfc/rfc7519#section-4.1.3) for + /// more information. + #[serde(rename = "aud")] + pub audience: Option, } /// Request of the /configure API diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 652c38f5c3..beed1dcd93 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -1,6 +1,7 @@ from __future__ import annotations import urllib.parse +from enum import StrEnum from typing import TYPE_CHECKING, final import requests @@ -14,6 +15,17 @@ if TYPE_CHECKING: from requests import PreparedRequest +COMPUTE_AUDIENCE = "compute" +""" +The value to place in the `aud` claim. 
+""" + + +@final +class ComputeClaimsScope(StrEnum): + ADMIN = "admin" + + @final class BearerAuth(AuthBase): """ diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index b5d69b5ab6..3be78719d7 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: Any, ) + from fixtures.endpoint.http import ComputeClaimsScope from fixtures.pg_version import PgVersion @@ -535,12 +536,16 @@ class NeonLocalCli(AbstractNeonCli): res.check_returncode() return res - def endpoint_generate_jwt(self, endpoint_id: str) -> str: + def endpoint_generate_jwt( + self, endpoint_id: str, scope: ComputeClaimsScope | None = None + ) -> str: """ Generate a JWT for making requests to the endpoint's external HTTP server. """ args = ["endpoint", "generate-jwt", endpoint_id] + if scope: + args += ["--scope", str(scope)] cmd = self.raw_cli(args) cmd.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 47d1228c61..133be5c045 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -51,7 +51,7 @@ from fixtures.common_types import ( TimelineId, ) from fixtures.compute_migrations import NUM_COMPUTE_MIGRATIONS -from fixtures.endpoint.http import EndpointHttpClient +from fixtures.endpoint.http import ComputeClaimsScope, EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.neon_cli import NeonLocalCli, Pagectl @@ -4218,7 +4218,7 @@ class Endpoint(PgProtocol, LogUtils): self.config(config_lines) - self.__jwt = self.env.neon_cli.endpoint_generate_jwt(self.endpoint_id) + self.__jwt = self.generate_jwt() return self @@ -4265,6 +4265,14 @@ class Endpoint(PgProtocol, LogUtils): return self + def generate_jwt(self, scope: ComputeClaimsScope | None = None) -> str: + """ + Generate a JWT for making requests to the endpoint's external HTTP + server. + """ + assert self.endpoint_id is not None + return self.env.neon_cli.endpoint_generate_jwt(self.endpoint_id, scope) + def endpoint_path(self) -> Path: """Path to endpoint directory""" assert self.endpoint_id diff --git a/test_runner/regress/test_compute_http.py b/test_runner/regress/test_compute_http.py new file mode 100644 index 0000000000..ce31ff0fe6 --- /dev/null +++ b/test_runner/regress/test_compute_http.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +from http.client import FORBIDDEN, UNAUTHORIZED +from typing import TYPE_CHECKING + +import jwt +import pytest +from fixtures.endpoint.http import COMPUTE_AUDIENCE, ComputeClaimsScope, EndpointHttpClient +from fixtures.utils import run_only_on_default_postgres +from requests import RequestException + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +@run_only_on_default_postgres("The code path being tested is not dependent on Postgres version") +def test_compute_no_scope_claim(neon_simple_env: NeonEnv): + """ + Test that if the JWT scope is not admin and no compute_id is specified, + the external HTTP server returns a 403 Forbidden error. 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + # Encode nothing in the token + token = jwt.encode({}, env.auth_keys.priv, algorithm="EdDSA") + + # Create an admin-scoped HTTP client + client = EndpointHttpClient( + external_port=endpoint.external_http_port, + internal_port=endpoint.internal_http_port, + jwt=token, + ) + + try: + client.status() + pytest.fail("Exception should have been raised") + except RequestException as e: + assert e.response is not None + assert e.response.status_code == FORBIDDEN + + +@pytest.mark.parametrize( + "audience", + (COMPUTE_AUDIENCE, "invalid", None), + ids=["with_audience", "with_invalid_audience", "without_audience"], +) +@run_only_on_default_postgres("The code path being tested is not dependent on Postgres version") +def test_compute_admin_scope_claim(neon_simple_env: NeonEnv, audience: str | None): + """ + Test that an admin-scoped JWT can access the compute's external HTTP server + without the compute_id being specified in the claims. + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + data = {"scope": str(ComputeClaimsScope.ADMIN)} + if audience: + data["aud"] = audience + + token = jwt.encode(data, env.auth_keys.priv, algorithm="EdDSA") + + # Create an admin-scoped HTTP client + client = EndpointHttpClient( + external_port=endpoint.external_http_port, + internal_port=endpoint.internal_http_port, + jwt=token, + ) + + try: + client.status() + if audience != COMPUTE_AUDIENCE: + pytest.fail("Exception should have been raised") + except RequestException as e: + assert e.response is not None + assert e.response.status_code == UNAUTHORIZED From 384e3df2ad2c1561cc9d427793b2d54eaa3f137b Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Tue, 6 May 2025 17:52:15 -0400 Subject: [PATCH 57/76] fix: pinned anon extension to v2.1.0 (#11844) ## Problem Currently the setup for `anon` v2 in the compute image downloads the latest version of the extension. This can be problematic as on a compute start/restart it can download a version that is newer than what we have tested and potentially break things, hence not giving us the ability to control when the extension is updated. We were also using `v2.2.0`, which is not ready for production yet and has been clarified by the maintainer. Additional context: https://gitlab.com/dalibo/postgresql_anonymizer/-/issues/530 ## Summary of changes Changed the URL from which we download the `anon` extension to point to `v2.1.0` instead of `latest`. --- compute/compute-node.Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 8bdf5cb7d1..6233eaf709 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1351,7 +1351,8 @@ COPY compute/patches/anon_v2.patch . # This is an experimental extension, never got to real production. # !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. 
ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/latest/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \ +RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/2.1.0/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \ + echo "48e7f5ae2f1ca516df3da86c5c739d48dd780a4e885705704ccaad0faa89d6c0 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt && \ sed -i 's/pgrx = "0.14.1"/pgrx = { version = "=0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ From 5c356c63ebefa448ea521565df35d59f1c4a933b Mon Sep 17 00:00:00 2001 From: Mikhail Date: Tue, 6 May 2025 23:02:12 +0100 Subject: [PATCH 58/76] endpoint_storage compute_ctl integration (#11550) Add `/lfc/(prewarm|offload)` routes to `compute_ctl` which interact with endpoint storage. Add `prewarm_lfc_on_startup` spec option which, if enabled, downloads LFC prewarm data on compute startup. Resolves: https://github.com/neondatabase/cloud/issues/26343 --- Cargo.lock | 2 + Cargo.toml | 1 + compute_tools/Cargo.toml | 1 + compute_tools/src/compute.rs | 49 ++++- compute_tools/src/compute_prewarm.rs | 202 +++++++++++++++++++ compute_tools/src/http/routes/lfc.rs | 39 ++++ compute_tools/src/http/routes/mod.rs | 1 + compute_tools/src/http/server.rs | 4 +- compute_tools/src/lib.rs | 1 + compute_tools/src/metrics.rs | 22 +- compute_tools/tests/pg_helpers_tests.rs | 1 + control_plane/Cargo.toml | 2 +- control_plane/src/bin/neon_local.rs | 19 +- control_plane/src/endpoint.rs | 5 + control_plane/src/endpoint_storage.rs | 10 +- control_plane/src/local_env.rs | 16 +- endpoint_storage/src/app.rs | 12 +- endpoint_storage/src/claims.rs | 52 +++++ endpoint_storage/src/lib.rs | 85 +++----- libs/compute_api/src/responses.rs | 24 +++ libs/compute_api/src/spec.rs | 9 + libs/compute_api/tests/cluster_spec.json | 5 + libs/utils/src/id.rs | 3 + test_runner/fixtures/endpoint/http.py | 30 +++ test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/regress/test_endpoint_storage.py | 3 +- test_runner/regress/test_lfc_prewarm.py | 166 ++++++++++----- 27 files changed, 630 insertions(+), 138 deletions(-) create mode 100644 compute_tools/src/compute_prewarm.rs create mode 100644 compute_tools/src/http/routes/lfc.rs create mode 100644 endpoint_storage/src/claims.rs diff --git a/Cargo.lock b/Cargo.lock index 4c464c62b8..fe4cc35029 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1284,6 +1284,7 @@ name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", + "async-compression", "aws-config", "aws-sdk-kms", "aws-sdk-s3", @@ -1420,6 +1421,7 @@ dependencies = [ "clap", "comfy-table", "compute_api", + "endpoint_storage", "futures", "http-utils", "humantime", diff --git a/Cargo.toml b/Cargo.toml index 1c203af9e0..8d4cc4a75a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -243,6 +243,7 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } +endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" } http-utils = { version = "0.1", path = "./libs/http-utils/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver = { path = "./pageserver" } diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 
8c1e7ad149..8ee5dd0665 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -10,6 +10,7 @@ default = [] testing = ["fail/failpoints"] [dependencies] +async-compression.workspace = true base64.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 8834f0d63d..08d915b331 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,17 +1,10 @@ -use std::collections::HashMap; -use std::os::unix::fs::{PermissionsExt, symlink}; -use std::path::Path; -use std::process::{Command, Stdio}; -use std::str::FromStr; -use std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::{Arc, Condvar, Mutex, RwLock}; -use std::time::{Duration, Instant}; -use std::{env, fs}; - use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; -use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus}; +use compute_api::responses::{ + ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState, + LfcPrewarmState, +}; use compute_api::spec::{ ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, }; @@ -25,6 +18,16 @@ use postgres; use postgres::NoTls; use postgres::error::SqlState; use remote_storage::{DownloadError, RemotePath}; +use std::collections::HashMap; +use std::net::SocketAddr; +use std::os::unix::fs::{PermissionsExt, symlink}; +use std::path::Path; +use std::process::{Command, Stdio}; +use std::str::FromStr; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, Condvar, Mutex, RwLock}; +use std::time::{Duration, Instant}; +use std::{env, fs}; use tokio::spawn; use tracing::{Instrument, debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; @@ -150,6 +153,9 @@ pub struct ComputeState { /// set up the span relationship ourselves. pub startup_span: Option, + pub lfc_prewarm_state: LfcPrewarmState, + pub lfc_offload_state: LfcOffloadState, + pub metrics: ComputeMetrics, } @@ -163,6 +169,8 @@ impl ComputeState { pspec: None, startup_span: None, metrics: ComputeMetrics::default(), + lfc_prewarm_state: LfcPrewarmState::default(), + lfc_offload_state: LfcOffloadState::default(), } } @@ -198,6 +206,8 @@ pub struct ParsedSpec { pub pageserver_connstr: String, pub safekeeper_connstrings: Vec, pub storage_auth_token: Option, + pub endpoint_storage_addr: Option, + pub endpoint_storage_token: Option, } impl TryFrom for ParsedSpec { @@ -251,6 +261,18 @@ impl TryFrom for ParsedSpec { .or(Err("invalid timeline id"))? 
}; + let endpoint_storage_addr: Option = spec + .endpoint_storage_addr + .clone() + .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr")) + .unwrap_or_default() + .parse() + .ok(); + let endpoint_storage_token = spec + .endpoint_storage_token + .clone() + .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_token")); + Ok(ParsedSpec { spec, pageserver_connstr, @@ -258,6 +280,8 @@ impl TryFrom for ParsedSpec { storage_auth_token, tenant_id, timeline_id, + endpoint_storage_addr, + endpoint_storage_token, }) } } @@ -736,6 +760,9 @@ impl ComputeNode { // Log metrics so that we can search for slow operations in logs info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished"); + if pspec.spec.prewarm_lfc_on_startup { + self.prewarm_lfc(); + } Ok(()) } diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs new file mode 100644 index 0000000000..a6a84b3f1f --- /dev/null +++ b/compute_tools/src/compute_prewarm.rs @@ -0,0 +1,202 @@ +use crate::compute::ComputeNode; +use anyhow::{Context, Result, bail}; +use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder}; +use compute_api::responses::LfcOffloadState; +use compute_api::responses::LfcPrewarmState; +use http::StatusCode; +use reqwest::Client; +use std::sync::Arc; +use tokio::{io::AsyncReadExt, spawn}; +use tracing::{error, info}; + +#[derive(serde::Serialize, Default)] +pub struct LfcPrewarmStateWithProgress { + #[serde(flatten)] + base: LfcPrewarmState, + total: i32, + prewarmed: i32, + skipped: i32, +} + +/// A pair of url and a token to query endpoint storage for LFC prewarm-related tasks +struct EndpointStoragePair { + url: String, + token: String, +} + +const KEY: &str = "lfc_state"; +impl TryFrom<&crate::compute::ParsedSpec> for EndpointStoragePair { + type Error = anyhow::Error; + fn try_from(pspec: &crate::compute::ParsedSpec) -> Result { + let Some(ref endpoint_id) = pspec.spec.endpoint_id else { + bail!("pspec.endpoint_id missing") + }; + let Some(ref base_uri) = pspec.endpoint_storage_addr else { + bail!("pspec.endpoint_storage_addr missing") + }; + let tenant_id = pspec.tenant_id; + let timeline_id = pspec.timeline_id; + + let url = format!("http://{base_uri}/{tenant_id}/{timeline_id}/{endpoint_id}/{KEY}"); + let Some(ref token) = pspec.endpoint_storage_token else { + bail!("pspec.endpoint_storage_token missing") + }; + let token = token.clone(); + Ok(EndpointStoragePair { url, token }) + } +} + +impl ComputeNode { + // If prewarm failed, we want to get overall number of segments as well as done ones. + // However, this function should be reliable even if querying postgres failed. 
+ pub async fn lfc_prewarm_state(&self) -> LfcPrewarmStateWithProgress { + info!("requesting LFC prewarm state from postgres"); + let mut state = LfcPrewarmStateWithProgress::default(); + { + state.base = self.state.lock().unwrap().lfc_prewarm_state.clone(); + } + + let client = match ComputeNode::get_maintenance_client(&self.tokio_conn_conf).await { + Ok(client) => client, + Err(err) => { + error!(%err, "connecting to postgres"); + return state; + } + }; + let row = match client + .query_one("select * from get_prewarm_info()", &[]) + .await + { + Ok(row) => row, + Err(err) => { + error!(%err, "querying LFC prewarm status"); + return state; + } + }; + state.total = row.try_get(0).unwrap_or_default(); + state.prewarmed = row.try_get(1).unwrap_or_default(); + state.skipped = row.try_get(2).unwrap_or_default(); + state + } + + pub fn lfc_offload_state(&self) -> LfcOffloadState { + self.state.lock().unwrap().lfc_offload_state.clone() + } + + /// Returns false if there is a prewarm request ongoing, true otherwise + pub fn prewarm_lfc(self: &Arc) -> bool { + crate::metrics::LFC_PREWARM_REQUESTS.inc(); + { + let state = &mut self.state.lock().unwrap().lfc_prewarm_state; + if let LfcPrewarmState::Prewarming = + std::mem::replace(state, LfcPrewarmState::Prewarming) + { + return false; + } + } + + let cloned = self.clone(); + spawn(async move { + let Err(err) = cloned.prewarm_impl().await else { + cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed; + return; + }; + error!(%err); + cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed { + error: err.to_string(), + }; + }); + true + } + + fn endpoint_storage_pair(&self) -> Result { + let state = self.state.lock().unwrap(); + state.pspec.as_ref().unwrap().try_into() + } + + async fn prewarm_impl(&self) -> Result<()> { + let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?; + info!(%url, "requesting LFC state from endpoint storage"); + + let request = Client::new().get(&url).bearer_auth(token); + let res = request.send().await.context("querying endpoint storage")?; + let status = res.status(); + if status != StatusCode::OK { + bail!("{status} querying endpoint storage") + } + + let mut uncompressed = Vec::new(); + let lfc_state = res + .bytes() + .await + .context("getting request body from endpoint storage")?; + ZstdDecoder::new(lfc_state.iter().as_slice()) + .read_to_end(&mut uncompressed) + .await + .context("decoding LFC state")?; + let uncompressed_len = uncompressed.len(); + info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into postgres"); + + ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")? 
+ .query_one("select prewarm_local_cache($1)", &[&uncompressed]) + .await + .context("loading LFC state into postgres") + .map(|_| ()) + } + + /// Returns false if there is an offload request ongoing, true otherwise + pub fn offload_lfc(self: &Arc) -> bool { + crate::metrics::LFC_OFFLOAD_REQUESTS.inc(); + { + let state = &mut self.state.lock().unwrap().lfc_offload_state; + if let LfcOffloadState::Offloading = + std::mem::replace(state, LfcOffloadState::Offloading) + { + return false; + } + } + + let cloned = self.clone(); + spawn(async move { + let Err(err) = cloned.offload_lfc_impl().await else { + cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; + return; + }; + error!(%err); + cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { + error: err.to_string(), + }; + }); + true + } + + async fn offload_lfc_impl(&self) -> Result<()> { + let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?; + info!(%url, "requesting LFC state from postgres"); + + let mut compressed = Vec::new(); + ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")? + .query_one("select get_local_cache_state()", &[]) + .await + .context("querying LFC state")? + .try_get::(0) + .context("deserializing LFC state") + .map(ZstdEncoder::new)? + .read_to_end(&mut compressed) + .await + .context("compressing LFC state")?; + let compressed_len = compressed.len(); + info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage"); + + let request = Client::new().put(url).bearer_auth(token).body(compressed); + match request.send().await { + Ok(res) if res.status() == StatusCode::OK => Ok(()), + Ok(res) => bail!("Error writing to endpoint storage: {}", res.status()), + Err(err) => Err(err).context("writing to endpoint storage"), + } + } +} diff --git a/compute_tools/src/http/routes/lfc.rs b/compute_tools/src/http/routes/lfc.rs new file mode 100644 index 0000000000..07bcc6bfb7 --- /dev/null +++ b/compute_tools/src/http/routes/lfc.rs @@ -0,0 +1,39 @@ +use crate::compute_prewarm::LfcPrewarmStateWithProgress; +use crate::http::JsonResponse; +use axum::response::{IntoResponse, Response}; +use axum::{Json, http::StatusCode}; +use compute_api::responses::LfcOffloadState; +type Compute = axum::extract::State>; + +pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json { + Json(compute.lfc_prewarm_state().await) +} + +// Following functions are marked async for axum, as it's more convenient than wrapping these +// in async lambdas at call site + +pub(in crate::http) async fn offload_state(compute: Compute) -> Json { + Json(compute.lfc_offload_state()) +} + +pub(in crate::http) async fn prewarm(compute: Compute) -> Response { + if compute.prewarm_lfc() { + StatusCode::ACCEPTED.into_response() + } else { + JsonResponse::error( + StatusCode::TOO_MANY_REQUESTS, + "Multiple requests for prewarm are not allowed", + ) + } +} + +pub(in crate::http) async fn offload(compute: Compute) -> Response { + if compute.offload_lfc() { + StatusCode::ACCEPTED.into_response() + } else { + JsonResponse::error( + StatusCode::TOO_MANY_REQUESTS, + "Multiple requests for prewarm offload are not allowed", + ) + } +} diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs index a67be7fd5a..432e66a830 100644 --- a/compute_tools/src/http/routes/mod.rs +++ b/compute_tools/src/http/routes/mod.rs @@ -11,6 +11,7 @@ pub(in crate::http) mod extensions; pub(in crate::http) mod 
failpoints; pub(in crate::http) mod grants; pub(in crate::http) mod insights; +pub(in crate::http) mod lfc; pub(in crate::http) mod metrics; pub(in crate::http) mod metrics_json; pub(in crate::http) mod status; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 10f767e97c..d5d2427971 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -23,7 +23,7 @@ use super::{ middleware::authorize::Authorize, routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, insights, metrics, metrics_json, status, terminate, + grants, insights, lfc, metrics, metrics_json, status, terminate, }, }; use crate::compute::ComputeNode; @@ -85,6 +85,8 @@ impl From<&Server> for Router> { Router::>::new().route("/metrics", get(metrics::get_metrics)); let authenticated_router = Router::>::new() + .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm)) + .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload)) .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) .route("/database_schema", get(database_schema::get_schema_dump)) diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index a681fad0b0..7218067a8a 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -11,6 +11,7 @@ pub mod http; pub mod logger; pub mod catalog; pub mod compute; +pub mod compute_prewarm; pub mod disk_quota; pub mod extension_server; pub mod installed_extensions; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index e37d6120ac..90326b2074 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,7 +1,7 @@ use metrics::core::{AtomicF64, AtomicU64, Collector, GenericCounter, GenericGauge}; use metrics::proto::MetricFamily; use metrics::{ - IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter, + IntCounter, IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter, register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec, }; use once_cell::sync::Lazy; @@ -97,6 +97,24 @@ pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy> = Lazy:: .expect("failed to define a metric") }); +/// Needed as neon.file_cache_prewarm_batch == 0 doesn't mean we never tried to prewarm. 
+/// On the other hand, LFC_PREWARMED_PAGES is excessive as we can GET /lfc/prewarm +pub(crate) static LFC_PREWARM_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter!( + "compute_ctl_lfc_prewarm_requests_total", + "Total number of LFC prewarm requests made by compute_ctl", + ) + .expect("failed to define a metric") +}); + +pub(crate) static LFC_OFFLOAD_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter!( + "compute_ctl_lfc_offload_requests_total", + "Total number of LFC offload requests made by compute_ctl", + ) + .expect("failed to define a metric") +}); + pub fn collect() -> Vec { let mut metrics = COMPUTE_CTL_UP.collect(); metrics.extend(INSTALLED_EXTENSIONS.collect()); @@ -106,5 +124,7 @@ pub fn collect() -> Vec { metrics.extend(AUDIT_LOG_DIR_SIZE.collect()); metrics.extend(PG_CURR_DOWNTIME_MS.collect()); metrics.extend(PG_TOTAL_DOWNTIME_MS.collect()); + metrics.extend(LFC_PREWARM_REQUESTS.collect()); + metrics.extend(LFC_OFFLOAD_REQUESTS.collect()); metrics } diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index b72c1293ee..53f2ddad84 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -30,6 +30,7 @@ mod pg_helpers_tests { r#"fsync = off wal_level = logical hot_standby = on +prewarm_lfc_on_startup = off neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501' wal_log_hints = on log_connections = on diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 92f0071bac..62c039047f 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -41,7 +41,7 @@ storage_broker.workspace = true http-utils.workspace = true utils.workspace = true whoami.workspace = true - +endpoint_storage.workspace = true compute_api.workspace = true workspace_hack.workspace = true tracing.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 44698f7b23..fd625e9ed6 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -20,7 +20,7 @@ use compute_api::requests::ComputeClaimsScope; use compute_api::spec::ComputeMode; use control_plane::broker::StorageBroker; use control_plane::endpoint::ComputeControlPlane; -use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_PORT, EndpointStorage}; +use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage}; use control_plane::local_env; use control_plane::local_env::{ EndpointStorageConf, InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, @@ -1022,7 +1022,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { }) .collect(), endpoint_storage: EndpointStorageConf { - port: ENDPOINT_STORAGE_DEFAULT_PORT, + listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR, }, pg_distrib_dir: None, neon_distrib_dir: None, @@ -1488,10 +1488,25 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res None }; + let exp = (std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)? 
+ + Duration::from_secs(86400)) + .as_secs(); + let claims = endpoint_storage::claims::EndpointStorageClaims { + tenant_id: endpoint.tenant_id, + timeline_id: endpoint.timeline_id, + endpoint_id: endpoint_id.to_string(), + exp, + }; + + let endpoint_storage_token = env.generate_auth_token(&claims)?; + let endpoint_storage_addr = env.endpoint_storage.listen_addr.to_string(); + println!("Starting existing endpoint {endpoint_id}..."); endpoint .start( &auth_token, + endpoint_storage_token, + endpoint_storage_addr, safekeepers_generation, safekeepers, pageservers, diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 0b16339a6f..fe6a93eb5e 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -650,6 +650,8 @@ impl Endpoint { pub async fn start( &self, auth_token: &Option, + endpoint_storage_token: String, + endpoint_storage_addr: String, safekeepers_generation: Option, safekeepers: Vec, pageservers: Vec<(Host, u16)>, @@ -743,6 +745,9 @@ impl Endpoint { drop_subscriptions_before_start: self.drop_subscriptions_before_start, audit_log_level: ComputeAudit::Disabled, logs_export_host: None::, + endpoint_storage_addr: Some(endpoint_storage_addr), + endpoint_storage_token: Some(endpoint_storage_token), + prewarm_lfc_on_startup: false, }; // this strange code is needed to support respec() in tests diff --git a/control_plane/src/endpoint_storage.rs b/control_plane/src/endpoint_storage.rs index 102db91a22..171aaeddb4 100644 --- a/control_plane/src/endpoint_storage.rs +++ b/control_plane/src/endpoint_storage.rs @@ -3,17 +3,19 @@ use crate::local_env::LocalEnv; use anyhow::{Context, Result}; use camino::Utf8PathBuf; use std::io::Write; +use std::net::SocketAddr; use std::time::Duration; /// Directory within .neon which will be used by default for LocalFs remote storage. 
pub const ENDPOINT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/endpoint_storage"; -pub const ENDPOINT_STORAGE_DEFAULT_PORT: u16 = 9993; +pub const ENDPOINT_STORAGE_DEFAULT_ADDR: SocketAddr = + SocketAddr::new(std::net::IpAddr::V4(std::net::Ipv4Addr::LOCALHOST), 9993); pub struct EndpointStorage { pub bin: Utf8PathBuf, pub data_dir: Utf8PathBuf, pub pemfile: Utf8PathBuf, - pub port: u16, + pub addr: SocketAddr, } impl EndpointStorage { @@ -22,7 +24,7 @@ impl EndpointStorage { bin: Utf8PathBuf::from_path_buf(env.endpoint_storage_bin()).unwrap(), data_dir: Utf8PathBuf::from_path_buf(env.endpoint_storage_data_dir()).unwrap(), pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(), - port: env.endpoint_storage.port, + addr: env.endpoint_storage.listen_addr, } } @@ -31,7 +33,7 @@ impl EndpointStorage { } fn listen_addr(&self) -> Utf8PathBuf { - format!("127.0.0.1:{}", self.port).into() + format!("{}:{}", self.addr.ip(), self.addr.port()).into() } pub fn init(&self) -> Result<()> { diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index a18b34daa4..4a8892c6de 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -20,7 +20,9 @@ use utils::auth::encode_from_key_file; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use crate::broker::StorageBroker; -use crate::endpoint_storage::{ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage}; +use crate::endpoint_storage::{ + ENDPOINT_STORAGE_DEFAULT_ADDR, ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage, +}; use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode}; use crate::safekeeper::SafekeeperNode; @@ -151,10 +153,10 @@ pub struct NeonLocalInitConf { pub generate_local_ssl_certs: bool, } -#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct EndpointStorageConf { - pub port: u16, + pub listen_addr: SocketAddr, } /// Broker config for cluster internal communication. 
@@ -241,6 +243,14 @@ impl Default for NeonStorageControllerConf { } } +impl Default for EndpointStorageConf { + fn default() -> Self { + Self { + listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR, + } + } +} + impl NeonBroker { pub fn client_url(&self) -> Url { let url = if let Some(addr) = self.listen_https_addr { diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs index f07ef06328..0bd7fe5f28 100644 --- a/endpoint_storage/src/app.rs +++ b/endpoint_storage/src/app.rs @@ -343,7 +343,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]); const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg"; fn token() -> String { - let claims = endpoint_storage::Claims { + let claims = endpoint_storage::claims::EndpointStorageClaims { tenant_id: TENANT_ID, timeline_id: TIMELINE_ID, endpoint_id: ENDPOINT_ID.into(), @@ -489,16 +489,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH } fn delete_prefix_token(uri: &str) -> String { - use serde::Serialize; let parts = uri.split("/").collect::>(); - #[derive(Serialize)] - struct PrefixClaims { - tenant_id: TenantId, - timeline_id: Option, - endpoint_id: Option, - exp: u64, - } - let claims = PrefixClaims { + let claims = endpoint_storage::claims::DeletePrefixClaims { tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(), timeline_id: parts.get(2).map(|c| c.parse().unwrap()), endpoint_id: parts.get(3).map(ToString::to_string), diff --git a/endpoint_storage/src/claims.rs b/endpoint_storage/src/claims.rs new file mode 100644 index 0000000000..ef0f0eb0b4 --- /dev/null +++ b/endpoint_storage/src/claims.rs @@ -0,0 +1,52 @@ +use serde::{Deserialize, Serialize}; +use std::fmt::Display; +use utils::id::{EndpointId, TenantId, TimelineId}; + +/// Claims to add, remove, or retrieve endpoint data. Used by compute_ctl +#[derive(Deserialize, Serialize, PartialEq)] +pub struct EndpointStorageClaims { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub endpoint_id: EndpointId, + pub exp: u64, +} + +/// Claims to remove tenant, timeline, or endpoint data. 
Used by control plane +#[derive(Deserialize, Serialize, PartialEq)] +pub struct DeletePrefixClaims { + pub tenant_id: TenantId, + /// None when tenant is deleted (endpoint_id is also None in this case) + pub timeline_id: Option, + /// None when timeline is deleted + pub endpoint_id: Option, + pub exp: u64, +} + +impl Display for EndpointStorageClaims { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "EndpointClaims(tenant_id={} timeline_id={} endpoint_id={} exp={})", + self.tenant_id, self.timeline_id, self.endpoint_id, self.exp + ) + } +} + +impl Display for DeletePrefixClaims { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "DeletePrefixClaims(tenant_id={} timeline_id={} endpoint_id={}, exp={})", + self.tenant_id, + self.timeline_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()), + self.endpoint_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()), + self.exp + ) + } +} diff --git a/endpoint_storage/src/lib.rs b/endpoint_storage/src/lib.rs index eb6b80c487..d1625dc843 100644 --- a/endpoint_storage/src/lib.rs +++ b/endpoint_storage/src/lib.rs @@ -1,3 +1,5 @@ +pub mod claims; +use crate::claims::{DeletePrefixClaims, EndpointStorageClaims}; use anyhow::Result; use axum::extract::{FromRequestParts, Path}; use axum::response::{IntoResponse, Response}; @@ -13,7 +15,7 @@ use std::result::Result as StdResult; use std::sync::Arc; use tokio_util::sync::CancellationToken; use tracing::{debug, error}; -use utils::id::{TenantId, TimelineId}; +use utils::id::{EndpointId, TenantId, TimelineId}; // simplified version of utils::auth::JwtAuth pub struct JwtAuth { @@ -79,26 +81,6 @@ pub struct Storage { pub max_upload_file_limit: usize, } -pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rc - -#[derive(Deserialize, Serialize, PartialEq)] -pub struct Claims { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub endpoint_id: EndpointId, - pub exp: u64, -} - -impl Display for Claims { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})", - self.tenant_id, self.timeline_id, self.endpoint_id, self.exp - ) - } -} - #[derive(Deserialize, Serialize)] struct KeyRequest { tenant_id: TenantId, @@ -107,6 +89,13 @@ struct KeyRequest { path: String, } +#[derive(Deserialize, Serialize, PartialEq)] +struct PrefixKeyRequest { + tenant_id: TenantId, + timeline_id: Option, + endpoint_id: Option, +} + #[derive(Debug, PartialEq)] pub struct S3Path { pub path: RemotePath, @@ -165,7 +154,7 @@ impl FromRequestParts> for S3Path { .extract::>>() .await .map_err(|e| bad_request(e, "invalid token"))?; - let claims: Claims = state + let claims: EndpointStorageClaims = state .auth .decode(bearer.token()) .map_err(|e| bad_request(e, "decoding token"))?; @@ -178,7 +167,7 @@ impl FromRequestParts> for S3Path { path.endpoint_id.clone() }; - let route = Claims { + let route = EndpointStorageClaims { tenant_id: path.tenant_id, timeline_id: path.timeline_id, endpoint_id, @@ -193,38 +182,13 @@ impl FromRequestParts> for S3Path { } } -#[derive(Deserialize, Serialize, PartialEq)] -pub struct PrefixKeyPath { - pub tenant_id: TenantId, - pub timeline_id: Option, - pub endpoint_id: Option, -} - -impl Display for PrefixKeyPath { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})", - 
self.tenant_id, - self.timeline_id - .as_ref() - .map(ToString::to_string) - .unwrap_or("".to_string()), - self.endpoint_id - .as_ref() - .map(ToString::to_string) - .unwrap_or("".to_string()) - ) - } -} - #[derive(Debug, PartialEq)] pub struct PrefixS3Path { pub path: RemotePath, } -impl From<&PrefixKeyPath> for PrefixS3Path { - fn from(path: &PrefixKeyPath) -> Self { +impl From<&DeletePrefixClaims> for PrefixS3Path { + fn from(path: &DeletePrefixClaims) -> Self { let timeline_id = path .timeline_id .as_ref() @@ -250,21 +214,27 @@ impl FromRequestParts> for PrefixS3Path { state: &Arc, ) -> Result { let Path(path) = parts - .extract::>() + .extract::>() .await .map_err(|e| bad_request(e, "invalid route"))?; let TypedHeader(Authorization(bearer)) = parts .extract::>>() .await .map_err(|e| bad_request(e, "invalid token"))?; - let claims: PrefixKeyPath = state + let claims: DeletePrefixClaims = state .auth .decode(bearer.token()) .map_err(|e| bad_request(e, "invalid token"))?; - if path != claims { - return Err(unauthorized(path, claims)); + let route = DeletePrefixClaims { + tenant_id: path.tenant_id, + timeline_id: path.timeline_id, + endpoint_id: path.endpoint_id, + exp: claims.exp, + }; + if route != claims { + return Err(unauthorized(route, claims)); } - Ok((&path).into()) + Ok((&route).into()) } } @@ -297,7 +267,7 @@ mod tests { #[test] fn s3_path() { - let auth = Claims { + let auth = EndpointStorageClaims { tenant_id: TENANT_ID, timeline_id: TIMELINE_ID, endpoint_id: ENDPOINT_ID.into(), @@ -327,10 +297,11 @@ mod tests { #[test] fn prefix_s3_path() { - let mut path = PrefixKeyPath { + let mut path = DeletePrefixClaims { tenant_id: TENANT_ID, timeline_id: None, endpoint_id: None, + exp: 0, }; let prefix_path = |s: String| RemotePath::from_string(&s).unwrap(); assert_eq!( diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index b7d6b7ca34..24d371c6eb 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -46,6 +46,30 @@ pub struct ExtensionInstallResponse { pub version: ExtVersion, } +#[derive(Serialize, Default, Debug, Clone)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum LfcPrewarmState { + #[default] + NotPrewarmed, + Prewarming, + Completed, + Failed { + error: String, + }, +} + +#[derive(Serialize, Default, Debug, Clone)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum LfcOffloadState { + #[default] + NotOffloaded, + Offloading, + Completed, + Failed { + error: String, + }, +} + /// Response of the /status API #[derive(Serialize, Debug, Deserialize)] #[serde(rename_all = "snake_case")] diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index ad246c48ec..09b550b96c 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -172,6 +172,15 @@ pub struct ComputeSpec { /// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding. /// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514 pub logs_export_host: Option, + + /// Address of endpoint storage service + pub endpoint_storage_addr: Option, + /// JWT for authorizing requests to endpoint storage service + pub endpoint_storage_token: Option, + + /// If true, download LFC state from endpoint_storage and pass it to Postgres on startup + #[serde(default)] + pub prewarm_lfc_on_startup: bool, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. 
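For illustration only (the values below are hypothetical and not part of this patch), the three new `ComputeSpec` fields would appear in a compute spec roughly like this:

```json
{
  "endpoint_storage_addr": "127.0.0.1:9993",
  "endpoint_storage_token": "<per-endpoint JWT>",
  "prewarm_lfc_on_startup": true
}
```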
diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index 37de24be5b..30e788a601 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -84,6 +84,11 @@ "value": "on", "vartype": "bool" }, + { + "name": "prewarm_lfc_on_startup", + "value": "off", + "vartype": "bool" + }, { "name": "neon.safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501", diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 6016c23a01..68cb1f0209 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -295,6 +295,9 @@ pub struct TenantId(Id); id_newtype!(TenantId); +/// If needed, reuse small string from proxy/src/types.rc +pub type EndpointId = String; + // A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TenantTimelineId { diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index beed1dcd93..4b4b98aa6c 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -10,6 +10,7 @@ from requests.auth import AuthBase from typing_extensions import override from fixtures.log_helper import log +from fixtures.utils import wait_until if TYPE_CHECKING: from requests import PreparedRequest @@ -62,6 +63,35 @@ class EndpointHttpClient(requests.Session): res.raise_for_status() return res.json() + def prewarm_lfc_status(self) -> dict[str, str]: + res = self.get(f"http://localhost:{self.external_port}/lfc/prewarm") + res.raise_for_status() + json: dict[str, str] = res.json() + return json + + def prewarm_lfc(self): + self.post(f"http://localhost:{self.external_port}/lfc/prewarm").raise_for_status() + + def prewarmed(): + json = self.prewarm_lfc_status() + status, err = json["status"], json.get("error") + assert status == "completed", f"{status}, error {err}" + + wait_until(prewarmed) + + def offload_lfc(self): + url = f"http://localhost:{self.external_port}/lfc/offload" + self.post(url).raise_for_status() + + def offloaded(): + res = self.get(url) + res.raise_for_status() + json = res.json() + status, err = json["status"], json.get("error") + assert status == "completed", f"{status}, error {err}" + + wait_until(offloaded) + def database_schema(self, database: str): res = self.get( f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 133be5c045..d4a750ad3b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1185,7 +1185,9 @@ class NeonEnv: "broker": {}, "safekeepers": [], "pageservers": [], - "endpoint_storage": {"port": self.port_distributor.get_port()}, + "endpoint_storage": { + "listen_addr": f"127.0.0.1:{self.port_distributor.get_port()}", + }, "generate_local_ssl_certs": self.generate_local_ssl_certs, } diff --git a/test_runner/regress/test_endpoint_storage.py b/test_runner/regress/test_endpoint_storage.py index 04029114ec..1e27ef4b14 100644 --- a/test_runner/regress/test_endpoint_storage.py +++ b/test_runner/regress/test_endpoint_storage.py @@ -4,10 +4,12 @@ import pytest from aiohttp import ClientSession from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import run_only_on_default_postgres from jwcrypto import jwk, jwt @pytest.mark.asyncio +@run_only_on_default_postgres("test doesn't use 
postgres") async def test_endpoint_storage_insert_retrieve_delete(neon_simple_env: NeonEnv): """ Inserts, retrieves, and deletes test file using a JWT token @@ -35,7 +37,6 @@ async def test_endpoint_storage_insert_retrieve_delete(neon_simple_env: NeonEnv) key = f"http://{base_url}/{tenant_id}/{timeline_id}/{endpoint_id}/key" headers = {"Authorization": f"Bearer {token}"} log.info(f"cache key url {key}") - log.info(f"token {token}") async with ClientSession(headers=headers) as session: async with session.get(key) as res: diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py index dd0ae1921d..82e1e9fcba 100644 --- a/test_runner/regress/test_lfc_prewarm.py +++ b/test_runner/regress/test_lfc_prewarm.py @@ -1,11 +1,24 @@ import random import threading import time +from enum import Enum import pytest +from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.utils import USE_LFC +from prometheus_client.parser import text_string_to_metric_families as prom_parse_impl + + +class LfcQueryMethod(Enum): + COMPUTE_CTL = False + POSTGRES = True + + +PREWARM_LABEL = "compute_ctl_lfc_prewarm_requests_total" +OFFLOAD_LABEL = "compute_ctl_lfc_offload_requests_total" +QUERY_OPTIONS = LfcQueryMethod.POSTGRES, LfcQueryMethod.COMPUTE_CTL def check_pinned_entries(cur): @@ -19,11 +32,20 @@ def check_pinned_entries(cur): assert n_pinned == 0 +def prom_parse(client: EndpointHttpClient) -> dict[str, float]: + return { + sample.name: sample.value + for family in prom_parse_impl(client.metrics()) + for sample in family.samples + if sample.name in (PREWARM_LABEL, OFFLOAD_LABEL) + } + + @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") -def test_lfc_prewarm(neon_simple_env: NeonEnv): +@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"]) +def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod): env = neon_simple_env n_records = 1000000 - endpoint = env.endpoints.create_start( branch_name="main", config_lines=[ @@ -34,30 +56,57 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv): "neon.file_cache_prewarm_limit=1000", ], ) - conn = endpoint.connect() - cur = conn.cursor() - cur.execute("create extension neon version '1.6'") - cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") - cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") - cur.execute("select get_local_cache_state()") - lfc_state = cur.fetchall()[0][0] + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create extension neon version '1.6'") + pg_cur.execute("create database lfc") + + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + log.info(f"Inserting {n_records} rows") + lfc_cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") + lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + http_client = endpoint.http_client() + if query is LfcQueryMethod.COMPUTE_CTL: + status = http_client.prewarm_lfc_status() + assert status["status"] == "not_prewarmed" + assert "error" not in status + http_client.offload_lfc() + assert http_client.prewarm_lfc_status()["status"] == "not_prewarmed" + assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0} + else: + pg_cur.execute("select get_local_cache_state()") + lfc_state = pg_cur.fetchall()[0][0] 
endpoint.stop() endpoint.start() - conn = endpoint.connect() - cur = conn.cursor() - time.sleep(1) # wait until compute_ctl complete downgrade of extension to default version - cur.execute("alter extension neon update to '1.6'") - cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + # wait until compute_ctl completes downgrade of extension to default version + time.sleep(1) + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("alter extension neon update to '1.6'") - cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'") - lfc_used_pages = cur.fetchall()[0][0] + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.prewarm_lfc() + else: + pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + + pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'") + lfc_used_pages = pg_cur.fetchall()[0][0] log.info(f"Used LFC size: {lfc_used_pages}") - cur.execute("select * from get_prewarm_info()") - prewarm_info = cur.fetchall()[0] + pg_cur.execute("select * from get_prewarm_info()") + prewarm_info = pg_cur.fetchall()[0] log.info(f"Prewarm info: {prewarm_info}") - log.info(f"Prewarm progress: {(prewarm_info[1] + prewarm_info[2]) * 100 // prewarm_info[0]}%") + total, prewarmed, skipped, _ = prewarm_info + progress = (prewarmed + skipped) * 100 // total + log.info(f"Prewarm progress: {progress}%") assert lfc_used_pages > 10000 assert ( @@ -66,18 +115,23 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv): and prewarm_info[0] == prewarm_info[1] + prewarm_info[2] ) - cur.execute("select sum(pk) from t") - assert cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 + lfc_cur.execute("select sum(pk) from t") + assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 - check_pinned_entries(cur) + check_pinned_entries(pg_cur) + + desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped} + if query is LfcQueryMethod.COMPUTE_CTL: + assert http_client.prewarm_lfc_status() == desired + assert prom_parse(http_client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1} @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") -def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv): +@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"]) +def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMethod): env = neon_simple_env n_records = 10000 n_threads = 4 - endpoint = env.endpoints.create_start( branch_name="main", config_lines=[ @@ -87,40 +141,58 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv): "neon.file_cache_prewarm_limit=1000000", ], ) - conn = endpoint.connect() - cur = conn.cursor() - cur.execute("create extension neon version '1.6'") - cur.execute( + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create extension neon version '1.6'") + pg_cur.execute("CREATE DATABASE lfc") + + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + lfc_cur.execute( "create table accounts(id integer primary key, balance bigint default 0, payload text default repeat('?', 1000)) with (fillfactor=10)" ) - cur.execute(f"insert into accounts(id) values (generate_series(1,{n_records}))") - cur.execute("select get_local_cache_state()") - lfc_state = cur.fetchall()[0][0] + log.info(f"Inserting {n_records} rows") + lfc_cur.execute(f"insert into accounts(id) values 
(generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + http_client = endpoint.http_client() + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.offload_lfc() + else: + pg_cur.execute("select get_local_cache_state()") + lfc_state = pg_cur.fetchall()[0][0] running = True + n_prewarms = 0 def workload(): - conn = endpoint.connect() - cur = conn.cursor() + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() n_transfers = 0 while running: src = random.randint(1, n_records) dst = random.randint(1, n_records) - cur.execute("update accounts set balance=balance-100 where id=%s", (src,)) - cur.execute("update accounts set balance=balance+100 where id=%s", (dst,)) + lfc_cur.execute("update accounts set balance=balance-100 where id=%s", (src,)) + lfc_cur.execute("update accounts set balance=balance+100 where id=%s", (dst,)) n_transfers += 1 log.info(f"Number of transfers: {n_transfers}") def prewarm(): - conn = endpoint.connect() - cur = conn.cursor() - n_prewarms = 0 + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() while running: - cur.execute("alter system set neon.file_cache_size_limit='1MB'") - cur.execute("select pg_reload_conf()") - cur.execute("alter system set neon.file_cache_size_limit='1GB'") - cur.execute("select pg_reload_conf()") - cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + pg_cur.execute("alter system set neon.file_cache_size_limit='1MB'") + pg_cur.execute("select pg_reload_conf()") + pg_cur.execute("alter system set neon.file_cache_size_limit='1GB'") + pg_cur.execute("select pg_reload_conf()") + + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.prewarm_lfc() + else: + pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + + nonlocal n_prewarms n_prewarms += 1 log.info(f"Number of prewarms: {n_prewarms}") @@ -140,8 +212,10 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv): t.join() prewarm_thread.join() - cur.execute("select sum(balance) from accounts") - total_balance = cur.fetchall()[0][0] + lfc_cur.execute("select sum(balance) from accounts") + total_balance = lfc_cur.fetchall()[0][0] assert total_balance == 0 - check_pinned_entries(cur) + check_pinned_entries(pg_cur) + if query is LfcQueryMethod.COMPUTE_CTL: + assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: n_prewarms} From 0ef6851219c58d52263fd96cb87c18e72a928fd2 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 6 May 2025 17:19:15 -0500 Subject: [PATCH 59/76] Make the audience claim in compute JWTs a vector (#11845) According to RFC 7519, `aud` is generally an array of StringOrURI, but in special cases may be a single StringOrURI value. To accomodate future control plane work where a single token may work for multiple services, make the claim a vector. 
Link: https://www.rfc-editor.org/rfc/rfc7519#section-4.1.3 Signed-off-by: Tristan Partin --- compute_tools/src/http/middleware/authorize.rs | 2 +- control_plane/src/endpoint.rs | 4 ++-- libs/compute_api/src/requests.rs | 7 +++++-- test_runner/regress/test_compute_http.py | 4 ++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index 2afc57ad9c..a82f46e062 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -79,7 +79,7 @@ impl AsyncAuthorizeRequest for Authorize { )); }; - if audience != COMPUTE_AUDIENCE { + if !audience.iter().any(|a| a == COMPUTE_AUDIENCE) { return Err(JsonResponse::error( StatusCode::UNAUTHORIZED, "invalid audience in authorization token claims", diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index fe6a93eb5e..be73661a3c 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -635,8 +635,8 @@ impl Endpoint { pub fn generate_jwt(&self, scope: Option) -> Result { self.env.generate_auth_token(&ComputeClaims { audience: match scope { - Some(ComputeClaimsScope::Admin) => Some(COMPUTE_AUDIENCE.to_owned()), - _ => Some(self.endpoint_id.clone()), + Some(ComputeClaimsScope::Admin) => Some(vec![COMPUTE_AUDIENCE.to_owned()]), + _ => None, }, compute_id: match scope { Some(ComputeClaimsScope::Admin) => None, diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 40d34eccea..bbab271474 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -10,9 +10,9 @@ use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; /// The value to place in the [`ComputeClaims::audience`] claim. pub static COMPUTE_AUDIENCE: &str = "compute"; +/// Available scopes for a compute's JWT. #[derive(Copy, Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] #[serde(rename_all = "snake_case")] -/// Available scopes for a compute's JWT. pub enum ComputeClaimsScope { /// An admin-scoped token allows access to all of `compute_ctl`'s authorized /// facilities. @@ -48,8 +48,11 @@ pub struct ComputeClaims { /// /// See [RFC 7519](https://www.rfc-editor.org/rfc/rfc7519#section-4.1.3) for /// more information. + /// + /// TODO: Remove the [`Option`] wrapper when control plane learns to send + /// the claim. #[serde(rename = "aud")] - pub audience: Option, + pub audience: Option>, } /// Request of the /configure API diff --git a/test_runner/regress/test_compute_http.py b/test_runner/regress/test_compute_http.py index ce31ff0fe6..9846d44ce2 100644 --- a/test_runner/regress/test_compute_http.py +++ b/test_runner/regress/test_compute_http.py @@ -56,9 +56,9 @@ def test_compute_admin_scope_claim(neon_simple_env: NeonEnv, audience: str | Non endpoint = env.endpoints.create_start("main") - data = {"scope": str(ComputeClaimsScope.ADMIN)} + data: dict[str, str | list[str]] = {"scope": str(ComputeClaimsScope.ADMIN)} if audience: - data["aud"] = audience + data["aud"] = [audience] token = jwt.encode(data, env.auth_keys.priv, algorithm="EdDSA") From 608afc3055fe1acb32caba2f7b94e01db30f12f2 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 7 May 2025 17:21:17 +0800 Subject: [PATCH 60/76] fix(scrubber): log download error (#11833) ## Problem We use `head_object` to determine whether an object exists or not. However, it does not always error due to a missing object. 
## Summary of changes Log the error so that we can have a better idea what's going on with the scrubber errors in prod. --------- Signed-off-by: Alex Chi Z --- storage_scrubber/src/checks.rs | 5 +++-- storage_scrubber/src/pageserver_physical_gc.rs | 11 +++++++---- storage_scrubber/src/scan_pageserver_metadata.rs | 5 ++++- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index f0ba632fd4..b151b612bf 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -165,16 +165,17 @@ pub(crate) async fn branch_cleanup_and_check_errors( .head_object(&path, &CancellationToken::new()) .await; - if response.is_err() { + if let Err(e) = response { // Object is not present. let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta()); let msg = format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}", layer, metadata.generation.get_suffix(), metadata.shard, is_l0, + e, ); if is_l0 || ignore_error { diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index f14341c7bc..e1a4095a3c 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -137,11 +137,10 @@ struct TenantRefAccumulator { impl TenantRefAccumulator { fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) { let this_shard_idx = ttid.tenant_shard_id.to_index(); - (*self - .shards_seen + self.shards_seen .entry(ttid.tenant_shard_id.tenant_id) - .or_default()) - .insert(this_shard_idx); + .or_default() + .insert(this_shard_idx); let mut ancestor_refs = Vec::new(); for (layer_name, layer_metadata) in &index_part.layer_metadata { @@ -767,10 +766,13 @@ pub async fn pageserver_physical_gc( stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id).await?, ); Ok(try_stream! { + let mut cnt = 0; while let Some(ttid_res) = timelines.next().await { let ttid = ttid_res?; + cnt += 1; yield (ttid, tenant_manifest_arc.clone()); } + tracing::info!(%tenant_shard_id, "Found {} timelines", cnt); }) } }); @@ -790,6 +792,7 @@ pub async fn pageserver_physical_gc( &accumulator, tenant_manifest_arc, ) + .instrument(info_span!("gc_timeline", %ttid)) }); let timelines = timelines.try_buffered(CONCURRENCY); let mut timelines = std::pin::pin!(timelines); diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index ba75f25984..77c7987aa7 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -153,7 +153,10 @@ pub async fn scan_pageserver_metadata( const CONCURRENCY: usize = 32; // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t)); + let timelines = tenants.map_ok(|t| { + tracing::info!("Found tenant: {}", t); + stream_tenant_timelines(&remote_client, &target, t) + }); let timelines = timelines.try_buffered(CONCURRENCY); let timelines = timelines.try_flatten(); From 3cf5e1386c3071994281b4bc7a1579e4595689f6 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 7 May 2025 11:13:26 +0100 Subject: [PATCH 61/76] pageserver: fix rough edges of pageserver tracing (#11842) ## Problem There's a few rough edges around PS tracing. 
## Summary of changes * include compute request id in pageserver trace * use the get page specific context for GET_REL_SIZE and GET_BATCH * fix assertion in download layer trace ![image](https://github.com/user-attachments/assets/2ff6779c-7c2d-4102-8013-ada8203aa42f) --- pageserver/src/page_service.rs | 6 +- pageserver/src/pgdatadir_mapping.rs | 64 +++++++++++--------- pageserver/src/tenant/storage_layer/layer.rs | 35 +++++------ 3 files changed, 55 insertions(+), 50 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 0ce1a99681..bca1cb5b49 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1038,21 +1038,23 @@ impl PageServerHandler { tracing::info_span!( parent: &parent_span, "handle_get_page_request", + request_id = %req.hdr.reqid, rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, - not_modified_since_lsn = %req.hdr.not_modified_since + not_modified_since_lsn = %req.hdr.not_modified_since, ) }}; ($shard_id:expr) => {{ tracing::info_span!( parent: &parent_span, "handle_get_page_request", + request_id = %req.hdr.reqid, rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, not_modified_since_lsn = %req.hdr.not_modified_since, - shard_id = %$shard_id + shard_id = %$shard_id, ) }}; } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ccb48d8bc1..d770946580 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -40,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; -use crate::context::{PerfInstrumentFutureExt, RequestContext}; +use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder}; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::metrics::{ RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD, @@ -275,24 +275,30 @@ impl Timeline { continue; } - let nblocks = match self - .get_rel_size(*tag, Version::Lsn(lsn), &ctx) - .maybe_perf_instrument(&ctx, |crnt_perf_span| { - info_span!( - target: PERF_TRACE_TARGET, - parent: crnt_perf_span, - "GET_REL_SIZE", - reltag=%tag, - lsn=%lsn, - ) - }) - .await - { - Ok(nblocks) => nblocks, - Err(err) => { - result_slots[response_slot_idx].write(Err(err)); - slots_filled += 1; - continue; + let nblocks = { + let ctx = RequestContextBuilder::from(&ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_REL_SIZE", + reltag=%tag, + lsn=%lsn, + ) + }) + .attached_child(); + + match self + .get_rel_size(*tag, Version::Lsn(lsn), &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .await + { + Ok(nblocks) => nblocks, + Err(err) => { + result_slots[response_slot_idx].write(Err(err)); + slots_filled += 1; + continue; + } } }; @@ -308,6 +314,17 @@ impl Timeline { let key = rel_block_to_key(*tag, *blknum); + let ctx = RequestContextBuilder::from(&ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_BATCH", + batch_size = %page_count, + ) + }) + .attached_child(); + let key_slots = keys_slots.entry(key).or_default(); key_slots.push((response_slot_idx, ctx)); @@ -323,14 +340,7 @@ impl Timeline { let query = VersionedKeySpaceQuery::scattered(query); let res = self .get_vectored(query, io_concurrency, ctx) - .maybe_perf_instrument(ctx, |current_perf_span| { - 
info_span!( - target: PERF_TRACE_TARGET, - parent: current_perf_span, - "GET_BATCH", - batch_size = %page_count, - ) - }) + .maybe_perf_instrument(ctx, |current_perf_span| current_perf_span.clone()) .await; match res { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 50810cb154..3d55972017 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -23,7 +23,7 @@ use super::{ LayerVisibilityHint, PerfInstrumentFutureExt, PersistentLayerDesc, ValuesReconstructState, }; use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; +use crate::context::{RequestContext, RequestContextBuilder}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::task_mgr::TaskKind; use crate::tenant::Timeline; @@ -1076,24 +1076,17 @@ impl LayerInner { return Err(DownloadError::DownloadRequired); } - let ctx = if ctx.has_perf_span() { - let dl_ctx = RequestContextBuilder::from(ctx) - .task_kind(TaskKind::LayerDownload) - .download_behavior(DownloadBehavior::Download) - .root_perf_span(|| { - info_span!( - target: PERF_TRACE_TARGET, - "DOWNLOAD_LAYER", - layer = %self, - reason = %reason - ) - }) - .detached_child(); - ctx.perf_follows_from(&dl_ctx); - dl_ctx - } else { - ctx.attached_child() - }; + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "DOWNLOAD_LAYER", + layer = %self, + reason = %reason, + ) + }) + .attached_child(); async move { tracing::info!(%reason, "downloading on-demand"); @@ -1101,7 +1094,7 @@ impl LayerInner { let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); let res = self .download_init_and_wait(timeline, permit, ctx.attached_child()) - .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone()) .await?; scopeguard::ScopeGuard::into_inner(init_cancelled); @@ -1709,7 +1702,7 @@ impl DownloadError { } } -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Copy, Clone)] pub(crate) enum NeedsDownload { NotFound, NotFile(std::fs::FileType), From 0691b73f53580687f0a5aee8b1e3a3192faa7707 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 7 May 2025 14:14:24 +0200 Subject: [PATCH 62/76] fix(compute): Enforce cloud_admin role in compute_ctl connections (#11827) ## Problem Users can override some configuration parameters on the DB level with `ALTER DATABASE ... SET ...`. Some of these overrides, like `role` or `default_transaction_read_only`, affect `compute_ctl`'s ability to configure the DB schema properly. ## Summary of changes Enforce `role=cloud_admin`, `statement_timeout=0`, and move `default_transaction_read_only=off` override from control plane [1] to `compute_ctl`. Also, enforce `search_path=public` just in case, although we do not call any functions in user databases. 
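A minimal sketch of the counter-measure, assuming the `postgres` crate already used here (the connection string and the exact option list are placeholders): per-database overrides set with `ALTER DATABASE ... SET ...` are applied at session start, so the admin connection pins the settings it depends on via startup options, where the last repeated `-c` value wins.

```rust
// Sketch only: placeholder connection string; the option list mirrors the idea,
// not necessarily the final set of enforced settings.
use postgres::Config;

fn admin_config(connstr: &str) -> Result<Config, postgres::Error> {
    let mut cfg: Config = connstr.parse()?;
    // Sent to the server as the libpq `options` startup parameter; these
    // override any `ALTER DATABASE ... SET ...` defaults for this session.
    cfg.options("-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0");
    Ok(cfg)
}

fn main() -> Result<(), postgres::Error> {
    let _cfg = admin_config("host=localhost user=cloud_admin dbname=neondb")?;
    // `_cfg.connect(...)` would now start every session with the pinned
    // settings regardless of per-database overrides.
    Ok(())
}
```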
[1]: https://github.com/neondatabase/cloud/blob/133dd8c4dbbba40edfbad475bf6a45073ca63faf/goapp/controlplane/internal/pkg/compute/provisioner/provisioner_common.go#L70 Fixes https://github.com/neondatabase/cloud/issues/28532 --- compute_tools/src/compute.rs | 45 +++++++++++--- test_runner/regress/test_compute_catalog.py | 66 +++++++++++++++++++++ 2 files changed, 104 insertions(+), 7 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 08d915b331..0cda36a6e2 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -329,11 +329,39 @@ struct StartVmMonitorResult { impl ComputeNode { pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result { let connstr = params.connstr.as_str(); - let conn_conf = postgres::config::Config::from_str(connstr) + let mut conn_conf = postgres::config::Config::from_str(connstr) .context("cannot build postgres config from connstr")?; - let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) + let mut tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) .context("cannot build tokio postgres config from connstr")?; + // Users can set some configuration parameters per database with + // ALTER DATABASE ... SET ... + // + // There are at least these parameters: + // + // - role=some_other_role + // - default_transaction_read_only=on + // - statement_timeout=1, i.e., 1ms, which will cause most of the queries to fail + // - search_path=non_public_schema, this should be actually safe because + // we don't call any functions in user databases, but better to always reset + // it to public. + // + // that can affect `compute_ctl` and prevent it from properly configuring the database schema. + // Unset them via connection string options before connecting to the database. + // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. + // + // TODO(ololobus): we currently pass `-c default_transaction_read_only=off` from control plane + // as well. After rolling out this code, we can remove this parameter from control plane. + // In the meantime, double-passing is fine, the last value is applied. + // See: + const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + let options = match conn_conf.get_options() { + Some(options) => format!("{} {}", options, EXTRA_OPTIONS), + None => EXTRA_OPTIONS.to_string(), + }; + conn_conf.options(&options); + tokio_conn_conf.options(&options); + let mut new_state = ComputeState::new(); if let Some(spec) = config.spec { let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; @@ -1449,15 +1477,20 @@ impl ComputeNode { Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { - // Connect with zenith_admin if cloud_admin could not authenticate + // Connect with `zenith_admin` if `cloud_admin` could not authenticate info!( - "cannot connect to postgres: {}, retrying with `zenith_admin` username", + "cannot connect to Postgres: {}, retrying with 'zenith_admin' username", e ); let mut zenith_admin_conf = postgres::config::Config::from(conf.clone()); zenith_admin_conf.application_name("compute_ctl:apply_config"); zenith_admin_conf.user("zenith_admin"); + // It doesn't matter what were the options before, here we just want + // to connect and create a new superuser role. 
+ const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + zenith_admin_conf.options(ZENITH_OPTIONS); + let mut client = zenith_admin_conf.connect(NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; @@ -1623,9 +1656,7 @@ impl ComputeNode { self.pg_reload_conf()?; if spec.mode == ComputeMode::Primary { - let mut conf = - tokio_postgres::Config::from_str(self.params.connstr.as_str()).unwrap(); - conf.application_name("apply_config"); + let conf = self.get_tokio_conn_conf(Some("compute_ctl:reconfigure")); let conf = Arc::new(conf); let spec = Arc::new(spec.clone()); diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 37208c9fff..b66b326360 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -544,3 +544,69 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env ) role = cursor.fetchone() assert role is None + + +def test_db_with_custom_settings(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can work with databases that have some custom settings. + For example, role=some_other_role, default_transaction_read_only=on, + search_path=non_public_schema, statement_timeout=1 (1ms). + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + TEST_ROLE = "some_other_role" + TEST_DB = "db_with_custom_settings" + TEST_SCHEMA = "non_public_schema" + + endpoint.respec_deep( + **{ + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": [ + { + "name": TEST_DB, + "owner": TEST_ROLE, + } + ], + "roles": [ + { + "name": TEST_ROLE, + } + ], + }, + } + } + ) + + endpoint.reconfigure() + + with endpoint.cursor(dbname=TEST_DB) as cursor: + cursor.execute(f"CREATE SCHEMA {TEST_SCHEMA}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET role = {TEST_ROLE}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET default_transaction_read_only = on") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET search_path = {TEST_SCHEMA}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET statement_timeout = 1") + + with endpoint.cursor(dbname=TEST_DB) as cursor: + cursor.execute("SELECT current_role") + role = cursor.fetchone() + assert role is not None + assert role[0] == TEST_ROLE + + cursor.execute("SHOW default_transaction_read_only") + default_transaction_read_only = cursor.fetchone() + assert default_transaction_read_only is not None + assert default_transaction_read_only[0] == "on" + + cursor.execute("SHOW search_path") + search_path = cursor.fetchone() + assert search_path is not None + assert search_path[0] == TEST_SCHEMA + + # Do not check statement_timeout, because we force it to 2min + # in `endpoint.cursor()` fixture. + + endpoint.reconfigure() From 4d2e4b19c3dc8816668abc4204b110f1c9fd1b1e Mon Sep 17 00:00:00 2001 From: Shockingly Good Date: Wed, 7 May 2025 18:34:08 +0200 Subject: [PATCH 63/76] fix(compute) Correct the PGXN s3 gateway URL. (#11796) Corrects the postgres extension s3 gateway address to be not just a domain name but a full base URL. To make the code more readable, the option is renamed to "remote_ext_base_url", while keeping the old name also accessible by providing a clap argument alias. Also provides a very simple and, perhaps, even redundant unit test to confirm the logic behind parsing of the corresponding CLI argument. 
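The parsing rule boils down to the stand-alone sketch below (a simplified copy of the logic added in this patch, without the clap plumbing and error handling; the fallback URL shown is the one introduced here):

```rust
// Simplified copy of the new base-URL parsing rule, for illustration.
fn parse_remote_ext_base_url(arg: &str) -> String {
    const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str =
        "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local";
    if arg.starts_with("http") {
        // Already a full base URL: keep it as provided.
        arg.to_owned()
    } else {
        // Legacy bare host name: fall back to the fully qualified
        // in-cluster gateway base URL.
        FALLBACK_PG_EXT_GATEWAY_BASE_URL.to_owned()
    }
}

fn main() {
    assert_eq!(
        parse_remote_ext_base_url("http://pg-ext-s3-gateway2"),
        "http://pg-ext-s3-gateway2"
    );
    assert_eq!(
        parse_remote_ext_base_url("pg-ext-s3-gateway"),
        "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"
    );
}
```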
## Problem As it is clearly stated in https://github.com/neondatabase/cloud/issues/26005, using of the short version of the domain name might work for now, but in the future, we should get rid of using the `default` namespace and this is where it will, most likely, break down. ## Summary of changes The changes adjust the domain name of the extension s3 gateway to use the proper base url format instead of the just domain name assuming the "default" namespace and add a new CLI argument name for to reflect the change and the expectance. --- compute_tools/src/bin/compute_ctl.rs | 34 +++++++++++++++---- compute_tools/src/compute.rs | 10 +++--- compute_tools/src/extension_server.rs | 8 ++--- .../src/http/routes/extension_server.rs | 2 +- control_plane/src/bin/neon_local.rs | 9 ++--- control_plane/src/endpoint.rs | 6 ++-- test_runner/fixtures/neon_cli.py | 6 ++-- test_runner/fixtures/neon_fixtures.py | 12 +++---- .../regress/test_download_extensions.py | 4 +-- 9 files changed, 56 insertions(+), 35 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e337ee7b15..20b5e567a8 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -60,12 +60,16 @@ use utils::failpoint_support; // Compatibility hack: if the control plane specified any remote-ext-config // use the default value for extension storage proxy gateway. // Remove this once the control plane is updated to pass the gateway URL -fn parse_remote_ext_config(arg: &str) -> Result { - if arg.starts_with("http") { - Ok(arg.trim_end_matches('/').to_string()) +fn parse_remote_ext_base_url(arg: &str) -> Result { + const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str = + "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"; + + Ok(if arg.starts_with("http") { + arg } else { - Ok("http://pg-ext-s3-gateway".to_string()) + FALLBACK_PG_EXT_GATEWAY_BASE_URL } + .to_owned()) } #[derive(Parser)] @@ -74,8 +78,10 @@ struct Cli { #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] pub pgbin: String, - #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] - pub remote_ext_config: Option, + /// The base URL for the remote extension storage proxy gateway. + /// Should be in the form of `http(s)://[:]`. + #[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")] + pub remote_ext_base_url: Option, /// The port to bind the external listening HTTP server to. Clients running /// outside the compute will talk to the compute through this port. 
Keep @@ -164,7 +170,7 @@ fn main() -> Result<()> { pgversion: get_pg_version_string(&cli.pgbin), external_http_port: cli.external_http_port, internal_http_port: cli.internal_http_port, - ext_remote_storage: cli.remote_ext_config.clone(), + remote_ext_base_url: cli.remote_ext_base_url.clone(), resize_swap_on_bind: cli.resize_swap_on_bind, set_disk_quota_for_fs: cli.set_disk_quota_for_fs, #[cfg(target_os = "linux")] @@ -265,4 +271,18 @@ mod test { fn verify_cli() { Cli::command().debug_assert() } + + #[test] + fn parse_pg_ext_gateway_base_url() { + let arg = "http://pg-ext-s3-gateway2"; + let result = super::parse_remote_ext_base_url(arg).unwrap(); + assert_eq!(result, arg); + + let arg = "pg-ext-s3-gateway"; + let result = super::parse_remote_ext_base_url(arg).unwrap(); + assert_eq!( + result, + "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local" + ); + } } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0cda36a6e2..25920675c1 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -95,7 +95,7 @@ pub struct ComputeNodeParams { pub internal_http_port: u16, /// the address of extension storage proxy gateway - pub ext_remote_storage: Option, + pub remote_ext_base_url: Option, } /// Compute node info shared across several `compute_ctl` threads. @@ -1896,9 +1896,9 @@ LIMIT 100", real_ext_name: String, ext_path: RemotePath, ) -> Result { - let ext_remote_storage = + let remote_ext_base_url = self.params - .ext_remote_storage + .remote_ext_base_url .as_ref() .ok_or(DownloadError::BadInput(anyhow::anyhow!( "Remote extensions storage is not configured", @@ -1960,7 +1960,7 @@ LIMIT 100", let download_size = extension_server::download_extension( &real_ext_name, &ext_path, - ext_remote_storage, + remote_ext_base_url, &self.params.pgbin, ) .await @@ -2069,7 +2069,7 @@ LIMIT 100", &self, spec: &ComputeSpec, ) -> Result { - if self.params.ext_remote_storage.is_none() { + if self.params.remote_ext_base_url.is_none() { return Ok(RemoteExtensionMetrics { num_ext_downloaded: 0, largest_ext_size: 0, diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index ee889e0c40..3439383699 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -158,14 +158,14 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion { pub async fn download_extension( ext_name: &str, ext_path: &RemotePath, - ext_remote_storage: &str, + remote_ext_base_url: &str, pgbin: &str, ) -> Result { info!("Download extension {:?} from {:?}", ext_name, ext_path); // TODO add retry logic let download_buffer = - match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await { + match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await { Ok(buffer) => buffer, Err(error_message) => { return Err(anyhow::anyhow!( @@ -272,8 +272,8 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { // Do request to extension storage proxy, e.g., // curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst // using HTTP GET and return the response body as bytes. 
-async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { - let uri = format!("{}/{}", ext_remote_storage, ext_path); +async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result { + let uri = format!("{}/{}", remote_ext_base_url, ext_path); let filename = Path::new(ext_path) .file_name() .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index 6508de6eee..e141a48b7f 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -22,7 +22,7 @@ pub(in crate::http) async fn download_extension( State(compute): State>, ) -> Response { // Don't even try to download extensions if no remote storage is configured - if compute.params.ext_remote_storage.is_none() { + if compute.params.remote_ext_base_url.is_none() { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "remote storage is not configured", diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index fd625e9ed6..610fa5f865 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -644,9 +644,10 @@ struct EndpointStartCmdArgs { #[clap( long, - help = "Configure the remote extensions storage proxy gateway to request for extensions." + help = "Configure the remote extensions storage proxy gateway URL to request for extensions.", + alias = "remote-ext-config" )] - remote_ext_config: Option, + remote_ext_base_url: Option, #[clap( long, @@ -1414,7 +1415,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res EndpointCmd::Start(args) => { let endpoint_id = &args.endpoint_id; let pageserver_id = args.endpoint_pageserver_id; - let remote_ext_config = &args.remote_ext_config; + let remote_ext_base_url = &args.remote_ext_base_url; let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed @@ -1510,7 +1511,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res safekeepers_generation, safekeepers, pageservers, - remote_ext_config.as_ref(), + remote_ext_base_url.as_ref(), stripe_size.0 as usize, args.create_test_user, args.start_timeout, diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index be73661a3c..708745446d 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -655,7 +655,7 @@ impl Endpoint { safekeepers_generation: Option, safekeepers: Vec, pageservers: Vec<(Host, u16)>, - remote_ext_config: Option<&String>, + remote_ext_base_url: Option<&String>, shard_stripe_size: usize, create_test_user: bool, start_timeout: Duration, @@ -825,8 +825,8 @@ impl Endpoint { .stderr(logfile.try_clone()?) 
.stdout(logfile); - if let Some(remote_ext_config) = remote_ext_config { - cmd.args(["--remote-ext-config", remote_ext_config]); + if let Some(remote_ext_base_url) = remote_ext_base_url { + cmd.args(["--remote-ext-base-url", remote_ext_base_url]); } let child = cmd.spawn()?; diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 3be78719d7..4eaa4b7d99 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -557,7 +557,7 @@ class NeonLocalCli(AbstractNeonCli): endpoint_id: str, safekeepers_generation: int | None = None, safekeepers: list[int] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, create_test_user: bool = False, @@ -572,8 +572,8 @@ class NeonLocalCli(AbstractNeonCli): extra_env_vars = env or {} if basebackup_request_tries is not None: extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) - if remote_ext_config is not None: - args.extend(["--remote-ext-config", remote_ext_config]) + if remote_ext_base_url is not None: + args.extend(["--remote-ext-base-url", remote_ext_base_url]) if safekeepers_generation is not None: args.extend(["--safekeepers-generation", str(safekeepers_generation)]) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d4a750ad3b..85ad49bb4f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4226,7 +4226,7 @@ class Endpoint(PgProtocol, LogUtils): def start( self, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, safekeeper_generation: int | None = None, safekeepers: list[int] | None = None, @@ -4252,7 +4252,7 @@ class Endpoint(PgProtocol, LogUtils): self.endpoint_id, safekeepers_generation=safekeeper_generation, safekeepers=self.active_safekeepers, - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, allow_multiple=allow_multiple, create_test_user=create_test_user, @@ -4467,7 +4467,7 @@ class Endpoint(PgProtocol, LogUtils): hot_standby: bool = False, lsn: Lsn | None = None, config_lines: list[str] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, basebackup_request_tries: int | None = None, @@ -4486,7 +4486,7 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id=pageserver_id, allow_multiple=allow_multiple, ).start( - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, allow_multiple=allow_multiple, basebackup_request_tries=basebackup_request_tries, @@ -4570,7 +4570,7 @@ class EndpointFactory: lsn: Lsn | None = None, hot_standby: bool = False, config_lines: list[str] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, basebackup_request_tries: int | None = None, ) -> Endpoint: @@ -4590,7 +4590,7 @@ class EndpointFactory: hot_standby=hot_standby, config_lines=config_lines, lsn=lsn, - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, basebackup_request_tries=basebackup_request_tries, ) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index d28240c722..24ba0713d2 100644 --- 
a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -221,7 +221,7 @@ def test_remote_extensions( endpoint.create_remote_extension_spec(spec) - endpoint.start(remote_ext_config=extensions_endpoint) + endpoint.start(remote_ext_base_url=extensions_endpoint) with endpoint.connect() as conn: with conn.cursor() as cur: @@ -249,7 +249,7 @@ def test_remote_extensions( # Remove the extension files to force a redownload of the extension. extension.remove(test_output_dir, pg_version) - endpoint.start(remote_ext_config=extensions_endpoint) + endpoint.start(remote_ext_base_url=extensions_endpoint) # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions. with endpoint.connect() as conn: From 24d62c647fba00d1ac93f4118836ceeddf07b270 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Wed, 7 May 2025 21:00:41 +0400 Subject: [PATCH 64/76] storcon: add missing switch_timeline_membership method to sk client (#11850) ## Problem `switch_timeline_membership` is implemented on safekeeper's server side, but the is missing in the client. - Part of https://github.com/neondatabase/neon/issues/11823 ## Summary of changes - Add `switch_timeline_membership` method to `SafekeeperClient` --- safekeeper/client/src/mgmt_api.rs | 14 ++++++++++++++ storage_controller/src/safekeeper_client.rs | 17 +++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 5849df0343..b364ac8e48 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -121,6 +121,20 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn switch_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/membership", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.put(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_tenant(&self, tenant_id: TenantId) -> Result { let uri = format!("{}/v1/tenant/{}", self.mgmt_api_endpoint, tenant_id); let resp = self diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index 988159af4a..1f3ea96d96 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -98,6 +98,23 @@ impl SafekeeperClient { ) } + #[allow(unused)] + pub(crate) async fn switch_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + measured_request!( + "switch_timeline_membership", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .switch_timeline_membership(tenant_id, timeline_id, req) + .await + ) + } + pub(crate) async fn delete_tenant( &self, tenant_id: TenantId, From 7eb85c56acb5f87c730b879c9488e217448ee28b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 08:33:29 +0200 Subject: [PATCH 65/76] tokio-epoll-uring: avoid warn! 
noise due to `ECANCELED` during shutdowns (#11819) # Problem Before this PR, `test_pageserver_catchup_while_compute_down` would occasionally fail due to scary-looking WARN log line ``` WARN ephemeral_file_buffered_writer{...}:flush_attempt{attempt=1}: \ error flushing buffered writer buffer to disk, retrying after backoff err=Operation canceled (os error 125) ``` After lengthy investigation, the conclusion is that this is likely due to a kernel bug related due to io_uring async workers (io-wq) and signals. The main indicator is that the error only ever happens in correlation with pageserver shtudown when SIGTERM is received. There is a fix that is merged in 6.14 kernels (`io-wq: backoff when retrying worker creation`). However, even when I revert that patch, the issue is not reproducible on 6.14, so, it remains a speculation. It was ruled out that the ECANCELED is due to the executor thread exiting before the async worker starts processing the operation. # Solution The workaround in this issue is to retry the operation on ECANCELED once. Retries are safe because the low-level io_engine operations are idempotent. (We don't use O_APPEND and I can't think of another flag that would make the APIs covered by this patch not idempotent.) # Testing With this PR, the warn! log no longer happens on [my reproducer setup](https://github.com/neondatabase/neon/issues/11446#issuecomment-2843015111). And the new rate-limited `info!`-level log line informing about the internal retry shows up instead, as expected. # Refs - fixes https://github.com/neondatabase/neon/issues/11446 --- libs/utils/src/rate_limit.rs | 2 +- pageserver/src/virtual_file/io_engine.rs | 85 +++++++++++++++++++-- pageserver/src/virtual_file/open_options.rs | 18 +++-- 3 files changed, 91 insertions(+), 14 deletions(-) diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs index 945f710b1d..700cd5792b 100644 --- a/libs/utils/src/rate_limit.rs +++ b/libs/utils/src/rate_limit.rs @@ -17,7 +17,7 @@ impl std::fmt::Display for RateLimitStats { } impl RateLimit { - pub fn new(interval: Duration) -> Self { + pub const fn new(interval: Duration) -> Self { Self { last: None, interval, diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index dd04fb561a..d8eb803335 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -13,7 +13,7 @@ pub(super) mod tokio_epoll_uring_ext; use tokio_epoll_uring::IoBuf; -use tracing::Instrument; +use tracing::{Instrument, info}; pub(crate) use super::api::IoEngineKind; #[derive(Clone, Copy)] @@ -111,13 +111,16 @@ pub(crate) fn get() -> IoEngine { use std::os::unix::prelude::FileExt; use std::sync::atomic::{AtomicU8, Ordering}; +use std::time::Duration; use super::owned_buffers_io::io_buf_ext::FullSlice; use super::owned_buffers_io::slice::SliceMutExt; use super::{FileGuard, Metadata}; #[cfg(target_os = "linux")] -fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { +pub(super) fn epoll_uring_error_to_std( + e: tokio_epoll_uring::Error, +) -> std::io::Error { match e { tokio_epoll_uring::Error::Op(e) => e, tokio_epoll_uring::Error::System(system) => { @@ -149,7 +152,11 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.read(file_guard, offset, slice).await; + let (resources, res) = + retry_ecanceled_once((file_guard, slice), |(file_guard, slice)| async { + 
system.read(file_guard, offset, slice).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -164,7 +171,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.fsync(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.fsync(file_guard).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -182,7 +192,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.fdatasync(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.fdatasync(file_guard).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -201,7 +214,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.statx(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.statx(file_guard).await + }) + .await; ( resources, res.map_err(epoll_uring_error_to_std).map(Metadata::from), @@ -224,6 +240,7 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { // TODO: ftruncate op for tokio-epoll-uring + // Don't forget to use retry_ecanceled_once let res = file_guard.with_std_file(|std_file| std_file.set_len(len)); (file_guard, res) } @@ -245,8 +262,11 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let ((file_guard, slice), res) = - system.write(file_guard, offset, buf.into_raw_slice()).await; + let ((file_guard, slice), res) = retry_ecanceled_once( + (file_guard, buf.into_raw_slice()), + async |(file_guard, buf)| system.write(file_guard, offset, buf).await, + ) + .await; ( (file_guard, FullSlice::must_new(slice)), res.map_err(epoll_uring_error_to_std), @@ -282,6 +302,55 @@ impl IoEngine { } } +/// We observe in tests that stop pageserver with SIGTERM immediately after it was ingesting data, +/// occasionally buffered writers fail (and get retried by BufferedWriter) with ECANCELED. +/// The problem is believed to be a race condition in how io_uring handles punted async work (io-wq) and signals. +/// Investigation ticket: +/// +/// This function retries the operation once if it fails with ECANCELED. +/// ONLY USE FOR IDEMPOTENT [`super::VirtualFile`] operations. 
+pub(super) async fn retry_ecanceled_once( + resources: T, + f: F, +) -> (T, Result>) +where + F: Fn(T) -> Fut, + Fut: std::future::Future>)>, + T: Send, + V: Send, +{ + let (resources, res) = f(resources).await; + let Err(e) = res else { + return (resources, res); + }; + let tokio_epoll_uring::Error::Op(err) = e else { + return (resources, Err(e)); + }; + if err.raw_os_error() != Some(nix::libc::ECANCELED) { + return (resources, Err(tokio_epoll_uring::Error::Op(err))); + } + { + static RATE_LIMIT: std::sync::Mutex = + std::sync::Mutex::new(utils::rate_limit::RateLimit::new(Duration::from_secs(1))); + let mut guard = RATE_LIMIT.lock().unwrap(); + guard.call2(|rate_limit_stats| { + info!( + %rate_limit_stats, "ECANCELED observed, assuming it is due to a signal being received by the submitting thread, retrying after a delay; this message is rate-limited" + ); + }); + drop(guard); + } + tokio::time::sleep(Duration::from_millis(100)).await; // something big enough to beat even heavily overcommitted CI runners + let (resources, res) = f(resources).await; + (resources, res) +} + +pub(super) fn panic_operation_must_be_idempotent() { + panic!( + "unsupported; io_engine may retry operations internally and thus needs them to be idempotent (retry_ecanceled_once)" + ) +} + pub enum FeatureTestResult { PlatformPreferred(IoEngineKind), Worse { diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 2a7bb693f2..a40dfed4a4 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -110,18 +110,23 @@ impl OpenOptions { self } + /// Don't use, `O_APPEND` is not supported. + pub fn append(&mut self, _append: bool) { + super::io_engine::panic_operation_must_be_idempotent(); + } + pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result { match &self.inner { Inner::StdFs(x) => x.open(path).map(|file| file.into()), #[cfg(target_os = "linux")] Inner::TokioEpollUring(x) => { let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; - system.open(path, x).await.map_err(|e| match e { - tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } + let (_, res) = super::io_engine::retry_ecanceled_once((), |()| async { + let res = system.open(path, x).await; + ((), res) }) + .await; + res.map_err(super::io_engine::epoll_uring_error_to_std) } } } @@ -140,6 +145,9 @@ impl OpenOptions { } pub fn custom_flags(mut self, flags: i32) -> Self { + if flags & nix::libc::O_APPEND != 0 { + super::io_engine::panic_operation_must_be_idempotent(); + } match &mut self.inner { Inner::StdFs(x) => { let _ = x.custom_flags(flags); From 1d1502bc167a2d0372756650581b4666597120c8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 08:57:53 +0200 Subject: [PATCH 66/76] fix(pageserver): `flush task cancelled` errors during timeline shutdown (#11853) # Refs - fixes https://github.com/neondatabase/neon/issues/11762 # Problem PR #10993 introduced internal retries for BufferedWriter flushes. PR #11052 added cancellation sensitivity to that retry loop. That cancellation sensitivity is an error path that didn't exist before. The result is that during timeline shutdown, after we `Timeline::cancel`, compaction can now fail with error `flush task cancelled`. The problem with that: 1. We mis-classify this as an `error!`-worthy event. 2. 
This causes tests to become flaky because the error is not in global `allowed_errors`. Technically we also trip the `compaction_circuit_breaker` because the resulting `CompactionError` is variant `::Other`. But since this is Timeline shutdown, is doesn't matter practically speaking. # Solution / Changes - Log the anyhow stack trace when classifying a compaction error as `error!`. This was helpful to identify sources of `flush task cancelled` errors. We only log at `error!` level in exceptional circumstances, so, it's ok to have bit verbose logs. - Introduce typed errors along the `BufferedWriter::write_*`=> `BlobWriter::write_blob` => `{Delta,Image}LayerWriter::put_*` => `Split{Delta,Image}LayerWriter::put_{value,image}` chain. - Proper mapping to `CompactionError`/`CreateImageLayersError` via new `From` impls. I am usually opposed to any magic `From` impls, but, it's how most of the compaction code works today. # Testing The symptoms are most prevalent in `test_runner/regress/test_branch_and_gc.py::test_branch_and_gc`. Before this PR, I was able to reproduce locally 1 or 2 times per 400 runs using `DEFAULT_PG_VERSION=15 BUILD_TYPE=release poetry run pytest --count 400 -n 8`. After this PR, it doesn't reproduce anymore after 2000 runs. # Future Work Technically the ingest path is also exposed to this new source of errors because `InMemoryLayer` is backed by `BufferedWriter`. But we haven't seen it occur in flaky tests yet. Details and a fix in - https://github.com/neondatabase/neon/pull/11851 --- pageserver/src/tenant/blob_io.rs | 27 ++++++++++++++----- pageserver/src/tenant/storage_layer.rs | 1 + .../storage_layer/batch_split_writer.rs | 18 ++++++++----- .../src/tenant/storage_layer/delta_layer.rs | 25 +++++++++++------ pageserver/src/tenant/storage_layer/errors.rs | 24 +++++++++++++++++ .../src/tenant/storage_layer/image_layer.rs | 20 ++++++++++---- pageserver/src/tenant/tasks.rs | 2 +- pageserver/src/tenant/timeline.rs | 20 ++++++++++++++ pageserver/src/tenant/timeline/compaction.rs | 3 +-- .../owned_buffers_io/write/flush.rs | 13 +++++++++ 10 files changed, 124 insertions(+), 29 deletions(-) create mode 100644 pageserver/src/tenant/storage_layer/errors.rs diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 8cf3c548c9..ed541c4f12 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -94,10 +94,23 @@ impl Header { pub enum WriteBlobError { #[error(transparent)] Flush(FlushTaskError), - #[error("blob too large ({len} bytes)")] - BlobTooLarge { len: usize }, #[error(transparent)] - WriteBlobRaw(anyhow::Error), + Other(anyhow::Error), +} + +impl WriteBlobError { + pub fn is_cancel(&self) -> bool { + match self { + WriteBlobError::Flush(e) => e.is_cancel(), + WriteBlobError::Other(_) => false, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + WriteBlobError::Flush(e) => e.into_anyhow(), + WriteBlobError::Other(e) => e, + } + } } impl BlockCursor<'_> { @@ -327,7 +340,9 @@ where return ( ( io_buf.slice_len(), - Err(WriteBlobError::BlobTooLarge { len }), + Err(WriteBlobError::Other(anyhow::anyhow!( + "blob too large ({len} bytes)" + ))), ), srcbuf, ); @@ -391,7 +406,7 @@ where // Verify the header, to ensure we don't write invalid/corrupt data. 
let header = match Header::decode(&raw_with_header) .context("decoding blob header") - .map_err(WriteBlobError::WriteBlobRaw) + .map_err(WriteBlobError::Other) { Ok(header) => header, Err(err) => return (raw_with_header, Err(err)), @@ -401,7 +416,7 @@ where let raw_len = raw_with_header.len(); return ( raw_with_header, - Err(WriteBlobError::WriteBlobRaw(anyhow::anyhow!( + Err(WriteBlobError::Other(anyhow::anyhow!( "header length mismatch: {header_total_len} != {raw_len}" ))), ); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 796ad01e54..5dfa961b71 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -2,6 +2,7 @@ pub mod batch_split_writer; pub mod delta_layer; +pub mod errors; pub mod filter_iterator; pub mod image_layer; pub mod inmemory_layer; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 39cd02d101..51f2e909a2 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -10,6 +10,7 @@ use utils::id::TimelineId; use utils::lsn::Lsn; use utils::shard::TenantShardId; +use super::errors::PutError; use super::layer::S3_UPLOAD_LIMIT; use super::{ DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, @@ -235,7 +236,7 @@ impl<'a> SplitImageLayerWriter<'a> { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { // The current estimation is an upper bound of the space that the key/image could take // because we did not consider compression in this estimation. The resulting image layer // could be smaller than the target size. @@ -253,7 +254,8 @@ impl<'a> SplitImageLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(PutError::Other)?; let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); self.batches.add_unfinished_image_writer( prev_image_writer, @@ -346,7 +348,7 @@ impl<'a> SplitDeltaLayerWriter<'a> { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate // number, and therefore the final layer size could be a little bit larger or smaller than the target. // @@ -366,7 +368,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?, + .await + .map_err(PutError::Other)?, )); } let (_, inner) = self.inner.as_mut().unwrap(); @@ -386,7 +389,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(PutError::Other)?; let (start_key, prev_delta_writer) = self.inner.replace((key, next_delta_writer)).unwrap(); self.batches.add_unfinished_delta_writer( @@ -396,11 +400,11 @@ impl<'a> SplitDeltaLayerWriter<'a> { ); } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { // We have to produce a very large file b/c a key is updated too often. 
- anyhow::bail!( + return Err(PutError::Other(anyhow::anyhow!( "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced", key, inner.estimated_size() - ); + ))); } } self.last_key_written = key; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 11875ac653..2c1b27c8d5 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -55,6 +55,7 @@ use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use super::errors::PutError; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, @@ -477,12 +478,15 @@ impl DeltaLayerWriterInner { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { let (_, res) = self .put_value_bytes( key, lsn, - Value::ser(&val)?.slice_len(), + Value::ser(&val) + .map_err(anyhow::Error::new) + .map_err(PutError::Other)? + .slice_len(), val.will_init(), ctx, ) @@ -497,7 +501,7 @@ impl DeltaLayerWriterInner { val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (FullSlice, anyhow::Result<()>) + ) -> (FullSlice, Result<(), PutError>) where Buf: IoBuf + Send, { @@ -513,19 +517,24 @@ impl DeltaLayerWriterInner { .blob_writer .write_blob_maybe_compressed(val, ctx, compression) .await; + let res = res.map_err(PutError::WriteBlob); let off = match res { Ok((off, _)) => off, - Err(e) => return (val, Err(anyhow::anyhow!(e))), + Err(e) => return (val, Err(e)), }; let blob_ref = BlobRef::new(off, will_init); let delta_key = DeltaKey::from_key_lsn(&key, lsn); - let res = self.tree.append(&delta_key.0, blob_ref.0); + let res = self + .tree + .append(&delta_key.0, blob_ref.0) + .map_err(anyhow::Error::new) + .map_err(PutError::Other); self.num_keys += 1; - (val, res.map_err(|e| anyhow::anyhow!(e))) + (val, res) } fn size(&self) -> u64 { @@ -694,7 +703,7 @@ impl DeltaLayerWriter { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { self.inner .as_mut() .unwrap() @@ -709,7 +718,7 @@ impl DeltaLayerWriter { val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (FullSlice, anyhow::Result<()>) + ) -> (FullSlice, Result<(), PutError>) where Buf: IoBuf + Send, { diff --git a/pageserver/src/tenant/storage_layer/errors.rs b/pageserver/src/tenant/storage_layer/errors.rs new file mode 100644 index 0000000000..591e489faa --- /dev/null +++ b/pageserver/src/tenant/storage_layer/errors.rs @@ -0,0 +1,24 @@ +use crate::tenant::blob_io::WriteBlobError; + +#[derive(Debug, thiserror::Error)] +pub enum PutError { + #[error(transparent)] + WriteBlob(WriteBlobError), + #[error(transparent)] + Other(anyhow::Error), +} + +impl PutError { + pub fn is_cancel(&self) -> bool { + match self { + PutError::WriteBlob(e) => e.is_cancel(), + PutError::Other(_) => false, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + PutError::WriteBlob(e) => e.into_anyhow(), + PutError::Other(e) => e, + } + } +} diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index d684230572..740f53f928 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -53,6 +53,7 @@ use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use super::errors::PutError; 
use super::layer_name::ImageLayerName; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, @@ -842,8 +843,14 @@ impl ImageLayerWriterInner { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { - ensure!(self.key_range.contains(&key)); + ) -> Result<(), PutError> { + if !self.key_range.contains(&key) { + return Err(PutError::Other(anyhow::anyhow!( + "key {:?} not in range {:?}", + key, + self.key_range + ))); + } let compression = self.conf.image_compression; let uncompressed_len = img.len() as u64; self.uncompressed_bytes += uncompressed_len; @@ -853,7 +860,7 @@ impl ImageLayerWriterInner { .write_blob_maybe_compressed(img.slice_len(), ctx, compression) .await; // TODO: re-use the buffer for `img` further upstack - let (off, compression_info) = res?; + let (off, compression_info) = res.map_err(PutError::WriteBlob)?; if compression_info.compressed_size.is_some() { // The image has been considered for compression at least self.uncompressed_bytes_eligible += uncompressed_len; @@ -865,7 +872,10 @@ impl ImageLayerWriterInner { let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); - self.tree.append(&keybuf, off)?; + self.tree + .append(&keybuf, off) + .map_err(anyhow::Error::new) + .map_err(PutError::Other)?; #[cfg(feature = "testing")] { @@ -1085,7 +1095,7 @@ impl ImageLayerWriter { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { self.inner.as_mut().unwrap().put_image(key, img, ctx).await } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 1112a5330b..4709a6d616 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -340,7 +340,7 @@ pub(crate) fn log_compaction_error( } else { match level { Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"), - Level::ERROR => error!("Compaction failed: {err:#}"), + Level::ERROR => error!("Compaction failed: {err:?}"), Level::INFO => info!("Compaction failed: {err:#}"), level => unimplemented!("unexpected level {level:?}"), } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index cfeab77598..c8d897d074 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -987,6 +987,16 @@ impl From for CreateImageLayersError { } } +impl From for CreateImageLayersError { + fn from(e: super::storage_layer::errors::PutError) -> Self { + if e.is_cancel() { + CreateImageLayersError::Cancelled + } else { + CreateImageLayersError::Other(e.into_anyhow()) + } + } +} + impl From for CreateImageLayersError { fn from(e: GetVectoredError) -> Self { match e { @@ -5923,6 +5933,16 @@ impl From for CompactionError { } } +impl From for CompactionError { + fn from(e: super::storage_layer::errors::PutError) -> Self { + if e.is_cancel() { + CompactionError::ShuttingDown + } else { + CompactionError::Other(e.into_anyhow()) + } + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d0c13d86ce..07cd274a41 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2204,8 +2204,7 @@ impl Timeline { .as_mut() .unwrap() .put_value(key, lsn, value, ctx) - .await - .map_err(CompactionError::Other)?; + .await?; } else { let owner = 
self.shard_identity.get_shard_number(&key); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index b41a9f6cd2..ac9867e8b4 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -247,6 +247,19 @@ pub enum FlushTaskError { Cancelled, } +impl FlushTaskError { + pub fn is_cancel(&self) -> bool { + match self { + FlushTaskError::Cancelled => true, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + FlushTaskError::Cancelled => anyhow::anyhow!(self), + } + } +} + impl FlushBackgroundTask where Buf: IoBufAligned + Send + Sync, From 40f32ea326ac9f8b691f179d0ced414470eb06ff Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 8 May 2025 10:19:14 +0100 Subject: [PATCH 67/76] pageserver: refactor import flow and add job concurrency limiting (#11816) ## Problem Import code is one big block. Separating planning and execution will help with reporting progress of import to storcon (building block for resuming import). ## Summary of changes Split up the import into planning and execution. A concurrency limit driven by PS config is also added. --- libs/pageserver_api/src/config.rs | 11 + pageserver/src/config.rs | 4 + .../src/tenant/timeline/import_pgdata.rs | 9 +- .../src/tenant/timeline/import_pgdata/flow.rs | 195 ++++++++++-------- 4 files changed, 129 insertions(+), 90 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index b64c42a808..5b0c13dd89 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -182,6 +182,7 @@ pub struct ConfigToml { pub tracing: Option, pub enable_tls_page_service_api: bool, pub dev_mode: bool, + pub timeline_import_config: TimelineImportConfig, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -300,6 +301,12 @@ impl From for tracing_utils::Protocol { } } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct TimelineImportConfig { + pub import_job_concurrency: NonZeroUsize, + pub import_job_soft_size_limit: NonZeroUsize, +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -659,6 +666,10 @@ impl Default for ConfigToml { tracing: None, enable_tls_page_service_api: false, dev_mode: false, + timeline_import_config: TimelineImportConfig { + import_job_concurrency: NonZeroUsize::new(128).unwrap(), + import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(), + }, } } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ded2805602..7e773f56b3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -230,6 +230,8 @@ pub struct PageServerConf { /// such as authentication requirements for HTTP and PostgreSQL APIs. /// This is insecure and should only be used in development environments. 
pub dev_mode: bool, + + pub timeline_import_config: pageserver_api::config::TimelineImportConfig, } /// Token for authentication to safekeepers @@ -404,6 +406,7 @@ impl PageServerConf { tracing, enable_tls_page_service_api, dev_mode, + timeline_import_config, } = config_toml; let mut conf = PageServerConf { @@ -457,6 +460,7 @@ impl PageServerConf { tracing, enable_tls_page_service_api, dev_mode, + timeline_import_config, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 6ab6b90cb6..c4a8df39a3 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -149,14 +149,7 @@ pub async fn doit( } .await?; - flow::run( - timeline.clone(), - base_lsn, - control_file, - storage.clone(), - ctx, - ) - .await?; + flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?; // // Communicate that shard is done. diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index c6d2944769..34c073365d 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -34,7 +34,9 @@ use std::sync::Arc; use anyhow::{bail, ensure}; use bytes::Bytes; +use futures::stream::FuturesOrdered; use itertools::Itertools; +use pageserver_api::config::TimelineImportConfig; use pageserver_api::key::{ CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, Key, TWOPHASEDIR_KEY, rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, @@ -46,8 +48,9 @@ use pageserver_api::shard::ShardIdentity; use postgres_ffi::relfile_utils::parse_relfilename; use postgres_ffi::{BLCKSZ, pg_constants}; use remote_storage::RemotePath; -use tokio::task::JoinSet; -use tracing::{Instrument, debug, info_span, instrument}; +use tokio::sync::Semaphore; +use tokio_stream::StreamExt; +use tracing::{debug, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; @@ -63,37 +66,39 @@ use crate::tenant::storage_layer::{ImageLayerWriter, Layer}; pub async fn run( timeline: Arc, - pgdata_lsn: Lsn, control_file: ControlFile, storage: RemoteStorageWrapper, ctx: &RequestContext, ) -> anyhow::Result<()> { - Flow { - timeline, - pgdata_lsn, + let planner = Planner { control_file, - tasks: Vec::new(), - storage, - } - .run(ctx) - .await + storage: storage.clone(), + shard: timeline.shard_identity, + tasks: Vec::default(), + }; + + let import_config = &timeline.conf.timeline_import_config; + let plan = planner.plan(import_config).await?; + plan.execute(timeline, import_config, ctx).await } -struct Flow { - timeline: Arc, - pgdata_lsn: Lsn, +struct Planner { control_file: ControlFile, - tasks: Vec, storage: RemoteStorageWrapper, + shard: ShardIdentity, + tasks: Vec, } -impl Flow { - /// Perform the ingestion into [`Self::timeline`]. - /// Assumes the timeline is empty (= no layers). - pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); +struct Plan { + jobs: Vec, +} - self.pgdata_lsn = pgdata_lsn; +impl Planner { + /// Creates an import plan + /// + /// This function is and must remain pure: given the same input, it will generate the same import plan. 
+ async fn plan(mut self, import_config: &TimelineImportConfig) -> anyhow::Result { + let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); let datadir = PgDataDir::new(&self.storage).await?; @@ -115,7 +120,7 @@ impl Flow { } // Import SLRUs - if self.timeline.tenant_shard_id.is_shard_zero() { + if self.shard.is_shard_zero() { // pg_xact (01:00 keyspace) self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) .await?; @@ -166,14 +171,16 @@ impl Flow { let mut last_end_key = Key::MIN; let mut current_chunk = Vec::new(); let mut current_chunk_size: usize = 0; - let mut parallel_jobs = Vec::new(); + let mut jobs = Vec::new(); for task in std::mem::take(&mut self.tasks).into_iter() { - if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 { + if current_chunk_size + task.total_size() + > import_config.import_job_soft_size_limit.into() + { let key_range = last_end_key..task.key_range().start; - parallel_jobs.push(ChunkProcessingJob::new( + jobs.push(ChunkProcessingJob::new( key_range.clone(), std::mem::take(&mut current_chunk), - &self, + pgdata_lsn, )); last_end_key = key_range.end; current_chunk_size = 0; @@ -181,45 +188,13 @@ impl Flow { current_chunk_size += task.total_size(); current_chunk.push(task); } - parallel_jobs.push(ChunkProcessingJob::new( + jobs.push(ChunkProcessingJob::new( last_end_key..Key::MAX, current_chunk, - &self, + pgdata_lsn, )); - // Start all jobs simultaneosly - let mut work = JoinSet::new(); - // TODO: semaphore? - for job in parallel_jobs { - let ctx: RequestContext = - ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); - work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job"))); - } - let mut results = Vec::new(); - while let Some(result) = work.join_next().await { - match result { - Ok(res) => { - results.push(res); - } - Err(_joinset_err) => { - results.push(Err(anyhow::anyhow!( - "parallel job panicked or cancelled, check pageserver logs" - ))); - } - } - } - - if results.iter().all(|r| r.is_ok()) { - Ok(()) - } else { - let mut msg = String::new(); - for result in results { - if let Err(err) = result { - msg.push_str(&format!("{err:?}\n\n")); - } - } - bail!("Some parallel jobs failed:\n\n{msg}"); - } + Ok(Plan { jobs }) } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))] @@ -266,7 +241,7 @@ impl Flow { let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32); self.tasks .push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new( - *self.timeline.get_shard_identity(), + self.shard, start_key..end_key, &file.path, self.storage.clone(), @@ -289,7 +264,7 @@ impl Flow { } async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { - assert!(self.timeline.tenant_shard_id.is_shard_zero()); + assert!(self.shard.is_shard_zero()); let segments = self.storage.listfilesindir(path).await?; let segments: Vec<(String, u32, usize)> = segments @@ -344,6 +319,68 @@ impl Flow { } } +impl Plan { + async fn execute( + self, + timeline: Arc, + import_config: &TimelineImportConfig, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let mut work = FuturesOrdered::new(); + let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into())); + + let jobs_in_plan = self.jobs.len(); + + let mut jobs = self.jobs.into_iter().enumerate().peekable(); + let mut results = Vec::new(); + + // Run import jobs concurrently up to the limit specified by the 
pageserver configuration. + // Note that we process completed futures in the oreder of insertion. This will be the + // building block for resuming imports across pageserver restarts or tenant migrations. + while results.len() < jobs_in_plan { + tokio::select! { + permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => { + let permit = permit.expect("never closed"); + let (job_idx, job) = jobs.next().expect("we peeked"); + let job_timeline = timeline.clone(); + let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); + + work.push_back(tokio::task::spawn(async move { + let _permit = permit; + let res = job.run(job_timeline, &ctx).await; + (job_idx, res) + })); + }, + maybe_complete_job_idx = work.next() => { + match maybe_complete_job_idx { + Some(Ok((_job_idx, res))) => { + results.push(res); + }, + Some(Err(_)) => { + results.push(Err(anyhow::anyhow!( + "parallel job panicked or cancelled, check pageserver logs" + ))); + } + None => {} + } + } + } + } + + if results.iter().all(|r| r.is_ok()) { + Ok(()) + } else { + let mut msg = String::new(); + for result in results { + if let Err(err) = result { + msg.push_str(&format!("{err:?}\n\n")); + } + } + bail!("Some parallel jobs failed:\n\n{msg}"); + } + } +} + // // dbdir iteration tools // @@ -713,7 +750,6 @@ impl From for AnyImportTask { } struct ChunkProcessingJob { - timeline: Arc, range: Range, tasks: Vec, @@ -721,25 +757,24 @@ struct ChunkProcessingJob { } impl ChunkProcessingJob { - fn new(range: Range, tasks: Vec, env: &Flow) -> Self { - assert!(env.pgdata_lsn.is_valid()); + fn new(range: Range, tasks: Vec, pgdata_lsn: Lsn) -> Self { + assert!(pgdata_lsn.is_valid()); Self { - timeline: env.timeline.clone(), range, tasks, - pgdata_lsn: env.pgdata_lsn, + pgdata_lsn, } } - async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> { + async fn run(self, timeline: Arc, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = ImageLayerWriter::new( - self.timeline.conf, - self.timeline.timeline_id, - self.timeline.tenant_shard_id, + timeline.conf, + timeline.timeline_id, + timeline.tenant_shard_id, &self.range, self.pgdata_lsn, - &self.timeline.gate, - self.timeline.cancel.clone(), + &timeline.gate, + timeline.cancel.clone(), ctx, ) .await?; @@ -751,24 +786,20 @@ impl ChunkProcessingJob { let resident_layer = if nimages > 0 { let (desc, path) = writer.finish(ctx).await?; - Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)? + Layer::finish_creating(timeline.conf, &timeline, desc, &path)? } else { // dropping the writer cleans up return Ok(()); }; // this is sharing the same code as create_image_layers - let mut guard = self.timeline.layers.write().await; + let mut guard = timeline.layers.write().await; guard .open_mut()? - .track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics); + .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics); crate::tenant::timeline::drop_wlock(guard); - // Schedule the layer for upload but don't add barriers such as - // wait for completion or index upload, so we don't inhibit upload parallelism. - // TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?) - // TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level. 
- self.timeline + timeline .remote_client .schedule_layer_file_upload(resident_layer)?; From 7e55497e131f2f26a16ae22bff80cac11951cdd4 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 8 May 2025 14:00:45 +0400 Subject: [PATCH 68/76] tests: flush wal before waiting for last record lsn (#11726) ## Problem Compute may flush WAL on page boundaries, leaving some records partially flushed for a long time. It leads to `wait_for_last_flush_lsn` stuck waiting for this partial LSN. - Closes: https://github.com/neondatabase/cloud/issues/27876 ## Summary of changes - Flush WAL via CHECKPOINT after requesting current_wal_lsn to make sure that the record we point to is flushed in full - Use proper endpoint in `test_timeline_detach_with_aux_files_with_detach_v1` --- test_runner/fixtures/neon_fixtures.py | 7 +++++++ test_runner/regress/test_timeline_detach_ancestor.py | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 85ad49bb4f..370eca5130 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -5477,6 +5477,13 @@ def wait_for_last_flush_lsn( if last_flush_lsn is None: last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + # The last_flush_lsn may not correspond to a record boundary. + # For example, if the compute flushed WAL on a page boundary, + # the remaining part of the record might not be flushed for a long time. + # This would prevent the pageserver from reaching last_flush_lsn promptly. + # To ensure the rest of the record reaches the pageserver quickly, + # we forcibly flush the WAL by using CHECKPOINT. + endpoint.safe_psql("CHECKPOINT") results = [] for tenant_shard_id, pageserver in shards: diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index a71652af8a..d42c5d403e 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1822,7 +1822,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( endpoint2.safe_psql( "SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')" ) - lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + lsn3 = wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set( ["pg_replslot/test_slot_restore/state"] @@ -1839,7 +1839,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( assert all_reparented == set([]) # We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN. 
- wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set( ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"] ), "main branch unaffected" From 6c70789cfdf145ae4ca73228884ca1359b80c302 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 8 May 2025 12:14:41 +0200 Subject: [PATCH 69/76] storcon: increase drain+fill secondary warmup timeout from 20 to 30 seconds (#11848) ## Problem During deployment drains/fills, we often see the storage controller giving up on warmups after 20 seconds, when the warmup is nearly complete (~90%). This can cause latency spikes for migrated tenants if they block on layer downloads. Touches https://github.com/neondatabase/cloud/issues/26193. ## Summary of changes Increase the drain and fill secondary warmup timeout from 20 to 30 seconds. --- storage_controller/src/service.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 21c693af97..fdb791c2cf 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -8485,7 +8485,7 @@ impl Service { // By default, live migrations are generous about the wait time for getting // the secondary location up to speed. When draining, give up earlier in order // to not stall the operation when a cold secondary is encountered. - const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) @@ -8818,7 +8818,7 @@ impl Service { node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) From d22377c754556c95d24970458cb08968828902b3 Mon Sep 17 00:00:00 2001 From: Mark Novikov Date: Thu, 8 May 2025 15:04:28 +0400 Subject: [PATCH 70/76] Skip event triggers in dump-restore (#11794) ## Problem Data import fails if the src db has any event triggers, because those can only be restored by a superuser. Specifically imports from Heroku and Supabase are guaranteed to fail. 
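As a rough illustration of the resulting dump invocation (a minimal sketch only: the binary path, connection string, and output directory below are made up, and `--no-event-triggers` is only understood by the patched `pg_dump` builds referenced under Summary of changes below, not by stock PostgreSQL):

```rust
use std::process::Command;

fn main() -> std::io::Result<()> {
    // Hypothetical invocation of a patched pg_dump: dump everything except
    // event triggers, which a non-superuser restore target cannot recreate.
    let status = Command::new("/usr/local/neon/bin/pg_dump") // illustrative path
        .args([
            "--no-event-triggers", // flag added by the vendored pg_dump patches
            "--format",
            "directory",
            "--file",
            "/tmp/dump-dir",
            "postgresql://source-host/source-db", // illustrative source connstring
        ])
        .status()?;
    assert!(status.success(), "pg_dump failed");
    Ok(())
}
```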
Closes https://github.com/neondatabase/cloud/issues/27353 ## Summary of changes Depends on `pg_dump` patches per each supported PostgreSQL version: - https://github.com/neondatabase/postgres/pull/630 - https://github.com/neondatabase/postgres/pull/629 - https://github.com/neondatabase/postgres/pull/627 - https://github.com/neondatabase/postgres/pull/628 --- compute_tools/src/bin/fast_import.rs | 1 + test_runner/regress/test_import_pgdata.py | 49 +++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 4 +- 5 files changed, 54 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 537028cde1..78acd78585 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -348,6 +348,7 @@ async fn run_dump_restore( "--no-security-labels".to_string(), "--no-subscriptions".to_string(), "--no-tablespaces".to_string(), + "--no-event-triggers".to_string(), // format "--format".to_string(), "directory".to_string(), diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index a26c3994a5..2fda1991f7 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -641,6 +641,55 @@ def test_fast_import_binary( assert res[0][0] == 10 +def test_fast_import_event_triggers( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, +): + vanilla_pg.start() + vanilla_pg.safe_psql(""" + CREATE FUNCTION test_event_trigger_for_drops() + RETURNS event_trigger LANGUAGE plpgsql AS $$ + DECLARE + obj record; + BEGIN + FOR obj IN SELECT * FROM pg_event_trigger_dropped_objects() + LOOP + RAISE NOTICE '% dropped object: % %.% %', + tg_tag, + obj.object_type, + obj.schema_name, + obj.object_name, + obj.object_identity; + END LOOP; + END + $$; + + CREATE EVENT TRIGGER test_event_trigger_for_drops + ON sql_drop + EXECUTE PROCEDURE test_event_trigger_for_drops(); + """) + + pg_port = port_distributor.get_port() + p = fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr()) + assert p.returncode == 0 + + vanilla_pg.stop() + + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + res = conn.safe_psql("SELECT count(*) FROM pg_event_trigger;") + log.info(f"Result: {res}") + assert res[0][0] == 0, f"Neon does not support importing event triggers, got: {res[0][0]}" + + def test_fast_import_restore_to_connstring( test_output_dir, vanilla_pg: VanillaPostgres, diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c8dab02bfc..108856a4ae 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c8dab02bfc003ae7bd59096919042d7840f3c194 +Subproject commit 108856a4ae76be285b04497a0ed08fcbe60ddbe9 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index eab3a37834..b763ab54b9 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit eab3a37834cac6ec0719bf817ac918a201712d66 +Subproject commit b763ab54b98d232a0959371ab1d07f06ed77c49e diff --git a/vendor/revisions.json b/vendor/revisions.json 
index 74a6ff33d7..4307fd1c3f 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.4", - "eab3a37834cac6ec0719bf817ac918a201712d66" + "b763ab54b98d232a0959371ab1d07f06ed77c49e" ], "v16": [ "16.8", @@ -13,6 +13,6 @@ ], "v14": [ "14.17", - "c8dab02bfc003ae7bd59096919042d7840f3c194" + "108856a4ae76be285b04497a0ed08fcbe60ddbe9" ] } From 42d93031a13b31cee2fbb8c2e7f1b094b0f554a2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 13:48:29 +0200 Subject: [PATCH 71/76] fixup(#11819): broken macOS build (#11861) refs - fixes https://github.com/neondatabase/neon/issues/11860 --- pageserver/src/virtual_file/io_engine.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index d8eb803335..7827682498 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -13,7 +13,7 @@ pub(super) mod tokio_epoll_uring_ext; use tokio_epoll_uring::IoBuf; -use tracing::{Instrument, info}; +use tracing::Instrument; pub(crate) use super::api::IoEngineKind; #[derive(Clone, Copy)] @@ -111,7 +111,8 @@ pub(crate) fn get() -> IoEngine { use std::os::unix::prelude::FileExt; use std::sync::atomic::{AtomicU8, Ordering}; -use std::time::Duration; +#[cfg(target_os = "linux")] +use {std::time::Duration, tracing::info}; use super::owned_buffers_io::io_buf_ext::FullSlice; use super::owned_buffers_io::slice::SliceMutExt; @@ -309,6 +310,7 @@ impl IoEngine { /// /// This function retries the operation once if it fails with ECANCELED. /// ONLY USE FOR IDEMPOTENT [`super::VirtualFile`] operations. +#[cfg(target_os = "linux")] pub(super) async fn retry_ecanceled_once( resources: T, f: F, From 659366060dcef08a46c42c0794a829afb4270b1c Mon Sep 17 00:00:00 2001 From: Santosh Pingale <3813695+santosh-d3vpl3x@users.noreply.github.com> Date: Thu, 8 May 2025 16:09:15 +0200 Subject: [PATCH 72/76] Reuse remote_client from the SnapshotDownloader instead of recreating in download function (#11812) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem At the moment, remote_client and target are recreated in download function. We could reuse it from SnapshotDownloader instance. This isn't a problem per se, just a quality of life improvement but it caught my attention when we were trying out snapshot downloading in one of the older version and ran into a curious case of s3 clients behaving in two different manners. One client that used `force_path_style` and other one didn't. **Logs from this run:** ``` 2025-05-02T12:56:22.384626Z DEBUG /data/snappie/2739e7da34e625e3934ef0b76fa12483/timelines/d44b831adb0a6ba96792dc3a5cc30910/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014E8F20-00000000014E8F99-00000001 requires download... 
2025-05-02T12:56:22.384689Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:apply_configuration: timeout settings for this operation: TimeoutConfig { connect_timeout: Set(3.1s), read_timeout: Disabled, operation_timeout: Disabled, operation_attempt_timeout: Disabled } 2025-05-02T12:56:22.384730Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: entering 'serialization' phase 2025-05-02T12:56:22.384784Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: entering 'before transmit' phase 2025-05-02T12:56:22.384813Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: retry strategy has OKed initial request 2025-05-02T12:56:22.384841Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: beginning attempt #1 2025-05-02T12:56:22.384870Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: resolving endpoint endpoint_params=EndpointResolverParams(TypeErasedBox[!Clone]:Params { bucket: Some("bucket"), region: Some("eu-north-1"), use_fips: false, use_dual_stack: false, endpoint: Some("https://s3.self-hosted.company.com"), force_path_style: false, accelerate: false, use_global_endpoint: false, use_object_lambda_endpoint: None, key: None, prefix: Some("/pageserver/tenants/2739e7da34e625e3934ef0b76fa12483/timelines/d44b831adb0a6ba96792dc3a5cc30910/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014E8F20-00000000014E8F99-00000001"), copy_source: None, disable_access_points: None, disable_multi_region_access_points: false, use_arn_region: None, use_s3_express_control_endpoint: None, disable_s3_express_session_auth: None }) endpoint_prefix=None 2025-05-02T12:56:22.384979Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: will use endpoint Endpoint { url: "https://neon.s3.self-hosted.company.com", headers: {}, properties: {"authSchemes": Array([Object({"signingRegion": String("eu-north-1"), "disableDoubleEncoding": Bool(true), "name": String("sigv4"), "signingName": String("s3")})])} } 2025-05-02T12:56:22.385042Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt:lazy_load_identity:provide_credentials{provider=default_chain}: loaded credentials provider=Environment 2025-05-02T12:56:22.385066Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt:lazy_load_identity: identity cache miss occurred; added new identity (took 35.958µs) new_expiration=2025-05-02T13:11:22.385028Z valid_for=899.999961437s partition=IdentityCachePartition(5) 2025-05-02T12:56:22.385090Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: loaded identity 2025-05-02T12:56:22.385162Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: entering 'transmit' phase 2025-05-02T12:56:22.385211Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: new TCP connector created in 361ns 2025-05-02T12:56:22.385288Z DEBUG resolving host="neon.s3.self-hosted.company.com" 2025-05-02T12:56:22.390796Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: encountered orchestrator error; halting ``` --- storage_scrubber/src/tenant_snapshot.rs | 15 ++++++--------- 1 file 
changed, 6 insertions(+), 9 deletions(-) diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 24231e32fc..d0ca53f8ab 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -24,7 +24,6 @@ pub struct SnapshotDownloader { remote_client: GenericRemoteStorage, #[allow(dead_code)] target: RootTarget, - bucket_config: BucketConfig, tenant_id: TenantId, output_path: Utf8PathBuf, concurrency: usize, @@ -43,7 +42,6 @@ impl SnapshotDownloader { Ok(Self { remote_client, target, - bucket_config, tenant_id, output_path, concurrency, @@ -218,11 +216,9 @@ impl SnapshotDownloader { } pub async fn download(&self) -> anyhow::Result<()> { - let (remote_client, target) = - init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?; - // Generate a stream of TenantShardId - let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?; + let shards = + stream_tenant_shards(&self.remote_client, &self.target, self.tenant_id).await?; let shards: Vec = shards.try_collect().await?; // Only read from shards that have the highest count: avoids redundantly downloading @@ -240,7 +236,8 @@ impl SnapshotDownloader { for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { // Generate a stream of TenantTimelineId - let timelines = stream_tenant_timelines(&remote_client, &target, shard).await?; + let timelines = + stream_tenant_timelines(&self.remote_client, &self.target, shard).await?; // Generate a stream of S3TimelineBlobData async fn load_timeline_index( @@ -251,8 +248,8 @@ impl SnapshotDownloader { let data = list_timeline_blobs(remote_client, ttid, target).await?; Ok((ttid, data)) } - let timelines = - timelines.map_ok(|ttid| load_timeline_index(&remote_client, &target, ttid)); + let timelines = timelines + .map_ok(|ttid| load_timeline_index(&self.remote_client, &self.target, ttid)); let mut timelines = std::pin::pin!(timelines.try_buffered(8)); while let Some(i) = timelines.next().await { From 622b3b29936d0496808396e447e678177a58412d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 8 May 2025 17:13:11 +0200 Subject: [PATCH 73/76] Fixes for enabling --timelines-onto-safekeepers in tests (#11854) Second PR with fixes extracted from #11712, relating to `--timelines-onto-safekeepers`. 
Does the following: * Moves safekeeper registration to `neon_local` instead of the test fixtures * Pass safekeeper JWT token if `--timelines-onto-safekeepers` is enabled * Allow some warnings related to offline safekeepers (similarly to how we allow them for offline pageservers) * Enable generations on the compute's config if `--timelines-onto-safekeepers` is enabled * fix parallel `pull_timeline` race condition (the one that #11786 put for later) Fixes #11424 Part of #11670 --- control_plane/src/bin/neon_local.rs | 9 +- control_plane/src/storage_controller.rs | 100 ++++++++++++++++-- safekeeper/src/http/routes.rs | 3 +- safekeeper/src/pull_timeline.rs | 30 ++++-- test_runner/fixtures/neon_fixtures.py | 24 ----- .../fixtures/pageserver/allowed_errors.py | 4 + 6 files changed, 131 insertions(+), 39 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 610fa5f865..191a22f1de 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1417,7 +1417,14 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res let pageserver_id = args.endpoint_pageserver_id; let remote_ext_base_url = &args.remote_ext_base_url; - let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new); + let default_generation = env + .storage_controller + .timelines_onto_safekeepers + .then_some(1); + let safekeepers_generation = args + .safekeepers_generation + .or(default_generation) + .map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index a36815d27e..755d67a7ad 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -10,7 +10,8 @@ use camino::{Utf8Path, Utf8PathBuf}; use hyper0::Uri; use nix::unistd::Pid; use pageserver_api::controller_api::{ - NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, + SafekeeperSchedulingPolicyRequest, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; use pageserver_api::models::{ @@ -20,7 +21,7 @@ use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use pem::Pem; use postgres_backend::AuthType; -use reqwest::Method; +use reqwest::{Method, Response}; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use tokio::process::Command; @@ -570,6 +571,11 @@ impl StorageController { let peer_jwt_token = encode_from_key_file(&peer_claims, private_key) .expect("failed to generate jwt token"); args.push(format!("--peer-jwt-token={peer_jwt_token}")); + + let claims = Claims::new(None, Scope::SafekeeperData); + let jwt_token = + encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); + args.push(format!("--safekeeper-jwt-token={jwt_token}")); } if let Some(public_key) = &self.public_key { @@ -614,6 +620,10 @@ impl StorageController { self.env.base_data_dir.display() )); + if self.env.safekeepers.iter().any(|sk| sk.auth_enabled) && self.private_key.is_none() { + anyhow::bail!("Safekeeper set up for auth but no private key specified"); + } + if self.config.timelines_onto_safekeepers { 
args.push("--timelines-onto-safekeepers".to_string()); } @@ -640,6 +650,10 @@ impl StorageController { ) .await?; + if self.config.timelines_onto_safekeepers { + self.register_safekeepers().await?; + } + Ok(()) } @@ -743,6 +757,23 @@ impl StorageController { where RQ: Serialize + Sized, RS: DeserializeOwned + Sized, + { + let response = self.dispatch_inner(method, path, body).await?; + Ok(response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) + } + + /// Simple HTTP request wrapper for calling into storage controller + async fn dispatch_inner( + &self, + method: reqwest::Method, + path: String, + body: Option, + ) -> anyhow::Result + where + RQ: Serialize + Sized, { // In the special case of the `storage_controller start` subcommand, we wish // to use the API endpoint of the newly started storage controller in order @@ -785,10 +816,31 @@ impl StorageController { let response = builder.send().await?; let response = response.error_from_body().await?; - Ok(response - .json() - .await - .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) + Ok(response) + } + + /// Register the safekeepers in the storage controller + #[instrument(skip(self))] + async fn register_safekeepers(&self) -> anyhow::Result<()> { + for sk in self.env.safekeepers.iter() { + let sk_id = sk.id; + let body = serde_json::json!({ + "id": sk_id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "127.0.0.1", + "port": sk.pg_port, + "http_port": sk.http_port, + "https_port": sk.https_port, + "version": 5957, + "availability_zone_id": format!("us-east-2b-{sk_id}"), + }); + self.upsert_safekeeper(sk_id, body).await?; + self.safekeeper_scheduling_policy(sk_id, SkSchedulingPolicy::Active) + .await?; + } + Ok(()) } /// Call into the attach_hook API, for use before handing out attachments to pageservers @@ -816,6 +868,42 @@ impl StorageController { Ok(response.generation) } + #[instrument(skip(self))] + pub async fn upsert_safekeeper( + &self, + node_id: NodeId, + request: serde_json::Value, + ) -> anyhow::Result<()> { + let resp = self + .dispatch_inner::( + Method::POST, + format!("control/v1/safekeeper/{node_id}"), + Some(request), + ) + .await?; + if !resp.status().is_success() { + anyhow::bail!( + "setting scheduling policy unsuccessful for safekeeper {node_id}: {}", + resp.status() + ); + } + Ok(()) + } + + #[instrument(skip(self))] + pub async fn safekeeper_scheduling_policy( + &self, + node_id: NodeId, + scheduling_policy: SkSchedulingPolicy, + ) -> anyhow::Result<()> { + self.dispatch::( + Method::POST, + format!("control/v1/safekeeper/{node_id}/scheduling_policy"), + Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }), + ) + .await + } + #[instrument(skip(self))] pub async fn inspect( &self, diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 2b2d721db2..1a25b07496 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -243,8 +243,7 @@ async fn timeline_pull_handler(mut request: Request) -> Result, ssl_ca_certs: Vec, global_timelines: Arc, -) -> Result { +) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( request.tenant_id, request.timeline_id, @@ -411,7 +412,9 @@ pub async fn handle_request( for ssl_ca_cert in ssl_ca_certs { http_client = http_client.add_root_certificate(ssl_ca_cert); } - let http_client = http_client.build()?; + let http_client = http_client + .build() + .map_err(|e| 
ApiError::InternalServerError(e.into()))?; let http_hosts = request.http_hosts.clone(); @@ -443,10 +446,10 @@ pub async fn handle_request( // offline and C comes online. Then we want a pull on C with A and B as hosts to work. let min_required_successful = (http_hosts.len() - 1).max(1); if statuses.len() < min_required_successful { - bail!( + return Err(ApiError::InternalServerError(anyhow::anyhow!( "only got {} successful status responses. required: {min_required_successful}", statuses.len() - ) + ))); } // Find the most advanced safekeeper @@ -465,7 +468,7 @@ pub async fn handle_request( assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); - pull_timeline( + match pull_timeline( status, safekeeper_host, sk_auth_token, @@ -473,6 +476,21 @@ pub async fn handle_request( global_timelines, ) .await + { + Ok(resp) => Ok(resp), + Err(e) => { + match e.downcast_ref::() { + Some(TimelineError::AlreadyExists(_)) => Ok(PullTimelineResponse { + safekeeper_host: None, + }), + Some(TimelineError::CreationInProgress(_)) => { + // We don't return success here because creation might still fail. + Err(ApiError::Conflict("Creation in progress".to_owned())) + } + _ => Err(ApiError::InternalServerError(e)), + } + } + } } async fn pull_timeline( diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 370eca5130..547c640a40 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1409,30 +1409,6 @@ class NeonEnv: for f in futs: f.result() - # Last step: register safekeepers at the storage controller - if ( - self.storage_controller_config is not None - and self.storage_controller_config.get("timelines_onto_safekeepers") is True - ): - for sk_id, sk in enumerate(self.safekeepers): - # 0 is an invalid safekeeper id - sk_id = sk_id + 1 - body = { - "id": sk_id, - "created_at": "2023-10-25T09:11:25Z", - "updated_at": "2024-08-28T11:32:43Z", - "region_id": "aws-us-east-2", - "host": "127.0.0.1", - "port": sk.port.pg, - "http_port": sk.port.http, - "https_port": None, - "version": 5957, - "availability_zone_id": f"us-east-2b-{sk_id}", - } - - self.storage_controller.on_safekeeper_deploy(sk_id, body) - self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active") - self.endpoint_storage.start(timeout_in_seconds=timeout_in_seconds) def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 24c856e279..43bffd919c 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -122,6 +122,10 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ ".*Call to node.*management API.*failed.*Timeout.*", ".*Failed to update node .+ after heartbeat round.*error sending request for url.*", ".*background_reconcile: failed to fetch top tenants:.*client error \\(Connect\\).*", + # Many tests will take safekeepers offline + ".*Call to safekeeper.*management API.*failed.*receive body.*", + ".*Call to safekeeper.*management API.*failed.*ReceiveBody.*", + ".*Call to safekeeper.*management API.*failed.*Timeout.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode From 8477d15f95ffb094c444e658bbcdb95301b1a750 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 18:11:45 +0200 Subject: [PATCH 74/76] feat(direct 
IO): remove special case in test suite for compat tests (#11864) PR - https://github.com/neondatabase/neon/pull/11558 adds special treatment for compat snapshot binaries which don't understand the `direct-rw` mode. A new compat snapshot has been published since, so, we can remove the special case. refs: - fixes https://github.com/neondatabase/neon/issues/11598 --- test_runner/fixtures/neon_fixtures.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 547c640a40..aa468d9386 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1299,13 +1299,6 @@ class NeonEnv: for key, value in override.items(): ps_cfg[key] = value - if self.pageserver_virtual_file_io_mode is not None: - # TODO(christian): https://github.com/neondatabase/neon/issues/11598 - if not config.test_may_use_compatibility_snapshot_binaries: - ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode - else: - log.info("ignoring virtual_file_io_mode parametrization for compatibility test") - if self.pageserver_wal_receiver_protocol is not None: key, value = PageserverWalReceiverProtocol.to_config_key_value( self.pageserver_wal_receiver_protocol From bef5954fd7b8ea43cac6f43a111d437cd7a360ad Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 8 May 2025 17:46:57 +0100 Subject: [PATCH 75/76] feat(proxy): track SNI usage by protocol, including for http (#11863) ## Problem We want to see how many users of the legacy serverless driver are still using the old URL for SQL-over-HTTP traffic. ## Summary of changes Adds a protocol field to the connections_by_sni metric. Ensures it's incremented for sql-over-http. --- proxy/src/auth/credentials.rs | 29 ++++++++++++++------------- proxy/src/metrics.rs | 15 +++++++++++--- proxy/src/serverless/mod.rs | 1 + proxy/src/serverless/sql_over_http.rs | 28 +++++++++++++++++++++++++- test_runner/fixtures/neon_fixtures.py | 8 ++++---- 5 files changed, 59 insertions(+), 22 deletions(-) diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 183976374a..526d0df7f2 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -12,9 +12,9 @@ use tracing::{debug, warn}; use crate::auth::password_hack::parse_endpoint_param; use crate::context::RequestContext; use crate::error::{ReportableError, UserFacingError}; -use crate::metrics::{Metrics, SniKind}; +use crate::metrics::{Metrics, SniGroup, SniKind}; use crate::proxy::NeonOptions; -use crate::serverless::SERVERLESS_DRIVER_SNI; +use crate::serverless::{AUTH_BROKER_SNI, SERVERLESS_DRIVER_SNI}; use crate::types::{EndpointId, RoleName}; #[derive(Debug, Error, PartialEq, Eq, Clone)] @@ -65,7 +65,7 @@ pub(crate) fn endpoint_sni(sni: &str, common_names: &HashSet) -> Option< if !common_names.contains(common_name) { return None; } - if subdomain == SERVERLESS_DRIVER_SNI { + if subdomain == SERVERLESS_DRIVER_SNI || subdomain == AUTH_BROKER_SNI { return None; } Some(EndpointId::from(subdomain)) @@ -128,22 +128,23 @@ impl ComputeUserInfoMaybeEndpoint { let metrics = Metrics::get(); debug!(%user, "credentials"); - if sni.is_some() { + + let protocol = ctx.protocol(); + let kind = if sni.is_some() { debug!("Connection with sni"); - metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); + SniKind::Sni } else if endpoint.is_some() { - metrics - .proxy - .accepted_connections_by_sni - .inc(SniKind::NoSni); debug!("Connection without sni"); + SniKind::NoSni } else { - metrics - 
.proxy - .accepted_connections_by_sni - .inc(SniKind::PasswordHack); debug!("Connection with password hack"); - } + SniKind::PasswordHack + }; + + metrics + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); let options = NeonOptions::parse_params(params); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index e5fc0b724b..4b22c912eb 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -115,8 +115,8 @@ pub struct ProxyMetrics { #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_vpc_endpoint_ids: Histogram<10>, - /// Number of connections (per sni). - pub accepted_connections_by_sni: CounterVec>, + /// Number of connections, by the method we used to determine the endpoint. + pub accepted_connections_by_sni: CounterVec, /// Number of connection failures (per kind). pub connection_failures_total: CounterVec>, @@ -342,11 +342,20 @@ pub enum LatencyExclusions { ClientCplaneComputeRetry, } +#[derive(LabelGroup)] +#[label(set = SniSet)] +pub struct SniGroup { + pub protocol: Protocol, + pub kind: SniKind, +} + #[derive(FixedCardinalityLabel, Copy, Clone)] -#[label(singleton = "kind")] pub enum SniKind { + /// Domain name based routing. SNI for libpq/websockets. Host for HTTP Sni, + /// Metadata based routing. `options` for libpq/websockets. Header for HTTP NoSni, + /// Metadata based routing, using the password field. PasswordHack, } diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 6f24ad3dec..2a7069b1c2 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -56,6 +56,7 @@ use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; +pub(crate) const AUTH_BROKER_SNI: &str = "apiauth"; pub async fn task_main( config: &'static ProxyConfig, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index fee5942b7e..dfaeedaeae 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -38,7 +38,7 @@ use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::http::{ReadBodyError, read_body_with_limit}; -use crate::metrics::{HttpDirection, Metrics}; +use crate::metrics::{HttpDirection, Metrics, SniGroup, SniKind}; use crate::proxy::{NeonOptions, run_until_cancelled}; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; @@ -227,6 +227,32 @@ fn get_conn_info( } } + // check the URL that was used, for metrics + { + let host_endpoint = headers + // get the host header + .get("host") + // extract the domain + .and_then(|h| { + let (host, _port) = h.to_str().ok()?.split_once(':')?; + Some(host) + }) + // get the endpoint prefix + .map(|h| h.split_once('.').map_or(h, |(prefix, _)| prefix)); + + let kind = if host_endpoint == Some(&*endpoint) { + SniKind::Sni + } else { + SniKind::NoSni + }; + + let protocol = ctx.protocol(); + Metrics::get() + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); + } + ctx.set_user_agent( headers .get(hyper::header::USER_AGENT) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index aa468d9386..1b4562c0b3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3835,7 
+3835,7 @@ class NeonAuthBroker: external_http_port: int, auth_backend: NeonAuthBroker.ProxyV1, ): - self.domain = "apiauth.local.neon.build" # resolves to 127.0.0.1 + self.domain = "local.neon.build" # resolves to 127.0.0.1 self.host = "127.0.0.1" self.http_port = http_port self.external_http_port = external_http_port @@ -3852,7 +3852,7 @@ class NeonAuthBroker: # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("apiauth.local.neon.build", key_path, crt_path) + generate_proxy_tls_certs(f"apiauth.{self.domain}", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3896,10 +3896,10 @@ class NeonAuthBroker: log.info(f"Executing http query: {query}") - connstr = f"postgresql://{user}@{self.domain}/postgres" + connstr = f"postgresql://{user}@ep-foo-bar-1234.{self.domain}/postgres" async with httpx.AsyncClient(verify=str(self.test_output_dir / "proxy.crt")) as client: response = await client.post( - f"https://{self.domain}:{self.external_http_port}/sql", + f"https://apiauth.{self.domain}:{self.external_http_port}/sql", json={"query": query, "params": args}, headers={ "Neon-Connection-String": connstr, From b37bb7d7edaab870d05bff7286e345066d49664e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 20:48:24 +0200 Subject: [PATCH 76/76] pageserver: timeline shutdown: fully quiesce ingest path before`freeze_and_flush` (#11851) # Problem Before this PR, timeline shutdown would - cancel the walreceiver cancellation token subtree (child token of Timeline::cancel) - call freeze_and_flush - Timeline::cancel.cancel() - ... bunch of waiting for things ... - Timeline::gate.close() As noted by the comment that is deleted by this PR, this left a window where, after freeze_and_flush, walreceiver could still be running and ingest data into a new InMemoryLayer. This presents a potential source of log noise during Timeline shutdown where the InMemoryLayer created after the freeze_and_flush observes that Timeline::cancel is cancelled, failing the ingest with some anyhow::Error wrapping (deeply) a `FlushTaskError::Cancelled` instance (`flush task cancelled` error message). # Solution It turns out that it is quite easy to shut down, not just cancel, walreceiver completely because the only subtask spawned by walreceiver connection manager is the `handle_walreceiver_connection` task, which is properly shut down and waited upon when the manager task observes cancellation and exits its retry loop. The alternative is to replace all the usage of `anyhow` on the ingest path with differentiated error types. A lot of busywork for little gain to fix a potential logging noise nuisance, so, not doing that for now. # Correctness / Risk We do not risk leaking walreceiver child tasks because existing discipline is to hold a gate guard. We will prolong `Timeline::shutdown` to the degree that we're no longer making progress with the rest of shutdown while the walreceiver task hasn't yet observed cancellation. In practice, this should be negligible. `Timeline::shutdown` could fail to complete if there is a hidden dependency of walreceiver shutdown on some subsystem. The code certainly suggests there isn't, and I'm not aware of any such dependency. Anyway, impact will be low because we only shut down Timeline instances that are obsolete, either because there is a newer attachment at a different location, or because the timeline got deleted by the user. 
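The shutdown pattern described in the Solution section above — cancel a child token, then await the task's JoinHandle so the caller knows the task subtree is fully gone — can be sketched generically as follows. This is a minimal illustration assuming `tokio` and `tokio-util`; the names are made up and are not the pageserver's actual types.

```rust
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;

/// Illustrative stand-in for a background component such as a WAL receiver.
struct Worker {
    cancel: CancellationToken,
    task: JoinHandle<()>,
}

impl Worker {
    fn spawn(parent_cancel: &CancellationToken) -> Self {
        // Child token: cancelling the parent also cancels this worker early.
        let cancel = parent_cancel.child_token();
        let task = tokio::spawn({
            let cancel = cancel.clone();
            async move {
                // Placeholder for the real ingest loop.
                cancel.cancelled().await;
            }
        });
        Self { cancel, task }
    }

    /// Cancel *and wait*: once this returns, no work from this subtree is still running.
    async fn shutdown(self) {
        self.cancel.cancel();
        if let Err(join_err) = self.task.await {
            if join_err.is_panic() {
                eprintln!("worker panicked during shutdown");
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let parent = CancellationToken::new();
    let worker = Worker::spawn(&parent);
    worker.shutdown().await; // returns only once the spawned task has actually exited
}
```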
We would learn about this through stuck cplane operations or stuck storcon reconciliations. We would be able to mitigate by cancelling such stuck operations/reconciliations and/or by rolling back pageserver. # Refs - identified this while investigating https://github.com/neondatabase/neon/issues/11762 - PR that _does_ fix a bunch _real_ `flush task cancelled` noise on the compaction path: https://github.com/neondatabase/neon/pull/11853 --- pageserver/src/tenant/timeline.rs | 12 ++---------- pageserver/src/tenant/timeline/walreceiver.rs | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c8d897d074..d7f5958128 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2127,22 +2127,14 @@ impl Timeline { debug_assert_current_span_has_tenant_and_timeline_id(); // Regardless of whether we're going to try_freeze_and_flush - // or not, stop ingesting any more data. Walreceiver only provides - // cancellation but no "wait until gone", because it uses the Timeline::gate. - // So, only after the self.gate.close() below will we know for sure that - // no walreceiver tasks are left. - // For `try_freeze_and_flush=true`, this means that we might still be ingesting - // data during the call to `self.freeze_and_flush()` below. - // That's not ideal, but, we don't have the concept of a ChildGuard, - // which is what we'd need to properly model early shutdown of the walreceiver - // task sub-tree before the other Timeline task sub-trees. + // or not, stop ingesting any more data. let walreceiver = self.walreceiver.lock().unwrap().take(); tracing::debug!( is_some = walreceiver.is_some(), "Waiting for WalReceiverManager..." ); if let Some(walreceiver) = walreceiver { - walreceiver.cancel(); + walreceiver.shutdown().await; } // ... and inform any waiters for newer LSNs that there won't be any. self.last_record_lsn.shutdown(); diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 4f80073cc3..0f73eb839b 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -63,6 +63,7 @@ pub struct WalReceiver { /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token. /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`. 
cancel: CancellationToken, + task: tokio::task::JoinHandle<()>, } impl WalReceiver { @@ -79,7 +80,7 @@ impl WalReceiver { let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); let cancel = timeline.cancel.child_token(); - WALRECEIVER_RUNTIME.spawn({ + let task = WALRECEIVER_RUNTIME.spawn({ let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -120,14 +121,25 @@ impl WalReceiver { Self { manager_status, cancel, + task, } } #[instrument(skip_all, level = tracing::Level::DEBUG)] - pub fn cancel(&self) { + pub async fn shutdown(self) { debug_assert_current_span_has_tenant_and_timeline_id(); debug!("cancelling walreceiver tasks"); self.cancel.cancel(); + match self.task.await { + Ok(()) => debug!("Shutdown success"), + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // already logged by panic hook + } + Err(je) => { + error!("shutdown walreceiver task join error: {je}") + } + } } pub(crate) fn status(&self) -> Option {