From 725e0a1ac9d1b409c57d2e7d87ac16d9c3d9f91b Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 19 Nov 2024 23:03:15 +0000
Subject: [PATCH 01/51] CI(release): create reusable workflow for releases
 (#9806)

## Problem

We have a bunch of duplicated code for automated releases. There will be
even more, once we have `release-compute` branch
(https://github.com/neondatabase/neon/pull/9637).

Another issue with the current `release` workflow is that it creates a
PR from the main as is. If we create 2 different releases from the
same commit, GitHub could mix up results from different PRs.

## Summary of changes
- Create a reusable workflow for releases
- Create an empty commit to differentiate releases
---
 .github/workflows/_create-release-pr.yml | 79 ++++++++++++++++++++++
 .github/workflows/release.yml            | 84 ++++--------------------
 2 files changed, 93 insertions(+), 70 deletions(-)
 create mode 100644 .github/workflows/_create-release-pr.yml

diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml
new file mode 100644
index 0000000000..cc6994397f
--- /dev/null
+++ b/.github/workflows/_create-release-pr.yml
@@ -0,0 +1,79 @@
+name: Create Release PR
+
+on:
+  workflow_call:
+    inputs:
+      component-name:
+        description: 'Component name'
+        required: true
+        type: string
+      release-branch:
+        description: 'Release branch'
+        required: true
+        type: string
+    secrets:
+      ci-access-token:
+        description: 'CI access token'
+        required: true
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+jobs:
+  create-storage-release-branch:
+    runs-on: ubuntu-22.04
+
+    permissions:
+      contents: write # for `git push`
+
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        ref: main
+
+    - name: Set variables
+      id: vars
+      env:
+        COMPONENT_NAME: ${{ inputs.component-name }}
+        RELEASE_BRANCH: ${{ inputs.release-branch }}
+      run: |
+        today=$(date +'%Y-%m-%d')
+        echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT}
+        echo "rc-branch=rc/${RELEASE_BRANCH}/${today}"  | tee -a ${GITHUB_OUTPUT}
+
+    - name: Configure git
+      run: |
+        git config user.name "github-actions[bot]"
+        git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+
+    - name: Create RC branch
+      env:
+        RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
+        TITLE: ${{ steps.vars.outputs.title }}
+      run: |
+        git checkout -b "${RC_BRANCH}"
+
+        # create an empty commit to distinguish workflow runs
+        # from other possible releases from the same commit
+        git commit --allow-empty -m "${TITLE}"
+
+        git push origin "${RC_BRANCH}"
+
+    - name: Create a PR into ${{ inputs.release-branch }}
+      env:
+        GH_TOKEN: ${{ secrets.ci-access-token }}
+        RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
+        RELEASE_BRANCH: ${{ inputs.release-branch }}
+        TITLE: ${{ steps.vars.outputs.title }}
+      run: |
+        cat << EOF > body.md
+          ## ${TITLE}
+
+          **Please merge this Pull Request using 'Create a merge commit' button**
+        EOF
+
+        gh pr create --title "${TITLE}" \
+                     --body-file "body.md" \
+                     --head "${RC_BRANCH}" \
+                     --base "${RELEASE_BRANCH}"
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 56ef6f4bbb..11f010b6d4 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -26,82 +26,26 @@ defaults:
 jobs:
   create-storage-release-branch:
     if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
-    runs-on: ubuntu-22.04
 
     permissions:
-      contents: write # for `git push`
+      contents: write
 
-    steps:
-    - name: Check out code
-      uses: actions/checkout@v4
-      with:
-        ref: main
-
-    - name: Set environment variables
-      run: |
-        echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
-        echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
-
-    - name: Create release branch
-      run: git checkout -b $RELEASE_BRANCH
-
-    - name: Push new branch
-      run: git push origin $RELEASE_BRANCH
-
-    - name: Create pull request into release
-      env:
-        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-      run: |
-        TITLE="Storage & Compute release ${RELEASE_DATE}"
-
-        cat << EOF > body.md
-          ## ${TITLE}
-
-          **Please merge this Pull Request using 'Create a merge commit' button**
-        EOF
-
-        gh pr create --title "${TITLE}" \
-                     --body-file "body.md" \
-                     --head "${RELEASE_BRANCH}" \
-                     --base "release"
+    uses: ./.github/workflows/_create-release-pr.yml
+    with:
+      component-name: 'Storage & Compute'
+      release-branch: 'release'
+    secrets:
+      ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
 
   create-proxy-release-branch:
     if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
-    runs-on: ubuntu-22.04
 
     permissions:
-      contents: write # for `git push`
+      contents: write
 
-    steps:
-    - name: Check out code
-      uses: actions/checkout@v4
-      with:
-        ref: main
-
-    - name: Set environment variables
-      run: |
-        echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
-        echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
-
-    - name: Create release branch
-      run: git checkout -b $RELEASE_BRANCH
-
-    - name: Push new branch
-      run: git push origin $RELEASE_BRANCH
-
-    - name: Create pull request into release
-      env:
-        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-      run: |
-        TITLE="Proxy release ${RELEASE_DATE}"
-
-        cat << EOF > body.md
-          ## ${TITLE}
-
-          **Please merge this Pull Request using 'Create a merge commit' button**
-        EOF
-
-        gh pr create --title "${TITLE}" \
-                     --body-file "body.md" \
-                     --head "${RELEASE_BRANCH}" \
-                     --base "release-proxy"
+    uses: ./.github/workflows/_create-release-pr.yml
+    with:
+      component-name: 'Proxy'
+      release-branch: 'release-proxy'
+    secrets:
+      ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}

From 2281a02c49fd396ef9b06fafa35028c33eea8b3d Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 20 Nov 2024 00:30:24 +0000
Subject: [PATCH 02/51] CODEOWNERS: add developer-productivity team (#9810)

Notify @neondatabase/developer-productivity team about changes in CI
(i.e. in `.github/` directory)
---
 CODEOWNERS | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index f8ed4be816..21b0e7c51f 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,6 +1,5 @@
+/.github/ @neondatabase/developer-productivity
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
-/storage_controller @neondatabase/storage
-/storage_scrubber @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
 /libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
 /libs/remote_storage/ @neondatabase/storage
@@ -11,4 +10,6 @@
 /pgxn/neon/ @neondatabase/compute @neondatabase/storage
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/storage
+/storage_controller @neondatabase/storage
+/storage_scrubber @neondatabase/storage
 /vendor/ @neondatabase/compute

From ea1858e3b66fa058ce8ddfb6f37b364154dd20a6 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm@gmail.com>
Date: Wed, 20 Nov 2024 02:14:58 +0100
Subject: [PATCH 03/51] compute_ctl: Streamline and Pipeline startup SQL
 (#9717)

Before, compute_ctl didn't have a good registry for what command would
run when, depending exclusively on sync code to apply changes. When
users have many databases/roles to manage, this step can take a
substantial amount of time, breaking assumptions about low (re)start
times in other systems.

This commit reduces the time compute_ctl takes to restart when changes
must be applied, by making all commands more or less blind writes, and
applying these commands in an asynchronous context, only waiting for
completion once we know the commands have all been sent.

Additionally, this reduces time spent by batching per-database
operations where previously we would create a new SQL connection for
every user-database operation we planned to execute.
---
 compute_tools/src/catalog.rs                  |  44 +-
 compute_tools/src/checker.rs                  |  28 -
 compute_tools/src/compute.rs                  | 397 ++++++++--
 compute_tools/src/lib.rs                      |   1 +
 compute_tools/src/pg_helpers.rs               |  39 +-
 compute_tools/src/spec.rs                     | 634 +---------------
 compute_tools/src/spec_apply.rs               | 680 ++++++++++++++++++
 .../src/sql/add_availabilitycheck_tables.sql  |  18 +
 .../src/sql/anon_ext_fn_reassign.sql          |  12 +
 compute_tools/src/sql/default_grants.sql      |  30 +
 .../src/sql/set_public_schema_owner.sql       |  23 +
 .../src/sql/unset_template_for_drop_dbs.sql   |  12 +
 12 files changed, 1146 insertions(+), 772 deletions(-)
 create mode 100644 compute_tools/src/spec_apply.rs
 create mode 100644 compute_tools/src/sql/add_availabilitycheck_tables.sql
 create mode 100644 compute_tools/src/sql/anon_ext_fn_reassign.sql
 create mode 100644 compute_tools/src/sql/default_grants.sql
 create mode 100644 compute_tools/src/sql/set_public_schema_owner.sql
 create mode 100644 compute_tools/src/sql/unset_template_for_drop_dbs.sql

diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs
index 4fefa831e0..2f6f82dd39 100644
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -1,38 +1,40 @@
-use compute_api::{
-    responses::CatalogObjects,
-    spec::{Database, Role},
-};
+use compute_api::responses::CatalogObjects;
 use futures::Stream;
-use postgres::{Client, NoTls};
+use postgres::NoTls;
 use std::{path::Path, process::Stdio, result::Result, sync::Arc};
 use tokio::{
     io::{AsyncBufReadExt, BufReader},
     process::Command,
-    task,
+    spawn,
 };
+use tokio_postgres::connect;
 use tokio_stream::{self as stream, StreamExt};
 use tokio_util::codec::{BytesCodec, FramedRead};
 use tracing::warn;
 
-use crate::{
-    compute::ComputeNode,
-    pg_helpers::{get_existing_dbs, get_existing_roles},
-};
+use crate::compute::ComputeNode;
+use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async};
 
 pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
     let connstr = compute.connstr.clone();
-    task::spawn_blocking(move || {
-        let mut client = Client::connect(connstr.as_str(), NoTls)?;
-        let roles: Vec<Role>;
-        {
-            let mut xact = client.transaction()?;
-            roles = get_existing_roles(&mut xact)?;
-        }
-        let databases: Vec<Database> = get_existing_dbs(&mut client)?.values().cloned().collect();
 
-        Ok(CatalogObjects { roles, databases })
-    })
-    .await?
+    let (client, connection): (tokio_postgres::Client, _) =
+        connect(connstr.as_str(), NoTls).await?;
+
+    spawn(async move {
+        if let Err(e) = connection.await {
+            eprintln!("connection error: {}", e);
+        }
+    });
+
+    let roles = get_existing_roles_async(&client).await?;
+
+    let databases = get_existing_dbs_async(&client)
+        .await?
+        .into_values()
+        .collect();
+
+    Ok(CatalogObjects { roles, databases })
 }
 
 #[derive(Debug, thiserror::Error)]
diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs
index d76eaad0a0..cec2b1bed8 100644
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -1,37 +1,9 @@
 use anyhow::{anyhow, Ok, Result};
-use postgres::Client;
 use tokio_postgres::NoTls;
 use tracing::{error, instrument, warn};
 
 use crate::compute::ComputeNode;
 
-/// Create a special service table for availability checks
-/// only if it does not exist already.
-pub fn create_availability_check_data(client: &mut Client) -> Result<()> {
-    let query = "
-        DO $$
-        BEGIN
-            IF NOT EXISTS(
-                SELECT 1
-                FROM pg_catalog.pg_tables
-                WHERE tablename = 'health_check'
-            )
-            THEN
-            CREATE TABLE health_check (
-                id serial primary key,
-                updated_at timestamptz default now()
-            );
-            INSERT INTO health_check VALUES (1, now())
-                ON CONFLICT (id) DO UPDATE
-                 SET updated_at = now();
-            END IF;
-        END
-        $$;";
-    client.execute(query, &[])?;
-
-    Ok(())
-}
-
 /// Update timestamp in a row in a special service table to check
 /// that we can actually write some data in this particular timeline.
 #[instrument(skip_all)]
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 0a8cb14058..4f67425ba8 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,20 +1,21 @@
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::env;
 use std::fs;
+use std::iter::once;
 use std::os::unix::fs::{symlink, PermissionsExt};
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
 use std::sync::atomic::AtomicU32;
 use std::sync::atomic::Ordering;
-use std::sync::{Condvar, Mutex, RwLock};
+use std::sync::{Arc, Condvar, Mutex, RwLock};
 use std::thread;
 use std::time::Duration;
 use std::time::Instant;
 
 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
-use compute_api::spec::PgIdent;
+use compute_api::spec::{PgIdent, Role};
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
@@ -31,15 +32,23 @@ use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion};
 use utils::measured_stream::MeasuredReader;
 
 use nix::sys::signal::{kill, Signal};
-
 use remote_storage::{DownloadError, RemotePath};
+use tokio::spawn;
+use url::Url;
 
-use crate::checker::create_availability_check_data;
 use crate::installed_extensions::get_installed_extensions_sync;
 use crate::local_proxy;
-use crate::logger::inlinify;
 use crate::pg_helpers::*;
 use crate::spec::*;
+use crate::spec_apply::ApplySpecPhase::{
+    CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSuperUser,
+    DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions,
+    RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
+};
+use crate::spec_apply::PerDatabasePhase::{
+    ChangeSchemaPerms, DeleteDBRoleReferences, HandleAnonExtension,
+};
+use crate::spec_apply::{apply_operations, MutableApplyContext, DB};
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
 use crate::{config, extension_server};
 
@@ -224,10 +233,7 @@ fn maybe_cgexec(cmd: &str) -> Command {
     }
 }
 
-/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
-/// that we give to customers
-#[instrument(skip_all)]
-fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
+pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String {
     let roles = spec
         .cluster
         .roles
@@ -296,11 +302,8 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
             $$;"#,
         roles_decl, database_decl,
     );
-    info!("Neon superuser created: {}", inlinify(&query));
-    client
-        .simple_query(&query)
-        .map_err(|e| anyhow::anyhow!(e).context(query))?;
-    Ok(())
+
+    query
 }
 
 impl ComputeNode {
@@ -813,21 +816,14 @@ impl ComputeNode {
         Ok(())
     }
 
-    /// Do initial configuration of the already started Postgres.
-    #[instrument(skip_all)]
-    pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
-        // If connection fails,
-        // it may be the old node with `zenith_admin` superuser.
-        //
-        // In this case we need to connect with old `zenith_admin` name
-        // and create new user. We cannot simply rename connected user,
-        // but we can create a new one and grant it all privileges.
-        let mut connstr = self.connstr.clone();
+    async fn get_maintenance_client(url: &Url) -> Result<tokio_postgres::Client> {
+        let mut connstr = url.clone();
+
         connstr
             .query_pairs_mut()
             .append_pair("application_name", "apply_config");
 
-        let mut client = match Client::connect(connstr.as_str(), NoTls) {
+        let (client, conn) = match tokio_postgres::connect(connstr.as_str(), NoTls).await {
             Err(e) => match e.code() {
                 Some(&SqlState::INVALID_PASSWORD)
                 | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => {
@@ -845,8 +841,8 @@ impl ComputeNode {
                     let mut client =
                         Client::connect(zenith_admin_connstr.as_str(), NoTls)
                             .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
-                    // Disable forwarding so that users don't get a cloud_admin role
 
+                    // Disable forwarding so that users don't get a cloud_admin role
                     let mut func = || {
                         client.simple_query("SET neon.forward_ddl = false")?;
                         client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
@@ -858,49 +854,309 @@ impl ComputeNode {
                     drop(client);
 
                     // reconnect with connstring with expected name
-                    Client::connect(connstr.as_str(), NoTls)?
+                    tokio_postgres::connect(connstr.as_str(), NoTls).await?
                 }
                 _ => return Err(e.into()),
             },
-            Ok(client) => client,
+            Ok((client, conn)) => (client, conn),
         };
 
-        // Disable DDL forwarding because control plane already knows about these roles/databases.
+        spawn(async move {
+            if let Err(e) = conn.await {
+                error!("maintenance client connection error: {}", e);
+            }
+        });
+
+        // Disable DDL forwarding because control plane already knows about the roles/databases
+        // we're about to modify.
         client
             .simple_query("SET neon.forward_ddl = false")
+            .await
             .context("apply_config SET neon.forward_ddl = false")?;
 
-        // Proceed with post-startup configuration. Note, that order of operations is important.
-        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
-        create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
-        cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
-        handle_roles(spec, &mut client).context("apply_config handle_roles")?;
-        handle_databases(spec, &mut client).context("apply_config handle_databases")?;
-        handle_role_deletions(spec, connstr.as_str(), &mut client)
-            .context("apply_config handle_role_deletions")?;
-        handle_grants(
-            spec,
-            &mut client,
-            connstr.as_str(),
-            self.has_feature(ComputeFeature::AnonExtension),
-        )
-        .context("apply_config handle_grants")?;
-        handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
-        handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
-        create_availability_check_data(&mut client)
-            .context("apply_config create_availability_check_data")?;
+        Ok(client)
+    }
 
-        // 'Close' connection
-        drop(client);
+    /// Apply the spec to the running PostgreSQL instance.
+    /// The caller can decide to run with multiple clients in parallel, or
+    /// single mode.  Either way, the commands executed will be the same, and
+    /// only commands run in different databases are parallelized.
+    #[instrument(skip_all)]
+    pub fn apply_spec_sql(
+        &self,
+        spec: Arc<ComputeSpec>,
+        url: Arc<Url>,
+        concurrency: usize,
+    ) -> Result<()> {
+        let rt = tokio::runtime::Builder::new_multi_thread()
+            .enable_all()
+            .build()?;
 
-        if let Some(ref local_proxy) = spec.local_proxy_config {
+        info!("Applying config with max {} concurrency", concurrency);
+        debug!("Config: {:?}", spec);
+
+        rt.block_on(async {
+            // Proceed with post-startup configuration. Note, that order of operations is important.
+            let client = Self::get_maintenance_client(&url).await?;
+            let spec = spec.clone();
+
+            let databases = get_existing_dbs_async(&client).await?;
+            let roles = get_existing_roles_async(&client)
+                .await?
+                .into_iter()
+                .map(|role| (role.name.clone(), role))
+                .collect::<HashMap<String, Role>>();
+
+            let jwks_roles = Arc::new(
+                spec.as_ref()
+                    .local_proxy_config
+                    .iter()
+                    .flat_map(|it| &it.jwks)
+                    .flatten()
+                    .flat_map(|setting| &setting.role_names)
+                    .cloned()
+                    .collect::<HashSet<_>>(),
+            );
+
+            let ctx = Arc::new(tokio::sync::RwLock::new(MutableApplyContext {
+                roles,
+                dbs: databases,
+            }));
+
+            for phase in [
+                CreateSuperUser,
+                DropInvalidDatabases,
+                RenameRoles,
+                CreateAndAlterRoles,
+                RenameAndDeleteDatabases,
+                CreateAndAlterDatabases,
+            ] {
+                debug!("Applying phase {:?}", &phase);
+                apply_operations(
+                    spec.clone(),
+                    ctx.clone(),
+                    jwks_roles.clone(),
+                    phase,
+                    || async { Ok(&client) },
+                )
+                .await?;
+            }
+
+            let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
+
+            let db_processes = spec
+                .cluster
+                .databases
+                .iter()
+                .map(|db| DB::new(db.clone()))
+                // include
+                .chain(once(DB::SystemDB))
+                .map(|db| {
+                    let spec = spec.clone();
+                    let ctx = ctx.clone();
+                    let jwks_roles = jwks_roles.clone();
+                    let mut url = url.as_ref().clone();
+                    let concurrency_token = concurrency_token.clone();
+                    let db = db.clone();
+
+                    debug!("Applying per-database phases for Database {:?}", &db);
+
+                    match &db {
+                        DB::SystemDB => {}
+                        DB::UserDB(db) => {
+                            url.set_path(db.name.as_str());
+                        }
+                    }
+
+                    let url = Arc::new(url);
+                    let fut = Self::apply_spec_sql_db(
+                        spec.clone(),
+                        url,
+                        ctx.clone(),
+                        jwks_roles.clone(),
+                        concurrency_token.clone(),
+                        db,
+                    );
+
+                    Ok(spawn(fut))
+                })
+                .collect::<Vec<Result<_, anyhow::Error>>>();
+
+            for process in db_processes.into_iter() {
+                let handle = process?;
+                handle.await??;
+            }
+
+            for phase in vec![
+                HandleOtherExtensions,
+                HandleNeonExtension,
+                CreateAvailabilityCheck,
+                DropRoles,
+            ] {
+                debug!("Applying phase {:?}", &phase);
+                apply_operations(
+                    spec.clone(),
+                    ctx.clone(),
+                    jwks_roles.clone(),
+                    phase,
+                    || async { Ok(&client) },
+                )
+                .await?;
+            }
+
+            Ok::<(), anyhow::Error>(())
+        })?;
+
+        Ok(())
+    }
+
+    /// Apply SQL migrations of the RunInEachDatabase phase.
+    ///
+    /// May opt to not connect to databases that don't have any scheduled
+    /// operations.  The function is concurrency-controlled with the provided
+    /// semaphore.  The caller has to make sure the semaphore isn't exhausted.
+    async fn apply_spec_sql_db(
+        spec: Arc<ComputeSpec>,
+        url: Arc<Url>,
+        ctx: Arc<tokio::sync::RwLock<MutableApplyContext>>,
+        jwks_roles: Arc<HashSet<String>>,
+        concurrency_token: Arc<tokio::sync::Semaphore>,
+        db: DB,
+    ) -> Result<()> {
+        let _permit = concurrency_token.acquire().await?;
+
+        let mut client_conn = None;
+
+        for subphase in [
+            DeleteDBRoleReferences,
+            ChangeSchemaPerms,
+            HandleAnonExtension,
+        ] {
+            apply_operations(
+                spec.clone(),
+                ctx.clone(),
+                jwks_roles.clone(),
+                RunInEachDatabase {
+                    db: db.clone(),
+                    subphase,
+                },
+                // Only connect if apply_operation actually wants a connection.
+                // It's quite possible this database doesn't need any queries,
+                // so by not connecting we save time and effort connecting to
+                // that database.
+                || async {
+                    if client_conn.is_none() {
+                        let db_client = Self::get_maintenance_client(&url).await?;
+                        client_conn.replace(db_client);
+                    }
+                    let client = client_conn.as_ref().unwrap();
+                    Ok(client)
+                },
+            )
+            .await?;
+        }
+
+        drop(client_conn);
+
+        Ok::<(), anyhow::Error>(())
+    }
+
+    /// Do initial configuration of the already started Postgres.
+    #[instrument(skip_all)]
+    pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
+        // If connection fails,
+        // it may be the old node with `zenith_admin` superuser.
+        //
+        // In this case we need to connect with old `zenith_admin` name
+        // and create new user. We cannot simply rename connected user,
+        // but we can create a new one and grant it all privileges.
+        let mut url = self.connstr.clone();
+        url.query_pairs_mut()
+            .append_pair("application_name", "apply_config");
+
+        let url = Arc::new(url);
+        let spec = Arc::new(
+            compute_state
+                .pspec
+                .as_ref()
+                .expect("spec must be set")
+                .spec
+                .clone(),
+        );
+
+        // Choose how many concurrent connections to use for applying the spec changes.
+        // If the cluster is not currently Running we don't have to deal with user connections,
+        // and can thus use all `max_connections` connection slots. However, that's generally not
+        // very efficient, so we generally still limit it to a smaller number.
+        let max_concurrent_connections = if compute_state.status != ComputeStatus::Running {
+            // If the settings contain 'max_connections', use that as template
+            if let Some(config) = spec.cluster.settings.find("max_connections") {
+                config.parse::<usize>().ok()
+            } else {
+                // Otherwise, try to find the setting in the postgresql_conf string
+                spec.cluster
+                    .postgresql_conf
+                    .iter()
+                    .flat_map(|conf| conf.split("\n"))
+                    .filter_map(|line| {
+                        if !line.contains("max_connections") {
+                            return None;
+                        }
+
+                        let (key, value) = line.split_once("=")?;
+                        let key = key
+                            .trim_start_matches(char::is_whitespace)
+                            .trim_end_matches(char::is_whitespace);
+
+                        let value = value
+                            .trim_start_matches(char::is_whitespace)
+                            .trim_end_matches(char::is_whitespace);
+
+                        if key != "max_connections" {
+                            return None;
+                        }
+
+                        value.parse::<usize>().ok()
+                    })
+                    .next()
+            }
+            // If max_connections is present, use at most 1/3rd of that.
+            // When max_connections is lower than 30, try to use at least 10 connections, but
+            // never more than max_connections.
+            .map(|limit| match limit {
+                0..10 => limit,
+                10..30 => 10,
+                30.. => limit / 3,
+            })
+            // If we didn't find max_connections, default to 10 concurrent connections.
+            .unwrap_or(10)
+        } else {
+            // state == Running
+            // Because the cluster is already in the Running state, we should assume users are
+            // already connected to the cluster, and high concurrency could negatively
+            // impact user connectivity. Therefore, we can limit concurrency to the number of
+            // reserved superuser connections, which users wouldn't be able to use anyway.
+            spec.cluster
+                .settings
+                .find("superuser_reserved_connections")
+                .iter()
+                .filter_map(|val| val.parse::<usize>().ok())
+                .map(|val| if val > 1 { val - 1 } else { 1 })
+                .last()
+                .unwrap_or(3)
+        };
+
+        // Merge-apply spec & changes to PostgreSQL state.
+        self.apply_spec_sql(spec.clone(), url.clone(), max_concurrent_connections)?;
+
+        if let Some(ref local_proxy) = &spec.clone().local_proxy_config {
             info!("configuring local_proxy");
             local_proxy::configure(local_proxy).context("apply_config local_proxy")?;
         }
 
         // Run migrations separately to not hold up cold starts
         thread::spawn(move || {
-            let mut connstr = connstr.clone();
+            let mut connstr = url.as_ref().clone();
             connstr
                 .query_pairs_mut()
                 .append_pair("application_name", "migrations");
@@ -908,7 +1164,8 @@ impl ComputeNode {
             let mut client = Client::connect(connstr.as_str(), NoTls)?;
             handle_migrations(&mut client).context("apply_config handle_migrations")
         });
-        Ok(())
+
+        Ok::<(), anyhow::Error>(())
     }
 
     // Wrapped this around `pg_ctl reload`, but right now we don't use
@@ -971,32 +1228,16 @@ impl ComputeNode {
         config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
             self.pg_reload_conf()?;
 
-            let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
-
-            // Proceed with post-startup configuration. Note, that order of operations is important.
-            // Disable DDL forwarding because control plane already knows about these roles/databases.
             if spec.mode == ComputeMode::Primary {
-                client.simple_query("SET neon.forward_ddl = false")?;
-                cleanup_instance(&mut client)?;
-                handle_roles(&spec, &mut client)?;
-                handle_databases(&spec, &mut client)?;
-                handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-                handle_grants(
-                    &spec,
-                    &mut client,
-                    self.connstr.as_str(),
-                    self.has_feature(ComputeFeature::AnonExtension),
-                )?;
-                handle_extensions(&spec, &mut client)?;
-                handle_extension_neon(&mut client)?;
-                // We can skip handle_migrations here because a new migration can only appear
-                // if we have a new version of the compute_ctl binary, which can only happen
-                // if compute got restarted, in which case we'll end up inside of apply_config
-                // instead of reconfigure.
-            }
+                let mut url = self.connstr.clone();
+                url.query_pairs_mut()
+                    .append_pair("application_name", "apply_config");
+                let url = Arc::new(url);
 
-            // 'Close' connection
-            drop(client);
+                let spec = Arc::new(spec.clone());
+
+                self.apply_spec_sql(spec, url, 1)?;
+            }
 
             Ok(())
         })?;
diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs
index d27ae58fa2..ee4cf2dfa5 100644
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -23,5 +23,6 @@ pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
 pub mod spec;
+mod spec_apply;
 pub mod swap;
 pub mod sync_sk;
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index b2dc265864..4a1e5ee0e8 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -10,9 +10,9 @@ use std::thread::JoinHandle;
 use std::time::{Duration, Instant};
 
 use anyhow::{bail, Result};
+use futures::StreamExt;
 use ini::Ini;
 use notify::{RecursiveMode, Watcher};
-use postgres::{Client, Transaction};
 use tokio::io::AsyncBufReadExt;
 use tokio::time::timeout;
 use tokio_postgres::NoTls;
@@ -197,27 +197,34 @@ impl Escaping for PgIdent {
 }
 
 /// Build a list of existing Postgres roles
-pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
-    let postgres_roles = xact
-        .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
-        .iter()
+pub async fn get_existing_roles_async(client: &tokio_postgres::Client) -> Result<Vec<Role>> {
+    let postgres_roles = client
+        .query_raw::<str, &String, &[String; 0]>(
+            "SELECT rolname, rolpassword FROM pg_catalog.pg_authid",
+            &[],
+        )
+        .await?
+        .filter_map(|row| async { row.ok() })
         .map(|row| Role {
             name: row.get("rolname"),
             encrypted_password: row.get("rolpassword"),
             options: None,
         })
-        .collect();
+        .collect()
+        .await;
 
     Ok(postgres_roles)
 }
 
 /// Build a list of existing Postgres databases
-pub fn get_existing_dbs(client: &mut Client) -> Result<HashMap<String, Database>> {
+pub async fn get_existing_dbs_async(
+    client: &tokio_postgres::Client,
+) -> Result<HashMap<String, Database>> {
     // `pg_database.datconnlimit = -2` means that the database is in the
     // invalid state. See:
     //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
-    let postgres_dbs: Vec<Database> = client
-        .query(
+    let rowstream = client
+        .query_raw::<str, &String, &[String; 0]>(
             "SELECT
                 datname AS name,
                 datdba::regrole::text AS owner,
@@ -226,8 +233,11 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<HashMap<String, Database>
             FROM
                 pg_catalog.pg_database;",
             &[],
-        )?
-        .iter()
+        )
+        .await?;
+
+    let dbs_map = rowstream
+        .filter_map(|r| async { r.ok() })
         .map(|row| Database {
             name: row.get("name"),
             owner: row.get("owner"),
@@ -235,12 +245,9 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<HashMap<String, Database>
             invalid: row.get("invalid"),
             options: None,
         })
-        .collect();
-
-    let dbs_map = postgres_dbs
-        .iter()
         .map(|db| (db.name.clone(), db.clone()))
-        .collect::<HashMap<_, _>>();
+        .collect::<HashMap<_, _>>()
+        .await;
 
     Ok(dbs_map)
 }
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 73f3d1006a..c7d2deb090 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,22 +1,17 @@
-use std::collections::HashSet;
+use anyhow::{anyhow, bail, Result};
+use postgres::Client;
+use reqwest::StatusCode;
 use std::fs::File;
 use std::path::Path;
-use std::str::FromStr;
-
-use anyhow::{anyhow, bail, Context, Result};
-use postgres::config::Config;
-use postgres::{Client, NoTls};
-use reqwest::StatusCode;
-use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};
+use tracing::{error, info, instrument, warn};
 
 use crate::config;
-use crate::logger::inlinify;
 use crate::migration::MigrationRunner;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;
 
 use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
-use compute_api::spec::{ComputeSpec, PgIdent, Role};
+use compute_api::spec::ComputeSpec;
 
 // Do control plane request and return response if any. In case of error it
 // returns a bool flag indicating whether it makes sense to retry the request
@@ -151,625 +146,6 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
     Ok(())
 }
 
-/// Compute could be unexpectedly shut down, for example, during the
-/// database dropping. This leaves the database in the invalid state,
-/// which prevents new db creation with the same name. This function
-/// will clean it up before proceeding with catalog updates. All
-/// possible future cleanup operations may go here too.
-#[instrument(skip_all)]
-pub fn cleanup_instance(client: &mut Client) -> Result<()> {
-    let existing_dbs = get_existing_dbs(client)?;
-
-    for (_, db) in existing_dbs {
-        if db.invalid {
-            // After recent commit in Postgres, interrupted DROP DATABASE
-            // leaves the database in the invalid state. According to the
-            // commit message, the only option for user is to drop it again.
-            // See:
-            //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
-            //
-            // Postgres Neon extension is done the way, that db is de-registered
-            // in the control plane metadata only after it is dropped. So there is
-            // a chance that it still thinks that db should exist. This means
-            // that it will be re-created by `handle_databases()`. Yet, it's fine
-            // as user can just repeat drop (in vanilla Postgres they would need
-            // to do the same, btw).
-            let query = format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote());
-            info!("dropping invalid database {}", db.name);
-            client.execute(query.as_str(), &[])?;
-        }
-    }
-
-    Ok(())
-}
-
-/// Given a cluster spec json and open transaction it handles roles creation,
-/// deletion and update.
-#[instrument(skip_all)]
-pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
-    let mut xact = client.transaction()?;
-    let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
-
-    let mut jwks_roles = HashSet::new();
-    if let Some(local_proxy) = &spec.local_proxy_config {
-        for jwks_setting in local_proxy.jwks.iter().flatten() {
-            for role_name in &jwks_setting.role_names {
-                jwks_roles.insert(role_name.clone());
-            }
-        }
-    }
-
-    // Print a list of existing Postgres roles (only in debug mode)
-    if span_enabled!(Level::INFO) {
-        let mut vec = Vec::new();
-        for r in &existing_roles {
-            vec.push(format!(
-                "{}:{}",
-                r.name,
-                if r.encrypted_password.is_some() {
-                    "[FILTERED]"
-                } else {
-                    "(null)"
-                }
-            ));
-        }
-
-        info!("postgres roles (total {}): {:?}", vec.len(), vec);
-    }
-
-    // Process delta operations first
-    if let Some(ops) = &spec.delta_operations {
-        info!("processing role renames");
-        for op in ops {
-            match op.action.as_ref() {
-                "delete_role" => {
-                    // no-op now, roles will be deleted at the end of configuration
-                }
-                // Renaming role drops its password, since role name is
-                // used as a salt there.  It is important that this role
-                // is recorded with a new `name` in the `roles` list.
-                // Follow up roles update will set the new password.
-                "rename_role" => {
-                    let new_name = op.new_name.as_ref().unwrap();
-
-                    // XXX: with a limited number of roles it is fine, but consider making it a HashMap
-                    if existing_roles.iter().any(|r| r.name == op.name) {
-                        let query: String = format!(
-                            "ALTER ROLE {} RENAME TO {}",
-                            op.name.pg_quote(),
-                            new_name.pg_quote()
-                        );
-
-                        warn!("renaming role '{}' to '{}'", op.name, new_name);
-                        xact.execute(query.as_str(), &[])?;
-                    }
-                }
-                _ => {}
-            }
-        }
-    }
-
-    // Refresh Postgres roles info to handle possible roles renaming
-    let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
-
-    info!(
-        "handling cluster spec roles (total {})",
-        spec.cluster.roles.len()
-    );
-    for role in &spec.cluster.roles {
-        let name = &role.name;
-        // XXX: with a limited number of roles it is fine, but consider making it a HashMap
-        let pg_role = existing_roles.iter().find(|r| r.name == *name);
-
-        enum RoleAction {
-            None,
-            Update,
-            Create,
-        }
-        let action = if let Some(r) = pg_role {
-            if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
-                || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
-            {
-                RoleAction::Update
-            } else if let Some(pg_pwd) = &r.encrypted_password {
-                // Check whether password changed or not (trim 'md5' prefix first if any)
-                //
-                // This is a backward compatibility hack, which comes from the times when we were using
-                // md5 for everyone and hashes were stored in the console db without md5 prefix. So when
-                // role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix,
-                // but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix.
-                // Here is the only place so far where we compare hashes, so it seems to be the best candidate
-                // to place this compatibility layer.
-                let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") {
-                    stripped
-                } else {
-                    pg_pwd
-                };
-                if pg_pwd != *role.encrypted_password.as_ref().unwrap() {
-                    RoleAction::Update
-                } else {
-                    RoleAction::None
-                }
-            } else {
-                RoleAction::None
-            }
-        } else {
-            RoleAction::Create
-        };
-
-        match action {
-            RoleAction::None => {}
-            RoleAction::Update => {
-                // This can be run on /every/ role! Not just ones created through the console.
-                // This means that if you add some funny ALTER here that adds a permission,
-                // this will get run even on user-created roles! This will result in different
-                // behavior before and after a spec gets reapplied. The below ALTER as it stands
-                // now only grants LOGIN and changes the password. Please do not allow this branch
-                // to do anything silly.
-                let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
-                query.push_str(&role.to_pg_options());
-                xact.execute(query.as_str(), &[])?;
-            }
-            RoleAction::Create => {
-                // This branch only runs when roles are created through the console, so it is
-                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
-                // from neon_superuser.
-                let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
-                    name.pg_quote()
-                );
-                if jwks_roles.contains(name.as_str()) {
-                    query = format!("CREATE ROLE {}", name.pg_quote());
-                }
-                info!("running role create query: '{}'", &query);
-                query.push_str(&role.to_pg_options());
-                xact.execute(query.as_str(), &[])?;
-            }
-        }
-
-        if span_enabled!(Level::INFO) {
-            let pwd = if role.encrypted_password.is_some() {
-                "[FILTERED]"
-            } else {
-                "(null)"
-            };
-            let action_str = match action {
-                RoleAction::None => "",
-                RoleAction::Create => " -> create",
-                RoleAction::Update => " -> update",
-            };
-            info!(" - {}:{}{}", name, pwd, action_str);
-        }
-    }
-
-    xact.commit()?;
-
-    Ok(())
-}
-
-/// Reassign all dependent objects and delete requested roles.
-#[instrument(skip_all)]
-pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
-    if let Some(ops) = &spec.delta_operations {
-        // First, reassign all dependent objects to db owners.
-        info!("reassigning dependent objects of to-be-deleted roles");
-
-        // Fetch existing roles. We could've exported and used `existing_roles` from
-        // `handle_roles()`, but we only make this list there before creating new roles.
-        // Which is probably fine as we never create to-be-deleted roles, but that'd
-        // just look a bit untidy. Anyway, the entire `pg_roles` should be in shared
-        // buffers already, so this shouldn't be a big deal.
-        let mut xact = client.transaction()?;
-        let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
-        xact.commit()?;
-
-        for op in ops {
-            // Check that role is still present in Postgres, as this could be a
-            // restart with the same spec after role deletion.
-            if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) {
-                reassign_owned_objects(spec, connstr, &op.name)?;
-            }
-        }
-
-        // Second, proceed with role deletions.
-        info!("processing role deletions");
-        let mut xact = client.transaction()?;
-        for op in ops {
-            // We do not check either role exists or not,
-            // Postgres will take care of it for us
-            if op.action == "delete_role" {
-                let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.pg_quote());
-
-                warn!("deleting role '{}'", &op.name);
-                xact.execute(query.as_str(), &[])?;
-            }
-        }
-        xact.commit()?;
-    }
-
-    Ok(())
-}
-
-fn reassign_owned_objects_in_one_db(
-    conf: Config,
-    role_name: &PgIdent,
-    db_owner: &PgIdent,
-) -> Result<()> {
-    let mut client = conf.connect(NoTls)?;
-
-    // This will reassign all dependent objects to the db owner
-    let reassign_query = format!(
-        "REASSIGN OWNED BY {} TO {}",
-        role_name.pg_quote(),
-        db_owner.pg_quote()
-    );
-    info!(
-        "reassigning objects owned by '{}' in db '{}' to '{}'",
-        role_name,
-        conf.get_dbname().unwrap_or(""),
-        db_owner
-    );
-    client.simple_query(&reassign_query)?;
-
-    // This now will only drop privileges of the role
-    let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
-    client.simple_query(&drop_query)?;
-    Ok(())
-}
-
-// Reassign all owned objects in all databases to the owner of the database.
-fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> {
-    for db in &spec.cluster.databases {
-        if db.owner != *role_name {
-            let mut conf = Config::from_str(connstr)?;
-            conf.dbname(&db.name);
-            reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?;
-        }
-    }
-
-    // Also handle case when there are no databases in the spec.
-    // In this case we need to reassign objects in the default database.
-    let conf = Config::from_str(connstr)?;
-    let db_owner = PgIdent::from_str("cloud_admin")?;
-    reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?;
-
-    Ok(())
-}
-
-/// It follows mostly the same logic as `handle_roles()` excepting that we
-/// does not use an explicit transactions block, since major database operations
-/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
-/// atomicity should be enough here due to the order of operations and various checks,
-/// which together provide us idempotency.
-#[instrument(skip_all)]
-pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
-    let existing_dbs = get_existing_dbs(client)?;
-
-    // Print a list of existing Postgres databases (only in debug mode)
-    if span_enabled!(Level::INFO) {
-        let mut vec = Vec::new();
-        for (dbname, db) in &existing_dbs {
-            vec.push(format!("{}:{}", dbname, db.owner));
-        }
-        info!("postgres databases (total {}): {:?}", vec.len(), vec);
-    }
-
-    // Process delta operations first
-    if let Some(ops) = &spec.delta_operations {
-        info!("processing delta operations on databases");
-        for op in ops {
-            match op.action.as_ref() {
-                // We do not check either DB exists or not,
-                // Postgres will take care of it for us
-                "delete_db" => {
-                    // In Postgres we can't drop a database if it is a template.
-                    // So we need to unset the template flag first, but it could
-                    // be a retry, so we could've already dropped the database.
-                    // Check that database exists first to make it idempotent.
-                    let unset_template_query: String = format!(
-                        "
-                        DO $$
-                        BEGIN
-                            IF EXISTS(
-                                SELECT 1
-                                FROM pg_catalog.pg_database
-                                WHERE datname = {}
-                            )
-                            THEN
-                            ALTER DATABASE {} is_template false;
-                            END IF;
-                        END
-                        $$;",
-                        escape_literal(&op.name),
-                        &op.name.pg_quote()
-                    );
-                    // Use FORCE to drop database even if there are active connections.
-                    // We run this from `cloud_admin`, so it should have enough privileges.
-                    // NB: there could be other db states, which prevent us from dropping
-                    // the database. For example, if db is used by any active subscription
-                    // or replication slot.
-                    // TODO: deal with it once we allow logical replication. Proper fix should
-                    // involve returning an error code to the control plane, so it could
-                    // figure out that this is a non-retryable error, return it to the user
-                    // and fail operation permanently.
-                    let drop_db_query: String = format!(
-                        "DROP DATABASE IF EXISTS {} WITH (FORCE)",
-                        &op.name.pg_quote()
-                    );
-
-                    warn!("deleting database '{}'", &op.name);
-                    client.execute(unset_template_query.as_str(), &[])?;
-                    client.execute(drop_db_query.as_str(), &[])?;
-                }
-                "rename_db" => {
-                    let new_name = op.new_name.as_ref().unwrap();
-
-                    if existing_dbs.contains_key(&op.name) {
-                        let query: String = format!(
-                            "ALTER DATABASE {} RENAME TO {}",
-                            op.name.pg_quote(),
-                            new_name.pg_quote()
-                        );
-
-                        warn!("renaming database '{}' to '{}'", op.name, new_name);
-                        client.execute(query.as_str(), &[])?;
-                    }
-                }
-                _ => {}
-            }
-        }
-    }
-
-    // Refresh Postgres databases info to handle possible renames
-    let existing_dbs = get_existing_dbs(client)?;
-
-    info!(
-        "handling cluster spec databases (total {})",
-        spec.cluster.databases.len()
-    );
-    for db in &spec.cluster.databases {
-        let name = &db.name;
-        let pg_db = existing_dbs.get(name);
-
-        enum DatabaseAction {
-            None,
-            Update,
-            Create,
-        }
-        let action = if let Some(r) = pg_db {
-            // XXX: db owner name is returned as quoted string from Postgres,
-            // when quoting is needed.
-            let new_owner = if r.owner.starts_with('"') {
-                db.owner.pg_quote()
-            } else {
-                db.owner.clone()
-            };
-
-            if new_owner != r.owner {
-                // Update the owner
-                DatabaseAction::Update
-            } else {
-                DatabaseAction::None
-            }
-        } else {
-            DatabaseAction::Create
-        };
-
-        match action {
-            DatabaseAction::None => {}
-            DatabaseAction::Update => {
-                let query: String = format!(
-                    "ALTER DATABASE {} OWNER TO {}",
-                    name.pg_quote(),
-                    db.owner.pg_quote()
-                );
-                let _guard = info_span!("executing", query).entered();
-                client.execute(query.as_str(), &[])?;
-            }
-            DatabaseAction::Create => {
-                let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
-                query.push_str(&db.to_pg_options());
-                let _guard = info_span!("executing", query).entered();
-                client.execute(query.as_str(), &[])?;
-                let grant_query: String = format!(
-                    "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
-                    name.pg_quote()
-                );
-                client.execute(grant_query.as_str(), &[])?;
-            }
-        };
-
-        if span_enabled!(Level::INFO) {
-            let action_str = match action {
-                DatabaseAction::None => "",
-                DatabaseAction::Create => " -> create",
-                DatabaseAction::Update => " -> update",
-            };
-            info!(" - {}:{}{}", db.name, db.owner, action_str);
-        }
-    }
-
-    Ok(())
-}
-
-/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
-/// to allow users creating trusted extensions and re-creating `public` schema, for example.
-#[instrument(skip_all)]
-pub fn handle_grants(
-    spec: &ComputeSpec,
-    client: &mut Client,
-    connstr: &str,
-    enable_anon_extension: bool,
-) -> Result<()> {
-    info!("modifying database permissions");
-    let existing_dbs = get_existing_dbs(client)?;
-
-    // Do some per-database access adjustments. We'd better do this at db creation time,
-    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
-    // atomically.
-    for db in &spec.cluster.databases {
-        match existing_dbs.get(&db.name) {
-            Some(pg_db) => {
-                if pg_db.restrict_conn || pg_db.invalid {
-                    info!(
-                        "skipping grants for db {} (invalid: {}, connections not allowed: {})",
-                        db.name, pg_db.invalid, pg_db.restrict_conn
-                    );
-                    continue;
-                }
-            }
-            None => {
-                bail!(
-                    "database {} doesn't exist in Postgres after handle_databases()",
-                    db.name
-                );
-            }
-        }
-
-        let mut conf = Config::from_str(connstr)?;
-        conf.dbname(&db.name);
-
-        let mut db_client = conf.connect(NoTls)?;
-
-        // This will only change ownership on the schema itself, not the objects
-        // inside it. Without it owner of the `public` schema will be `cloud_admin`
-        // and database owner cannot do anything with it. SQL procedure ensures
-        // that it won't error out if schema `public` doesn't exist.
-        let alter_query = format!(
-            "DO $$\n\
-                DECLARE\n\
-                    schema_owner TEXT;\n\
-                BEGIN\n\
-                    IF EXISTS(\n\
-                        SELECT nspname\n\
-                        FROM pg_catalog.pg_namespace\n\
-                        WHERE nspname = 'public'\n\
-                    )\n\
-                    THEN\n\
-                        SELECT nspowner::regrole::text\n\
-                            FROM pg_catalog.pg_namespace\n\
-                            WHERE nspname = 'public'\n\
-                            INTO schema_owner;\n\
-                \n\
-                        IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'\n\
-                        THEN\n\
-                            ALTER SCHEMA public OWNER TO {};\n\
-                        END IF;\n\
-                    END IF;\n\
-                END\n\
-            $$;",
-            db.owner.pg_quote()
-        );
-        db_client.simple_query(&alter_query)?;
-
-        // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
-        // This is needed because since postgres 15 this privilege is removed by default.
-        // TODO: web_access isn't created for almost 1 year. It could be that we have
-        // active users of 1 year old projects, but hopefully not, so check it and
-        // remove this code if possible. The worst thing that could happen is that
-        // user won't be able to use public schema in NEW databases created in the
-        // very OLD project.
-        //
-        // Also, alter default permissions so that relations created by extensions can be
-        // used by neon_superuser without permission issues.
-        let grant_query = "DO $$\n\
-                BEGIN\n\
-                    IF EXISTS(\n\
-                        SELECT nspname\n\
-                        FROM pg_catalog.pg_namespace\n\
-                        WHERE nspname = 'public'\n\
-                    ) AND\n\
-                    current_setting('server_version_num')::int/10000 >= 15\n\
-                    THEN\n\
-                        IF EXISTS(\n\
-                            SELECT rolname\n\
-                            FROM pg_catalog.pg_roles\n\
-                            WHERE rolname = 'web_access'\n\
-                        )\n\
-                        THEN\n\
-                            GRANT CREATE ON SCHEMA public TO web_access;\n\
-                        END IF;\n\
-                    END IF;\n\
-                    IF EXISTS(\n\
-                        SELECT nspname\n\
-                        FROM pg_catalog.pg_namespace\n\
-                        WHERE nspname = 'public'\n\
-                    )\n\
-                    THEN\n\
-                        ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
-                        ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
-                    END IF;\n\
-                END\n\
-            $$;"
-        .to_string();
-
-        info!(
-            "grant query for db {} : {}",
-            &db.name,
-            inlinify(&grant_query)
-        );
-        db_client.simple_query(&grant_query)?;
-
-        // it is important to run this after all grants
-        if enable_anon_extension {
-            handle_extension_anon(spec, &db.owner, &mut db_client, false)
-                .context("handle_grants handle_extension_anon")?;
-        }
-    }
-
-    Ok(())
-}
-
-/// Create required system extensions
-#[instrument(skip_all)]
-pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
-    if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
-        if libs.contains("pg_stat_statements") {
-            // Create extension only if this compute really needs it
-            let query = "CREATE EXTENSION IF NOT EXISTS pg_stat_statements";
-            info!("creating system extensions with query: {}", query);
-            client.simple_query(query)?;
-        }
-    }
-
-    Ok(())
-}
-
-/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database
-#[instrument(skip_all)]
-pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
-    info!("handle extension neon");
-
-    let mut query = "CREATE SCHEMA IF NOT EXISTS neon";
-    client.simple_query(query)?;
-
-    query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon";
-    info!("create neon extension with query: {}", query);
-    client.simple_query(query)?;
-
-    query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'";
-    client.simple_query(query)?;
-
-    query = "ALTER EXTENSION neon SET SCHEMA neon";
-    info!("alter neon extension schema with query: {}", query);
-    client.simple_query(query)?;
-
-    // this will be a no-op if extension is already up to date,
-    // which may happen in two cases:
-    // - extension was just installed
-    // - extension was already installed and is up to date
-    let query = "ALTER EXTENSION neon UPDATE";
-    info!("update neon extension version with query: {}", query);
-    if let Err(e) = client.simple_query(query) {
-        error!(
-            "failed to upgrade neon extension during `handle_extension_neon`: {}",
-            e
-        );
-    }
-
-    Ok(())
-}
-
 #[instrument(skip_all)]
 pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
     info!("handle neon extension upgrade");
diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs
new file mode 100644
index 0000000000..7308d5d36e
--- /dev/null
+++ b/compute_tools/src/spec_apply.rs
@@ -0,0 +1,680 @@
+use std::collections::{HashMap, HashSet};
+use std::fmt::{Debug, Formatter};
+use std::future::Future;
+use std::iter::empty;
+use std::iter::once;
+use std::sync::Arc;
+
+use crate::compute::construct_superuser_query;
+use crate::pg_helpers::{escape_literal, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt};
+use anyhow::{bail, Result};
+use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role};
+use futures::future::join_all;
+use tokio::sync::RwLock;
+use tokio_postgres::Client;
+use tracing::{debug, info_span, Instrument};
+
+#[derive(Clone)]
+pub enum DB {
+    SystemDB,
+    UserDB(Database),
+}
+
+impl DB {
+    pub fn new(db: Database) -> DB {
+        Self::UserDB(db)
+    }
+
+    pub fn is_owned_by(&self, role: &PgIdent) -> bool {
+        match self {
+            DB::SystemDB => false,
+            DB::UserDB(db) => &db.owner == role,
+        }
+    }
+}
+
+impl Debug for DB {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        match self {
+            DB::SystemDB => f.debug_tuple("SystemDB").finish(),
+            DB::UserDB(db) => f.debug_tuple("UserDB").field(&db.name).finish(),
+        }
+    }
+}
+
+#[derive(Copy, Clone, Debug)]
+pub enum PerDatabasePhase {
+    DeleteDBRoleReferences,
+    ChangeSchemaPerms,
+    HandleAnonExtension,
+}
+
+#[derive(Clone, Debug)]
+pub enum ApplySpecPhase {
+    CreateSuperUser,
+    DropInvalidDatabases,
+    RenameRoles,
+    CreateAndAlterRoles,
+    RenameAndDeleteDatabases,
+    CreateAndAlterDatabases,
+    RunInEachDatabase { db: DB, subphase: PerDatabasePhase },
+    HandleOtherExtensions,
+    HandleNeonExtension,
+    CreateAvailabilityCheck,
+    DropRoles,
+}
+
+pub struct Operation {
+    pub query: String,
+    pub comment: Option<String>,
+}
+
+pub struct MutableApplyContext {
+    pub roles: HashMap<String, Role>,
+    pub dbs: HashMap<String, Database>,
+}
+
+/// Appply the operations that belong to the given spec apply phase.
+///
+/// Commands within a single phase are executed in order of Iterator yield.
+/// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database
+/// indicated by its `db` field, and can share a single client for all changes
+/// to that database.
+///
+/// Notes:
+/// - Commands are pipelined, and thus may cause incomplete apply if one
+///   command of many fails.
+/// - Failing commands will fail the phase's apply step once the return value
+///   is processed.
+/// - No timeouts have (yet) been implemented.
+/// - The caller is responsible for limiting and/or applying concurrency.
+pub async fn apply_operations<'a, Fut, F>(
+    spec: Arc<ComputeSpec>,
+    ctx: Arc<RwLock<MutableApplyContext>>,
+    jwks_roles: Arc<HashSet<String>>,
+    apply_spec_phase: ApplySpecPhase,
+    client: F,
+) -> Result<()>
+where
+    F: FnOnce() -> Fut,
+    Fut: Future<Output = Result<&'a Client>>,
+{
+    debug!("Starting phase {:?}", &apply_spec_phase);
+    let span = info_span!("db_apply_changes", phase=?apply_spec_phase);
+    let span2 = span.clone();
+    async move {
+        debug!("Processing phase {:?}", &apply_spec_phase);
+        let ctx = ctx;
+
+        let mut ops = get_operations(&spec, &ctx, &jwks_roles, &apply_spec_phase)
+            .await?
+            .peekable();
+
+        // Return (and by doing so, skip requesting the PostgreSQL client) if
+        // we don't have any operations scheduled.
+        if ops.peek().is_none() {
+            return Ok(());
+        }
+
+        let client = client().await?;
+
+        debug!("Applying phase {:?}", &apply_spec_phase);
+
+        let active_queries = ops
+            .map(|op| {
+                let Operation { comment, query } = op;
+                let inspan = match comment {
+                    None => span.clone(),
+                    Some(comment) => info_span!("phase {}: {}", comment),
+                };
+
+                async {
+                    let query = query;
+                    let res = client.simple_query(&query).await;
+                    debug!(
+                        "{} {}",
+                        if res.is_ok() {
+                            "successfully executed"
+                        } else {
+                            "failed to execute"
+                        },
+                        query
+                    );
+                    res
+                }
+                .instrument(inspan)
+            })
+            .collect::<Vec<_>>();
+
+        drop(ctx);
+
+        for it in join_all(active_queries).await {
+            drop(it?);
+        }
+
+        debug!("Completed phase {:?}", &apply_spec_phase);
+
+        Ok(())
+    }
+    .instrument(span2)
+    .await
+}
+
+/// Create a stream of operations to be executed for that phase of applying
+/// changes.
+///
+/// In the future we may generate a single stream of changes and then
+/// sort/merge/batch execution, but for now this is a nice way to improve
+/// batching behaviour of the commands.
+async fn get_operations<'a>(
+    spec: &'a ComputeSpec,
+    ctx: &'a RwLock<MutableApplyContext>,
+    jwks_roles: &'a HashSet<String>,
+    apply_spec_phase: &'a ApplySpecPhase,
+) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
+    match apply_spec_phase {
+        ApplySpecPhase::CreateSuperUser => {
+            let query = construct_superuser_query(spec);
+
+            Ok(Box::new(once(Operation {
+                query,
+                comment: None,
+            })))
+        }
+        ApplySpecPhase::DropInvalidDatabases => {
+            let mut ctx = ctx.write().await;
+            let databases = &mut ctx.dbs;
+
+            let keys: Vec<_> = databases
+                .iter()
+                .filter(|(_, db)| db.invalid)
+                .map(|(dbname, _)| dbname.clone())
+                .collect();
+
+            // After recent commit in Postgres, interrupted DROP DATABASE
+            // leaves the database in the invalid state. According to the
+            // commit message, the only option for user is to drop it again.
+            // See:
+            //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
+            //
+            // Postgres Neon extension is done the way, that db is de-registered
+            // in the control plane metadata only after it is dropped. So there is
+            // a chance that it still thinks that the db should exist. This means
+            // that it will be re-created by the `CreateDatabases` phase. This
+            // is fine, as user can just drop the table again (in vanilla
+            // Postgres they would need to do the same).
+            let operations = keys
+                .into_iter()
+                .filter_map(move |dbname| ctx.dbs.remove(&dbname))
+                .map(|db| Operation {
+                    query: format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote()),
+                    comment: Some(format!("Dropping invalid database {}", db.name)),
+                });
+
+            Ok(Box::new(operations))
+        }
+        ApplySpecPhase::RenameRoles => {
+            let mut ctx = ctx.write().await;
+
+            let operations = spec
+                .delta_operations
+                .iter()
+                .flatten()
+                .filter(|op| op.action == "rename_role")
+                .filter_map(move |op| {
+                    let roles = &mut ctx.roles;
+
+                    if roles.contains_key(op.name.as_str()) {
+                        None
+                    } else {
+                        let new_name = op.new_name.as_ref().unwrap();
+                        let mut role = roles.remove(op.name.as_str()).unwrap();
+
+                        role.name = new_name.clone();
+                        role.encrypted_password = None;
+                        roles.insert(role.name.clone(), role);
+
+                        Some(Operation {
+                            query: format!(
+                                "ALTER ROLE {} RENAME TO {}",
+                                op.name.pg_quote(),
+                                new_name.pg_quote()
+                            ),
+                            comment: Some(format!("renaming role '{}' to '{}'", op.name, new_name)),
+                        })
+                    }
+                });
+
+            Ok(Box::new(operations))
+        }
+        ApplySpecPhase::CreateAndAlterRoles => {
+            let mut ctx = ctx.write().await;
+
+            let operations = spec.cluster.roles
+                .iter()
+                .filter_map(move |role| {
+                    let roles = &mut ctx.roles;
+                    let db_role = roles.get(&role.name);
+
+                    match db_role {
+                        Some(db_role) => {
+                            if db_role.encrypted_password != role.encrypted_password {
+                                // This can be run on /every/ role! Not just ones created through the console.
+                                // This means that if you add some funny ALTER here that adds a permission,
+                                // this will get run even on user-created roles! This will result in different
+                                // behavior before and after a spec gets reapplied. The below ALTER as it stands
+                                // now only grants LOGIN and changes the password. Please do not allow this branch
+                                // to do anything silly.
+                                Some(Operation {
+                                    query: format!(
+                                        "ALTER ROLE {} {}",
+                                        role.name.pg_quote(),
+                                        role.to_pg_options(),
+                                    ),
+                                    comment: None,
+                                })
+                            } else {
+                                None
+                            }
+                        }
+                        None => {
+                            let query = if !jwks_roles.contains(role.name.as_str()) {
+                                format!(
+                                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser {}",
+                                    role.name.pg_quote(),
+                                    role.to_pg_options(),
+                                )
+                            } else {
+                                format!(
+                                    "CREATE ROLE {} {}",
+                                    role.name.pg_quote(),
+                                    role.to_pg_options(),
+                                )
+                            };
+                            Some(Operation {
+                                query,
+                                comment: Some(format!("creating role {}", role.name)),
+                            })
+                        }
+                    }
+                });
+
+            Ok(Box::new(operations))
+        }
+        ApplySpecPhase::RenameAndDeleteDatabases => {
+            let mut ctx = ctx.write().await;
+
+            let operations = spec
+                .delta_operations
+                .iter()
+                .flatten()
+                .filter_map(move |op| {
+                    let databases = &mut ctx.dbs;
+                    match op.action.as_str() {
+                        // We do not check whether the DB exists or not,
+                        // Postgres will take care of it for us
+                        "delete_db" => {
+                            // In Postgres we can't drop a database if it is a template.
+                            // So we need to unset the template flag first, but it could
+                            // be a retry, so we could've already dropped the database.
+                            // Check that database exists first to make it idempotent.
+                            let unset_template_query: String = format!(
+                                include_str!("sql/unset_template_for_drop_dbs.sql"),
+                                datname_str = escape_literal(&op.name),
+                                datname = &op.name.pg_quote()
+                            );
+
+                            // Use FORCE to drop database even if there are active connections.
+                            // We run this from `cloud_admin`, so it should have enough privileges.
+                            // NB: there could be other db states, which prevent us from dropping
+                            // the database. For example, if db is used by any active subscription
+                            // or replication slot.
+                            // TODO: deal with it once we allow logical replication. Proper fix should
+                            // involve returning an error code to the control plane, so it could
+                            // figure out that this is a non-retryable error, return it to the user
+                            // and fail operation permanently.
+                            let drop_db_query: String = format!(
+                                "DROP DATABASE IF EXISTS {} WITH (FORCE)",
+                                &op.name.pg_quote()
+                            );
+
+                            databases.remove(&op.name);
+
+                            Some(vec![
+                                Operation {
+                                    query: unset_template_query,
+                                    comment: Some(format!(
+                                        "optionally clearing template flags for DB {}",
+                                        op.name,
+                                    )),
+                                },
+                                Operation {
+                                    query: drop_db_query,
+                                    comment: Some(format!("deleting database {}", op.name,)),
+                                },
+                            ])
+                        }
+                        "rename_db" => {
+                            if let Some(mut db) = databases.remove(&op.name) {
+                                // update state of known databases
+                                let new_name = op.new_name.as_ref().unwrap();
+                                db.name = new_name.clone();
+                                databases.insert(db.name.clone(), db);
+
+                                Some(vec![Operation {
+                                    query: format!(
+                                        "ALTER DATABASE {} RENAME TO {}",
+                                        op.name.pg_quote(),
+                                        new_name.pg_quote(),
+                                    ),
+                                    comment: Some(format!(
+                                        "renaming database '{}' to '{}'",
+                                        op.name, new_name
+                                    )),
+                                }])
+                            } else {
+                                None
+                            }
+                        }
+                        _ => None,
+                    }
+                })
+                .flatten();
+
+            Ok(Box::new(operations))
+        }
+        ApplySpecPhase::CreateAndAlterDatabases => {
+            let mut ctx = ctx.write().await;
+
+            let operations = spec
+                .cluster
+                .databases
+                .iter()
+                .filter_map(move |db| {
+                    let databases = &mut ctx.dbs;
+                    if let Some(edb) = databases.get_mut(&db.name) {
+                        let change_owner = if edb.owner.starts_with('"') {
+                            db.owner.pg_quote() != edb.owner
+                        } else {
+                            db.owner != edb.owner
+                        };
+
+                        edb.owner = db.owner.clone();
+
+                        if change_owner {
+                            Some(vec![Operation {
+                                query: format!(
+                                    "ALTER DATABASE {} OWNER TO {}",
+                                    db.name.pg_quote(),
+                                    db.owner.pg_quote()
+                                ),
+                                comment: Some(format!(
+                                    "changing database owner of database {} to {}",
+                                    db.name, db.owner
+                                )),
+                            }])
+                        } else {
+                            None
+                        }
+                    } else {
+                        databases.insert(db.name.clone(), db.clone());
+
+                        Some(vec![
+                            Operation {
+                                query: format!(
+                                    "CREATE DATABASE {} {}",
+                                    db.name.pg_quote(),
+                                    db.to_pg_options(),
+                                ),
+                                comment: None,
+                            },
+                            Operation {
+                                query: format!(
+                                    "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
+                                    db.name.pg_quote()
+                                ),
+                                comment: None,
+                            },
+                        ])
+                    }
+                })
+                .flatten();
+
+            Ok(Box::new(operations))
+        }
+        ApplySpecPhase::RunInEachDatabase { db, subphase } => {
+            match subphase {
+                PerDatabasePhase::DeleteDBRoleReferences => {
+                    let ctx = ctx.read().await;
+
+                    let operations =
+                        spec.delta_operations
+                            .iter()
+                            .flatten()
+                            .filter(|op| op.action == "delete_role")
+                            .filter_map(move |op| {
+                                if db.is_owned_by(&op.name) {
+                                    return None;
+                                }
+                                if !ctx.roles.contains_key(&op.name) {
+                                    return None;
+                                }
+                                let quoted = op.name.pg_quote();
+                                let new_owner = match &db {
+                                    DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(),
+                                    DB::UserDB(db) => db.owner.pg_quote(),
+                                };
+
+                                Some(vec![
+                                    // This will reassign all dependent objects to the db owner
+                                    Operation {
+                                        query: format!(
+                                            "REASSIGN OWNED BY {} TO {}",
+                                            quoted, new_owner,
+                                        ),
+                                        comment: None,
+                                    },
+                                    // This now will only drop privileges of the role
+                                    Operation {
+                                        query: format!("DROP OWNED BY {}", quoted),
+                                        comment: None,
+                                    },
+                                ])
+                            })
+                            .flatten();
+
+                    Ok(Box::new(operations))
+                }
+                PerDatabasePhase::ChangeSchemaPerms => {
+                    let ctx = ctx.read().await;
+                    let databases = &ctx.dbs;
+
+                    let db = match &db {
+                        // ignore schema permissions on the system database
+                        DB::SystemDB => return Ok(Box::new(empty())),
+                        DB::UserDB(db) => db,
+                    };
+
+                    if databases.get(&db.name).is_none() {
+                        bail!("database {} doesn't exist in PostgreSQL", db.name);
+                    }
+
+                    let edb = databases.get(&db.name).unwrap();
+
+                    if edb.restrict_conn || edb.invalid {
+                        return Ok(Box::new(empty()));
+                    }
+
+                    let operations = vec![
+                        Operation {
+                            query: format!(
+                                include_str!("sql/set_public_schema_owner.sql"),
+                                db_owner = db.owner.pg_quote()
+                            ),
+                            comment: None,
+                        },
+                        Operation {
+                            query: String::from(include_str!("sql/default_grants.sql")),
+                            comment: None,
+                        },
+                    ]
+                    .into_iter();
+
+                    Ok(Box::new(operations))
+                }
+                PerDatabasePhase::HandleAnonExtension => {
+                    // Only install Anon into user databases
+                    let db = match &db {
+                        DB::SystemDB => return Ok(Box::new(empty())),
+                        DB::UserDB(db) => db,
+                    };
+                    // Never install Anon when it's not enabled as feature
+                    if !spec.features.contains(&ComputeFeature::AnonExtension) {
+                        return Ok(Box::new(empty()));
+                    }
+
+                    // Only install Anon when it's added in preload libraries
+                    let opt_libs = spec.cluster.settings.find("shared_preload_libraries");
+
+                    let libs = match opt_libs {
+                        Some(libs) => libs,
+                        None => return Ok(Box::new(empty())),
+                    };
+
+                    if !libs.contains("anon") {
+                        return Ok(Box::new(empty()));
+                    }
+
+                    let db_owner = db.owner.pg_quote();
+
+                    let operations = vec![
+                        // Create anon extension if this compute needs it
+                        // Users cannot create it themselves, because superuser is required.
+                        Operation {
+                            query: String::from("CREATE EXTENSION IF NOT EXISTS anon CASCADE"),
+                            comment: Some(String::from("creating anon extension")),
+                        },
+                        // Initialize anon extension
+                        // This also requires superuser privileges, so users cannot do it themselves.
+                        Operation {
+                            query: String::from("SELECT anon.init()"),
+                            comment: Some(String::from("initializing anon extension data")),
+                        },
+                        Operation {
+                            query: format!("GRANT ALL ON SCHEMA anon TO {}", db_owner),
+                            comment: Some(String::from(
+                                "granting anon extension schema permissions",
+                            )),
+                        },
+                        Operation {
+                            query: format!(
+                                "GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}",
+                                db_owner
+                            ),
+                            comment: Some(String::from(
+                                "granting anon extension schema functions permissions",
+                            )),
+                        },
+                        // We need this, because some functions are defined as SECURITY DEFINER.
+                        // In Postgres SECURITY DEFINER functions are executed with the privileges
+                        // of the owner.
+                        // In anon extension this it is needed to access some GUCs, which are only accessible to
+                        // superuser. But we've patched postgres to allow db_owner to access them as well.
+                        // So we need to change owner of these functions to db_owner.
+                        Operation {
+                            query: format!(
+                                include_str!("sql/anon_ext_fn_reassign.sql"),
+                                db_owner = db_owner,
+                            ),
+                            comment: Some(String::from(
+                                "change anon extension functions owner to database_owner",
+                            )),
+                        },
+                        Operation {
+                            query: format!(
+                                "GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}",
+                                db_owner,
+                            ),
+                            comment: Some(String::from(
+                                "granting anon extension tables permissions",
+                            )),
+                        },
+                        Operation {
+                            query: format!(
+                                "GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}",
+                                db_owner,
+                            ),
+                            comment: Some(String::from(
+                                "granting anon extension sequences permissions",
+                            )),
+                        },
+                    ]
+                    .into_iter();
+
+                    Ok(Box::new(operations))
+                }
+            }
+        }
+        // Interestingly, we only install p_s_s in the main database, even when
+        // it's preloaded.
+        ApplySpecPhase::HandleOtherExtensions => {
+            if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
+                if libs.contains("pg_stat_statements") {
+                    return Ok(Box::new(once(Operation {
+                        query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"),
+                        comment: Some(String::from("create system extensions")),
+                    })));
+                }
+            }
+            Ok(Box::new(empty()))
+        }
+        ApplySpecPhase::HandleNeonExtension => {
+            let operations = vec![
+                Operation {
+                    query: String::from("CREATE SCHEMA IF NOT EXISTS neon"),
+                    comment: Some(String::from("init: add schema for extension")),
+                },
+                Operation {
+                    query: String::from("CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon"),
+                    comment: Some(String::from(
+                        "init: install the extension if not already installed",
+                    )),
+                },
+                Operation {
+                    query: String::from(
+                        "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'",
+                    ),
+                    comment: Some(String::from("compat/fix: make neon relocatable")),
+                },
+                Operation {
+                    query: String::from("ALTER EXTENSION neon SET SCHEMA neon"),
+                    comment: Some(String::from("compat/fix: alter neon extension schema")),
+                },
+                Operation {
+                    query: String::from("ALTER EXTENSION neon UPDATE"),
+                    comment: Some(String::from("compat/update: update neon extension version")),
+                },
+            ]
+            .into_iter();
+
+            Ok(Box::new(operations))
+        }
+        ApplySpecPhase::CreateAvailabilityCheck => Ok(Box::new(once(Operation {
+            query: String::from(include_str!("sql/add_availabilitycheck_tables.sql")),
+            comment: None,
+        }))),
+        ApplySpecPhase::DropRoles => {
+            let operations = spec
+                .delta_operations
+                .iter()
+                .flatten()
+                .filter(|op| op.action == "delete_role")
+                .map(|op| Operation {
+                    query: format!("DROP ROLE IF EXISTS {}", op.name.pg_quote()),
+                    comment: None,
+                });
+
+            Ok(Box::new(operations))
+        }
+    }
+}
diff --git a/compute_tools/src/sql/add_availabilitycheck_tables.sql b/compute_tools/src/sql/add_availabilitycheck_tables.sql
new file mode 100644
index 0000000000..7c60690c78
--- /dev/null
+++ b/compute_tools/src/sql/add_availabilitycheck_tables.sql
@@ -0,0 +1,18 @@
+DO $$
+BEGIN
+    IF NOT EXISTS(
+        SELECT 1
+        FROM pg_catalog.pg_tables
+        WHERE tablename = 'health_check'
+    )
+    THEN
+    CREATE TABLE health_check (
+        id serial primary key,
+        updated_at timestamptz default now()
+    );
+    INSERT INTO health_check VALUES (1, now())
+        ON CONFLICT (id) DO UPDATE
+         SET updated_at = now();
+    END IF;
+END
+$$
\ No newline at end of file
diff --git a/compute_tools/src/sql/anon_ext_fn_reassign.sql b/compute_tools/src/sql/anon_ext_fn_reassign.sql
new file mode 100644
index 0000000000..3d7b15c590
--- /dev/null
+++ b/compute_tools/src/sql/anon_ext_fn_reassign.sql
@@ -0,0 +1,12 @@
+DO $$
+DECLARE
+    query varchar;
+BEGIN
+    FOR query IN SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};'
+    FROM pg_proc p
+        JOIN pg_namespace nsp ON p.pronamespace = nsp.oid
+    WHERE nsp.nspname = 'anon' LOOP
+        EXECUTE query;
+    END LOOP;
+END
+$$;
diff --git a/compute_tools/src/sql/default_grants.sql b/compute_tools/src/sql/default_grants.sql
new file mode 100644
index 0000000000..58ebb0690b
--- /dev/null
+++ b/compute_tools/src/sql/default_grants.sql
@@ -0,0 +1,30 @@
+DO
+$$
+    BEGIN
+        IF EXISTS(
+            SELECT nspname
+            FROM pg_catalog.pg_namespace
+            WHERE nspname = 'public'
+        ) AND
+           current_setting('server_version_num')::int / 10000 >= 15
+        THEN
+            IF EXISTS(
+                SELECT rolname
+                FROM pg_catalog.pg_roles
+                WHERE rolname = 'web_access'
+            )
+            THEN
+                GRANT CREATE ON SCHEMA public TO web_access;
+            END IF;
+        END IF;
+        IF EXISTS(
+            SELECT nspname
+            FROM pg_catalog.pg_namespace
+            WHERE nspname = 'public'
+        )
+        THEN
+            ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
+            ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;
+        END IF;
+    END
+$$;
\ No newline at end of file
diff --git a/compute_tools/src/sql/set_public_schema_owner.sql b/compute_tools/src/sql/set_public_schema_owner.sql
new file mode 100644
index 0000000000..fd061a713e
--- /dev/null
+++ b/compute_tools/src/sql/set_public_schema_owner.sql
@@ -0,0 +1,23 @@
+DO
+$$
+    DECLARE
+        schema_owner TEXT;
+    BEGIN
+        IF EXISTS(
+            SELECT nspname
+            FROM pg_catalog.pg_namespace
+            WHERE nspname = 'public'
+        )
+        THEN
+            SELECT nspowner::regrole::text
+            FROM pg_catalog.pg_namespace
+            WHERE nspname = 'public'
+            INTO schema_owner;
+
+            IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
+            THEN
+                ALTER SCHEMA public OWNER TO {db_owner};
+            END IF;
+        END IF;
+    END
+$$;
\ No newline at end of file
diff --git a/compute_tools/src/sql/unset_template_for_drop_dbs.sql b/compute_tools/src/sql/unset_template_for_drop_dbs.sql
new file mode 100644
index 0000000000..6c4343a589
--- /dev/null
+++ b/compute_tools/src/sql/unset_template_for_drop_dbs.sql
@@ -0,0 +1,12 @@
+DO $$
+    BEGIN
+        IF EXISTS(
+            SELECT 1
+            FROM pg_catalog.pg_database
+            WHERE datname = {datname_str}
+        )
+        THEN
+            ALTER DATABASE {datname} is_template false;
+        END IF;
+    END
+$$;
\ No newline at end of file

From 0a499a317614a049bab4e1166984557789460793 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 20 Nov 2024 06:44:23 +0100
Subject: [PATCH 04/51] Don't preload offloaded timelines (#9646)

In timeline preloading, we also do a preload for offloaded timelines.
This includes the download of `index-part.json`. Ultimately, such a
download is wasteful, therefore avoid it. Same goes for the remote
client, we just discard it immediately thereafter.

Part of #8088

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 pageserver/src/tenant.rs | 71 +++++++++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 37bf83c984..8e9e3890ba 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -249,7 +249,8 @@ struct TimelinePreload {
 
 pub(crate) struct TenantPreload {
     tenant_manifest: TenantManifest,
-    timelines: HashMap<TimelineId, TimelinePreload>,
+    /// Map from timeline ID to a possible timeline preload. It is None iff the timeline is offloaded according to the manifest.
+    timelines: HashMap<TimelineId, Option<TimelinePreload>>,
 }
 
 /// When we spawn a tenant, there is a special mode for tenant creation that
@@ -1397,7 +1398,7 @@ impl Tenant {
         // Get list of remote timelines
         // download index files for every tenant timeline
         info!("listing remote timelines");
-        let (remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines(
+        let (mut remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines(
             remote_storage,
             self.tenant_shard_id,
             cancel.clone(),
@@ -1431,11 +1432,27 @@ impl Tenant {
             warn!("Unexpected non timeline key {k}");
         }
 
+        // Avoid downloading IndexPart of offloaded timelines.
+        let mut offloaded_with_prefix = HashSet::new();
+        for offloaded in tenant_manifest.offloaded_timelines.iter() {
+            if remote_timeline_ids.remove(&offloaded.timeline_id) {
+                offloaded_with_prefix.insert(offloaded.timeline_id);
+            } else {
+                // We'll take care later of timelines in the manifest without a prefix
+            }
+        }
+
+        let timelines = self
+            .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
+            .await?;
+
         Ok(TenantPreload {
             tenant_manifest,
-            timelines: self
-                .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
-                .await?,
+            timelines: timelines
+                .into_iter()
+                .map(|(id, tl)| (id, Some(tl)))
+                .chain(offloaded_with_prefix.into_iter().map(|id| (id, None)))
+                .collect(),
         })
     }
 
@@ -1466,6 +1483,19 @@ impl Tenant {
             offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline)));
             offloaded_timeline_ids.insert(timeline_id);
         }
+        // Complete deletions for offloaded timeline id's from manifest.
+        // The manifest will be uploaded later in this function.
+        offloaded_timelines_list
+            .retain(|(offloaded_id, offloaded)| {
+                // Existence of a timeline is finally determined by the existence of an index-part.json in remote storage.
+                // If there is dangling references in another location, they need to be cleaned up.
+                let delete = !preload.timelines.contains_key(offloaded_id);
+                if delete {
+                    tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found");
+                    offloaded.defuse_for_tenant_drop();
+                }
+                !delete
+        });
 
         let mut timelines_to_resume_deletions = vec![];
 
@@ -1473,10 +1503,9 @@ impl Tenant {
         let mut timeline_ancestors = HashMap::new();
         let mut existent_timelines = HashSet::new();
         for (timeline_id, preload) in preload.timelines {
-            if offloaded_timeline_ids.remove(&timeline_id) {
-                // The timeline is offloaded, skip loading it.
-                continue;
-            }
+            let Some(preload) = preload else { continue };
+            // This is an invariant of the `preload` function's API
+            assert!(!offloaded_timeline_ids.contains(&timeline_id));
             let index_part = match preload.index_part {
                 Ok(i) => {
                     debug!("remote index part exists for timeline {timeline_id}");
@@ -1586,31 +1615,13 @@ impl Tenant {
             .context("resume_deletion")
             .map_err(LoadLocalTimelineError::ResumeDeletion)?;
         }
-        // Complete deletions for offloaded timeline id's.
-        offloaded_timelines_list
-            .retain(|(offloaded_id, offloaded)| {
-                // At this point, offloaded_timeline_ids has the list of all offloaded timelines
-                // without a prefix in S3, so they are inexistent.
-                // In the end, existence of a timeline is finally determined by the existence of an index-part.json in remote storage.
-                // If there is a dangling reference in another location, they need to be cleaned up.
-                let delete = offloaded_timeline_ids.contains(offloaded_id);
-                if delete {
-                    tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found");
-                    offloaded.defuse_for_tenant_drop();
-                }
-                !delete
-        });
-        if !offloaded_timelines_list.is_empty() {
-            tracing::info!(
-                "Tenant has {} offloaded timelines",
-                offloaded_timelines_list.len()
-            );
-        }
+        let needs_manifest_upload =
+            offloaded_timelines_list.len() != preload.tenant_manifest.offloaded_timelines.len();
         {
             let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap();
             offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
         }
-        if !offloaded_timeline_ids.is_empty() {
+        if needs_manifest_upload {
             self.store_tenant_manifest().await?;
         }
 

From 3ae0b2149e1a80975e098a4156b424e636cc550f Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 20 Nov 2024 10:14:28 +0000
Subject: [PATCH 05/51] chore(proxy): demote a ton of logs for successful
 connection attempts (#9803)

See https://github.com/neondatabase/cloud/issues/14378

In collaboration with @cloneable and @awarus, we sifted through logs and
simply demoted some logs to debug. This is not at all finished and there
are more logs to review, but we ran out of time in the session we
organised. In any slightly more nuanced cases, we didn't touch the log,
instead leaving a TODO comment.
---
 proxy/src/auth/backend/classic.rs      |  8 +++++---
 proxy/src/auth/backend/hacks.rs        |  2 +-
 proxy/src/auth/backend/mod.rs          |  7 ++++---
 proxy/src/auth/credentials.rs          | 10 +++++-----
 proxy/src/auth/flow.rs                 |  2 ++
 proxy/src/bin/local_proxy.rs           |  1 +
 proxy/src/bin/proxy.rs                 |  1 +
 proxy/src/cancellation.rs              | 10 +++++-----
 proxy/src/compute.rs                   |  5 +++--
 proxy/src/console_redirect_proxy.rs    |  2 +-
 proxy/src/context/mod.rs               |  9 +++++++--
 proxy/src/control_plane/client/mod.rs  |  4 ++--
 proxy/src/control_plane/client/neon.rs | 13 +++++++++----
 proxy/src/stream.rs                    |  1 +
 14 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index 6d26c99832..87a02133c8 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -1,5 +1,5 @@
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{info, warn};
+use tracing::{debug, info, warn};
 
 use super::{ComputeCredentials, ComputeUserInfo};
 use crate::auth::backend::ComputeCredentialKeys;
@@ -21,11 +21,11 @@ pub(super) async fn authenticate(
     let scram_keys = match secret {
         #[cfg(any(test, feature = "testing"))]
         AuthSecret::Md5(_) => {
-            info!("auth endpoint chooses MD5");
+            debug!("auth endpoint chooses MD5");
             return Err(auth::AuthError::bad_auth_method("MD5"));
         }
         AuthSecret::Scram(secret) => {
-            info!("auth endpoint chooses SCRAM");
+            debug!("auth endpoint chooses SCRAM");
             let scram = auth::Scram(&secret, ctx);
 
             let auth_outcome = tokio::time::timeout(
@@ -50,6 +50,8 @@ pub(super) async fn authenticate(
             let client_key = match auth_outcome {
                 sasl::Outcome::Success(key) => key,
                 sasl::Outcome::Failure(reason) => {
+                    // TODO: warnings?
+                    // TODO: should we get rid of this because double logging?
                     info!("auth backend failed with an error: {reason}");
                     return Err(auth::AuthError::password_failed(&*creds.user));
                 }
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index 1411d908a5..e651df1d34 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -73,7 +73,7 @@ pub(crate) async fn password_hack_no_authentication(
         .get_password()
         .await?;
 
-    info!(project = &*payload.endpoint, "received missing parameter");
+    debug!(project = &*payload.endpoint, "received missing parameter");
 
     // Report tentative success; compute node will check the password anyway.
     Ok((
diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs
index 242fe99de2..83c72e7be0 100644
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -14,7 +14,7 @@ use ipnet::{Ipv4Net, Ipv6Net};
 use local::LocalBackend;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_postgres::config::AuthKeys;
-use tracing::{info, warn};
+use tracing::{debug, info, warn};
 
 use crate::auth::credentials::check_peer_addr_is_in_list;
 use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint};
@@ -286,7 +286,7 @@ async fn auth_quirks(
         Ok(info) => (info, None),
     };
 
-    info!("fetching user's authentication info");
+    debug!("fetching user's authentication info");
     let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
 
     // check allowed list
@@ -404,7 +404,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
     ) -> auth::Result<Backend<'a, ComputeCredentials>> {
         let res = match self {
             Self::ControlPlane(api, user_info) => {
-                info!(
+                debug!(
                     user = &*user_info.user,
                     project = user_info.endpoint(),
                     "performing authentication using the console"
@@ -427,6 +427,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
             }
         };
 
+        // TODO: replace with some metric
         info!("user successfully authenticated");
         Ok(res)
     }
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index ddecae6af5..dab9007400 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -7,7 +7,7 @@ use std::str::FromStr;
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
 use thiserror::Error;
-use tracing::{info, warn};
+use tracing::{debug, warn};
 
 use crate::auth::password_hack::parse_endpoint_param;
 use crate::context::RequestMonitoring;
@@ -147,22 +147,22 @@ impl ComputeUserInfoMaybeEndpoint {
         }
 
         let metrics = Metrics::get();
-        info!(%user, "credentials");
+        debug!(%user, "credentials");
         if sni.is_some() {
-            info!("Connection with sni");
+            debug!("Connection with sni");
             metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni);
         } else if endpoint.is_some() {
             metrics
                 .proxy
                 .accepted_connections_by_sni
                 .inc(SniKind::NoSni);
-            info!("Connection without sni");
+            debug!("Connection without sni");
         } else {
             metrics
                 .proxy
                 .accepted_connections_by_sni
                 .inc(SniKind::PasswordHack);
-            info!("Connection with password hack");
+            debug!("Connection with password hack");
         }
 
         let options = NeonOptions::parse_params(params);
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index 6294549ff6..1740b59b14 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -178,6 +178,8 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
             SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus),
             _ => {}
         }
+
+        // TODO: make this a metric instead
         info!("client chooses {}", sasl.method);
 
         let outcome = sasl::SaslStream::new(self.stream, sasl.message)
diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
index 41b0e11e85..c4ec1300f2 100644
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -125,6 +125,7 @@ async fn main() -> anyhow::Result<()> {
 
     Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
 
+    // TODO: refactor these to use labels
     debug!("Version: {GIT_VERSION}");
     debug!("Build_tag: {BUILD_TAG}");
     let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index fda5b25961..232721338d 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -288,6 +288,7 @@ async fn main() -> anyhow::Result<()> {
     let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
     let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
 
+    // TODO: refactor these to use labels
     info!("Version: {GIT_VERSION}");
     info!("Build_tag: {BUILD_TAG}");
     let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index db0970adcb..3ad2d55b53 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -7,7 +7,7 @@ use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio::sync::Mutex;
 use tokio_postgres::{CancelToken, NoTls};
-use tracing::info;
+use tracing::{debug, info};
 use uuid::Uuid;
 
 use crate::error::ReportableError;
@@ -73,7 +73,7 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
             break key;
         };
 
-        info!("registered new query cancellation key {key}");
+        debug!("registered new query cancellation key {key}");
         Session {
             key,
             cancellation_handler: self,
@@ -165,7 +165,7 @@ impl CancelClosure {
     pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> {
         let socket = TcpStream::connect(self.socket_addr).await?;
         self.cancel_token.cancel_query_raw(socket, NoTls).await?;
-        info!("query was cancelled");
+        debug!("query was cancelled");
         Ok(())
     }
 }
@@ -182,7 +182,7 @@ impl<P> Session<P> {
     /// Store the cancel token for the given session.
     /// This enables query cancellation in `crate::proxy::prepare_client_connection`.
     pub(crate) fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
-        info!("enabling query cancellation for this session");
+        debug!("enabling query cancellation for this session");
         self.cancellation_handler
             .map
             .insert(self.key, Some(cancel_closure));
@@ -194,7 +194,7 @@ impl<P> Session<P> {
 impl<P> Drop for Session<P> {
     fn drop(&mut self) {
         self.cancellation_handler.map.remove(&self.key);
-        info!("dropped query cancellation key {}", &self.key);
+        debug!("dropped query cancellation key {}", &self.key);
     }
 }
 
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index ca4a348ed8..b8876b44eb 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -14,7 +14,7 @@ use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio_postgres::tls::MakeTlsConnect;
 use tokio_postgres_rustls::MakeRustlsConnect;
-use tracing::{error, info, warn};
+use tracing::{debug, error, info, warn};
 
 use crate::auth::parse_endpoint_param;
 use crate::cancellation::CancelClosure;
@@ -213,7 +213,7 @@ impl ConnCfg {
         };
 
         let connect_once = |host, port| {
-            info!("trying to connect to compute node at {host}:{port}");
+            debug!("trying to connect to compute node at {host}:{port}");
             connect_with_timeout(host, port).and_then(|socket| async {
                 let socket_addr = socket.peer_addr()?;
                 // This prevents load balancer from severing the connection.
@@ -328,6 +328,7 @@ impl ConnCfg {
         tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
         let stream = connection.stream.into_inner();
 
+        // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?)
         info!(
             cold_start_info = ctx.cold_start_info().as_str(),
             "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs
index cc456f3667..8e71f552a5 100644
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -146,7 +146,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     stream: S,
     conn_gauge: NumClientConnectionsGuard<'static>,
 ) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
-    info!(
+    debug!(
         protocol = %ctx.protocol(),
         "handling interactive connection from client"
     );
diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs
index 6cf99c0c97..d057ee0bfd 100644
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -8,7 +8,7 @@ use pq_proto::StartupMessageParams;
 use smol_str::SmolStr;
 use tokio::sync::mpsc;
 use tracing::field::display;
-use tracing::{debug, info, info_span, Span};
+use tracing::{debug, info_span, Span};
 use try_lock::TryLock;
 use uuid::Uuid;
 
@@ -122,6 +122,7 @@ impl RequestMonitoring {
         protocol: Protocol,
         region: &'static str,
     ) -> Self {
+        // TODO: be careful with long lived spans
         let span = info_span!(
             "connect_request",
             %protocol,
@@ -384,6 +385,10 @@ impl RequestMonitoringInner {
         } else {
             ConnectOutcome::Failed
         };
+
+        // TODO: get rid of entirely/refactor
+        // check for false positives
+        // AND false negatives
         if let Some(rejected) = self.rejected {
             let ep = self
                 .endpoint_id
@@ -391,7 +396,7 @@ impl RequestMonitoringInner {
                 .map(|x| x.as_str())
                 .unwrap_or_default();
             // This makes sense only if cache is disabled
-            info!(
+            debug!(
                 ?outcome,
                 ?rejected,
                 ?ep,
diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs
index e388d8a538..50903e2f1e 100644
--- a/proxy/src/control_plane/client/mod.rs
+++ b/proxy/src/control_plane/client/mod.rs
@@ -8,7 +8,7 @@ use std::time::Duration;
 
 use dashmap::DashMap;
 use tokio::time::Instant;
-use tracing::info;
+use tracing::{debug, info};
 
 use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError};
 use crate::auth::backend::ComputeUserInfo;
@@ -214,7 +214,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
         self.metrics
             .semaphore_acquire_seconds
             .observe(now.elapsed().as_secs_f64());
-        info!("acquired permit {:?}", now.elapsed().as_secs_f64());
+        debug!("acquired permit {:?}", now.elapsed().as_secs_f64());
         Ok(WakeComputePermit { permit: permit? })
     }
 
diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs
index 26ff4e1402..8f4ae13f33 100644
--- a/proxy/src/control_plane/client/neon.rs
+++ b/proxy/src/control_plane/client/neon.rs
@@ -73,6 +73,8 @@ impl NeonControlPlaneClient {
             .endpoints_cache
             .is_valid(ctx, &user_info.endpoint.normalize())
         {
+            // TODO: refactor this because it's weird
+            // this is a failure to authenticate but we return Ok.
             info!("endpoint is not valid, skipping the request");
             return Ok(AuthInfo::default());
         }
@@ -92,7 +94,7 @@ impl NeonControlPlaneClient {
                 ])
                 .build()?;
 
-            info!(url = request.url().as_str(), "sending http request");
+            debug!(url = request.url().as_str(), "sending http request");
             let start = Instant::now();
             let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
             let response = self.endpoint.execute(request).await?;
@@ -104,10 +106,12 @@ impl NeonControlPlaneClient {
                 // TODO(anna): retry
                 Err(e) => {
                     return if e.get_reason().is_not_found() {
+                        // TODO: refactor this because it's weird
+                        // this is a failure to authenticate but we return Ok.
                         Ok(AuthInfo::default())
                     } else {
                         Err(e.into())
-                    }
+                    };
                 }
             };
 
@@ -163,7 +167,7 @@ impl NeonControlPlaneClient {
                 .build()
                 .map_err(GetEndpointJwksError::RequestBuild)?;
 
-            info!(url = request.url().as_str(), "sending http request");
+            debug!(url = request.url().as_str(), "sending http request");
             let start = Instant::now();
             let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
             let response = self
@@ -220,7 +224,7 @@ impl NeonControlPlaneClient {
 
             let request = request_builder.build()?;
 
-            info!(url = request.url().as_str(), "sending http request");
+            debug!(url = request.url().as_str(), "sending http request");
             let start = Instant::now();
             let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
             let response = self.endpoint.execute(request).await?;
@@ -249,6 +253,7 @@ impl NeonControlPlaneClient {
             Ok(node)
         }
         .map_err(crate::error::log_error)
+        // TODO: redo this span stuff
         .instrument(info_span!("http", id = request_id))
         .await
     }
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index 89df48c5d3..11f426819d 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -133,6 +133,7 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
         msg: &'static str,
         error_kind: ErrorKind,
     ) -> Result<T, ReportedError> {
+        // TODO: only log this for actually interesting errors
         tracing::info!(
             kind = error_kind.to_metric_label(),
             msg,

From 33dce25af8ea722b3bf53467616fb5156dd41249 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 20 Nov 2024 11:07:45 +0000
Subject: [PATCH 06/51] safekeeper: block deletion on protocol handler shutdown
 (#9364)

## Problem

Two recently observed log errors indicate safekeeper tasks for a
timeline running after that timeline's deletion has started.
- https://github.com/neondatabase/neon/issues/8972
- https://github.com/neondatabase/neon/issues/8974

These code paths do not have a mechanism that coordinates task shutdown
with the overall shutdown of the timeline.

## Summary of changes

- Add a `Gate` to `Timeline`
- Take the gate as part of resident timeline guard: any code that holds
a guard over a timeline staying resident should also hold a guard over
the timeline's total lifetime.
- Take the gate from the wal removal task
- Respect Timeline::cancel in WAL send/recv code, so that we do not
block shutdown indefinitely.
- Add a test that deletes timelines with open pageserver+compute
connections, to check these get torn down as expected.

There is some risk to introducing gates: if there is code holding a gate
which does not properly respect a cancellation token, it can cause
shutdown hangs. The risk of this for safekeepers is lower in practice
than it is for other services, because in a healthy timeline deletion,
the compute is shutdown first, then the timeline is deleted on the
pageserver, and finally it is deleted on the safekeepers -- that makes
it much less likely that some protocol handler will still be running.

Closes: #8972
Closes: #8974
---
 libs/postgres_backend/src/lib.rs         |   7 +-
 safekeeper/src/receive_wal.rs            |  29 +++++-
 safekeeper/src/send_wal.rs               |  37 ++++---
 safekeeper/src/timeline.rs               | 120 +++++++++--------------
 safekeeper/src/timeline_guard.rs         |  11 ++-
 safekeeper/src/timeline_manager.rs       |  55 +++++++++--
 safekeeper/src/timelines_global_map.rs   |   4 +-
 safekeeper/src/wal_backup.rs             |  42 +++++---
 test_runner/regress/test_wal_acceptor.py |  83 ++++++++++++++++
 9 files changed, 270 insertions(+), 118 deletions(-)

diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 9075a019b4..8c024375c1 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -834,7 +834,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
         use CopyStreamHandlerEnd::*;
 
         let expected_end = match &end {
-            ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF => true,
+            ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true,
             CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error))
                 if is_expected_io_error(io_error) =>
             {
@@ -874,6 +874,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
             // message from server' when it receives ErrorResponse (anything but
             // CopyData/CopyDone) back.
             CopyFail => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
+
+            // When cancelled, send no response: we must not risk blocking on sending that response
+            Cancelled => None,
             _ => None,
         };
         if let Some((err, errcode)) = err_to_send_and_errcode {
@@ -1051,6 +1054,8 @@ pub enum CopyStreamHandlerEnd {
     /// The connection was lost
     #[error("connection error: {0}")]
     Disconnected(#[from] ConnectionError),
+    #[error("Shutdown")]
+    Cancelled,
     /// Some other error
     #[error(transparent)]
     Other(#[from] anyhow::Error),
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index 2edcc4ef6f..bfa1764abf 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -239,6 +239,10 @@ impl SafekeeperPostgresHandler {
         pgb: &mut PostgresBackend<IO>,
         tli: &mut Option<WalResidentTimeline>,
     ) -> Result<(), CopyStreamHandlerEnd> {
+        // The `tli` parameter is only used for passing _out_ a timeline, one should
+        // not have been passed in.
+        assert!(tli.is_none());
+
         // Notify the libpq client that it's allowed to send `CopyData` messages
         pgb.write_message(&BeMessage::CopyBothResponse).await?;
 
@@ -256,6 +260,7 @@ impl SafekeeperPostgresHandler {
         // sends, so this avoids deadlocks.
         let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?;
         let peer_addr = *pgb.get_peer_addr();
+
         let mut network_reader = NetworkReader {
             ttid: self.ttid,
             conn_id: self.conn_id,
@@ -275,10 +280,14 @@ impl SafekeeperPostgresHandler {
                     .subscribe();
             *tli = Some(timeline.wal_residence_guard().await?);
 
+            let timeline_cancel = timeline.cancel.clone();
             tokio::select! {
                 // todo: add read|write .context to these errors
                 r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r,
                 r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
+                _ = timeline_cancel.cancelled() => {
+                    return Err(CopyStreamHandlerEnd::Cancelled);
+                }
             }
         } else {
             res.map(|_| ())
@@ -303,7 +312,7 @@ impl SafekeeperPostgresHandler {
 
                 // Otherwise, WalAcceptor thread must have errored.
                 match wal_acceptor_res {
-                    Ok(Ok(_)) => Ok(()), // can't happen currently; would be if we add graceful termination
+                    Ok(Ok(_)) => Ok(()), // Clean shutdown
                     Ok(Err(e)) => Err(CopyStreamHandlerEnd::Other(e.context("WAL acceptor"))),
                     Err(_) => Err(CopyStreamHandlerEnd::Other(anyhow!(
                         "WalAcceptor task panicked",
@@ -356,6 +365,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
         Ok((tli, next_msg))
     }
 
+    /// This function is cancellation-safe (only does network I/O and channel read/writes).
     async fn run(
         self,
         msg_tx: Sender<ProposerAcceptorMessage>,
@@ -397,6 +407,7 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
     loop {
         let started = Instant::now();
         let size = next_msg.size();
+
         match msg_tx.send_timeout(next_msg, SLOW_THRESHOLD).await {
             Ok(()) => {}
             // Slow send, log a message and keep trying. Log context has timeline ID.
@@ -428,6 +439,8 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
 /// Read replies from WalAcceptor and pass them back to socket. Returns Ok(())
 /// if reply_rx closed; it must mean WalAcceptor terminated, joining it should
 /// tell the error.
+///
+/// This function is cancellation-safe (only does network I/O and channel read/writes).
 async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
     pgb_writer: &mut PostgresBackend<IO>,
     mut reply_rx: Receiver<AcceptorProposerMessage>,
@@ -461,7 +474,7 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
                         Some(AcceptorProposerMessage::AppendResponse(append_response))
                     }
                     _ => None,
-                }
+                },
         };
 
         let Some(msg) = msg else {
@@ -527,6 +540,10 @@ impl WalAcceptor {
 
     /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed;
     /// it must mean that network thread terminated.
+    ///
+    /// This function is *not* cancellation safe, it does local disk I/O: it should always
+    /// be allowed to run to completion. It respects Timeline::cancel and shuts down cleanly
+    /// when that gets triggered.
     async fn run(&mut self) -> anyhow::Result<()> {
         let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
 
@@ -541,7 +558,7 @@ impl WalAcceptor {
         // Tracks whether we have unflushed appends.
         let mut dirty = false;
 
-        loop {
+        while !self.tli.is_cancelled() {
             let reply = tokio::select! {
                 // Process inbound message.
                 msg = self.msg_rx.recv() => {
@@ -599,6 +616,10 @@ impl WalAcceptor {
                     WAL_RECEIVER_QUEUE_DEPTH.observe(self.msg_rx.len() as f64);
                     None // no reply
                 }
+
+                _ = self.tli.cancel.cancelled() => {
+                    break;
+                }
             };
 
             // Send reply, if any.
@@ -610,7 +631,7 @@ impl WalAcceptor {
         }
 
         // Flush WAL on disconnect, see https://github.com/neondatabase/neon/issues/9259.
-        if dirty {
+        if dirty && !self.tli.cancel.is_cancelled() {
             self.tli
                 .process_msg(&ProposerAcceptorMessage::FlushWAL)
                 .await?;
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 6d94ff98b1..aa65ec851b 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -456,6 +456,8 @@ impl SafekeeperPostgresHandler {
         // not synchronized with sends, so this avoids deadlocks.
         let reader = pgb.split().context("START_REPLICATION split")?;
 
+        let tli_cancel = tli.cancel.clone();
+
         let mut sender = WalSender {
             pgb,
             // should succeed since we're already holding another guard
@@ -479,6 +481,9 @@ impl SafekeeperPostgresHandler {
             // todo: add read|write .context to these errors
             r = sender.run() => r,
             r = reply_reader.run() => r,
+            _ = tli_cancel.cancelled() => {
+                return Err(CopyStreamHandlerEnd::Cancelled);
+            }
         };
 
         let ws_state = ws_guard
@@ -557,6 +562,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
     /// Send WAL until
     /// - an error occurs
     /// - receiver is caughtup and there is no computes (if streaming up to commit_lsn)
+    /// - timeline's cancellation token fires
     ///
     /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
     /// convenience.
@@ -601,15 +607,14 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
             };
             let send_buf = &send_buf[..send_size];
 
-            // and send it
-            self.pgb
-                .write_message(&BeMessage::XLogData(XLogDataBody {
-                    wal_start: self.start_pos.0,
-                    wal_end: self.end_pos.0,
-                    timestamp: get_current_timestamp(),
-                    data: send_buf,
-                }))
-                .await?;
+            // and send it, while respecting Timeline::cancel
+            let msg = BeMessage::XLogData(XLogDataBody {
+                wal_start: self.start_pos.0,
+                wal_end: self.end_pos.0,
+                timestamp: get_current_timestamp(),
+                data: send_buf,
+            });
+            self.pgb.write_message(&msg).await?;
 
             if let Some(appname) = &self.appname {
                 if appname == "replica" {
@@ -674,13 +679,13 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
                 }
             }
 
-            self.pgb
-                .write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
-                    wal_end: self.end_pos.0,
-                    timestamp: get_current_timestamp(),
-                    request_reply: true,
-                }))
-                .await?;
+            let msg = BeMessage::KeepAlive(WalSndKeepAlive {
+                wal_end: self.end_pos.0,
+                timestamp: get_current_timestamp(),
+                request_reply: true,
+            });
+
+            self.pgb.write_message(&msg).await?;
         }
     }
 
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 85add6bfea..ef928f7633 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize};
 use tokio::fs::{self};
 use tokio_util::sync::CancellationToken;
 use utils::id::TenantId;
+use utils::sync::gate::Gate;
 
 use std::cmp::max;
 use std::ops::{Deref, DerefMut};
@@ -467,6 +468,10 @@ pub struct Timeline {
     timeline_dir: Utf8PathBuf,
     manager_ctl: ManagerCtl,
 
+    /// Hold this gate from code that depends on the Timeline's non-shut-down state.  While holding
+    /// this gate, you must respect [`Timeline::cancel`]
+    pub(crate) gate: Gate,
+
     /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires
     pub(crate) cancel: CancellationToken,
 
@@ -508,6 +513,7 @@ impl Timeline {
             mutex: RwLock::new(shared_state),
             walsenders: WalSenders::new(walreceivers.clone()),
             walreceivers,
+            gate: Default::default(),
             cancel: CancellationToken::default(),
             manager_ctl: ManagerCtl::new(),
             broker_active: AtomicBool::new(false),
@@ -533,56 +539,6 @@ impl Timeline {
         ))
     }
 
-    /// Initialize fresh timeline on disk and start background tasks. If init
-    /// fails, timeline is cancelled and cannot be used anymore.
-    ///
-    /// Init is transactional, so if it fails, created files will be deleted,
-    /// and state on disk should remain unchanged.
-    pub async fn init_new(
-        self: &Arc<Timeline>,
-        shared_state: &mut WriteGuardSharedState<'_>,
-        conf: &SafeKeeperConf,
-        broker_active_set: Arc<TimelinesSet>,
-        partial_backup_rate_limiter: RateLimiter,
-    ) -> Result<()> {
-        match fs::metadata(&self.timeline_dir).await {
-            Ok(_) => {
-                // Timeline directory exists on disk, we should leave state unchanged
-                // and return error.
-                bail!(TimelineError::Invalid(self.ttid));
-            }
-            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
-            Err(e) => {
-                return Err(e.into());
-            }
-        }
-
-        // Create timeline directory.
-        fs::create_dir_all(&self.timeline_dir).await?;
-
-        // Write timeline to disk and start background tasks.
-        if let Err(e) = shared_state.sk.state_mut().flush().await {
-            // Bootstrap failed, cancel timeline and remove timeline directory.
-            self.cancel(shared_state);
-
-            if let Err(fs_err) = fs::remove_dir_all(&self.timeline_dir).await {
-                warn!(
-                    "failed to remove timeline {} directory after bootstrap failure: {}",
-                    self.ttid, fs_err
-                );
-            }
-
-            return Err(e);
-        }
-        self.bootstrap(
-            shared_state,
-            conf,
-            broker_active_set,
-            partial_backup_rate_limiter,
-        );
-        Ok(())
-    }
-
     /// Bootstrap new or existing timeline starting background tasks.
     pub fn bootstrap(
         self: &Arc<Timeline>,
@@ -593,33 +549,61 @@ impl Timeline {
     ) {
         let (tx, rx) = self.manager_ctl.bootstrap_manager();
 
+        let Ok(gate_guard) = self.gate.enter() else {
+            // Init raced with shutdown
+            return;
+        };
+
         // Start manager task which will monitor timeline state and update
         // background tasks.
-        tokio::spawn(timeline_manager::main_task(
-            ManagerTimeline { tli: self.clone() },
-            conf.clone(),
-            broker_active_set,
-            tx,
-            rx,
-            partial_backup_rate_limiter,
-        ));
+        tokio::spawn({
+            let this = self.clone();
+            let conf = conf.clone();
+            async move {
+                let _gate_guard = gate_guard;
+                timeline_manager::main_task(
+                    ManagerTimeline { tli: this },
+                    conf,
+                    broker_active_set,
+                    tx,
+                    rx,
+                    partial_backup_rate_limiter,
+                )
+                .await
+            }
+        });
+    }
+
+    /// Background timeline activities (which hold Timeline::gate) will no
+    /// longer run once this function completes.
+    pub async fn shutdown(&self) {
+        info!("timeline {} shutting down", self.ttid);
+        self.cancel.cancel();
+
+        // Wait for any concurrent tasks to stop using this timeline, to avoid e.g. attempts
+        // to read deleted files.
+        self.gate.close().await;
     }
 
     /// Delete timeline from disk completely, by removing timeline directory.
-    /// Background timeline activities will stop eventually.
     ///
     /// Also deletes WAL in s3. Might fail if e.g. s3 is unavailable, but
     /// deletion API endpoint is retriable.
+    ///
+    /// Timeline must be in shut-down state (i.e. call [`Self::shutdown`] first)
     pub async fn delete(
         &self,
         shared_state: &mut WriteGuardSharedState<'_>,
         only_local: bool,
     ) -> Result<bool> {
-        self.cancel(shared_state);
+        // Assert that [`Self::shutdown`] was already called
+        assert!(self.cancel.is_cancelled());
+        assert!(self.gate.close_complete());
+
+        // Close associated FDs. Nobody will be able to touch timeline data once
+        // it is cancelled, so WAL storage won't be opened again.
+        shared_state.sk.close_wal_store();
 
-        // TODO: It's better to wait for s3 offloader termination before
-        // removing data from s3. Though since s3 doesn't have transactions it
-        // still wouldn't guarantee absense of data after removal.
         let conf = GlobalTimelines::get_global_config();
         if !only_local && conf.is_wal_backup_enabled() {
             // Note: we concurrently delete remote storage data from multiple
@@ -631,16 +615,6 @@ impl Timeline {
         Ok(dir_existed)
     }
 
-    /// Cancel timeline to prevent further usage. Background tasks will stop
-    /// eventually after receiving cancellation signal.
-    fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) {
-        info!("timeline {} is cancelled", self.ttid);
-        self.cancel.cancel();
-        // Close associated FDs. Nobody will be able to touch timeline data once
-        // it is cancelled, so WAL storage won't be opened again.
-        shared_state.sk.close_wal_store();
-    }
-
     /// Returns if timeline is cancelled.
     pub fn is_cancelled(&self) -> bool {
         self.cancel.is_cancelled()
diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs
index 1ddac573d2..9102a40df8 100644
--- a/safekeeper/src/timeline_guard.rs
+++ b/safekeeper/src/timeline_guard.rs
@@ -7,6 +7,7 @@
 use std::collections::HashSet;
 
 use tracing::debug;
+use utils::sync::gate::GateGuard;
 
 use crate::timeline_manager::ManagerCtlMessage;
 
@@ -16,6 +17,12 @@ pub struct GuardId(u64);
 pub struct ResidenceGuard {
     manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
     guard_id: GuardId,
+
+    /// [`ResidenceGuard`] represents a guarantee that a timeline's data remains resident,
+    /// which by extension also means the timeline is not shut down (since after shut down
+    /// our data may be deleted). Therefore everyone holding a residence guard must also
+    /// hold a guard on [`crate::timeline::Timeline::gate`]
+    _gate_guard: GateGuard,
 }
 
 impl Drop for ResidenceGuard {
@@ -52,7 +59,8 @@ impl AccessService {
         self.guards.is_empty()
     }
 
-    pub(crate) fn create_guard(&mut self) -> ResidenceGuard {
+    /// `timeline_gate_guard` is a guarantee that the timeline is not shut down
+    pub(crate) fn create_guard(&mut self, timeline_gate_guard: GateGuard) -> ResidenceGuard {
         let guard_id = self.next_guard_id;
         self.next_guard_id += 1;
         self.guards.insert(guard_id);
@@ -63,6 +71,7 @@ impl AccessService {
         ResidenceGuard {
             manager_tx: self.manager_tx.clone(),
             guard_id,
+            _gate_guard: timeline_gate_guard,
         }
     }
 
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
index e9fed21bf5..c02fb904cf 100644
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -266,8 +266,10 @@ pub async fn main_task(
 
     // Start recovery task which always runs on the timeline.
     if !mgr.is_offloaded && mgr.conf.peer_recovery_enabled {
-        let tli = mgr.wal_resident_timeline();
-        mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone())));
+        // Recovery task is only spawned if we can get a residence guard (i.e. timeline is not already shutting down)
+        if let Ok(tli) = mgr.wal_resident_timeline() {
+            mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone())));
+        }
     }
 
     // If timeline is evicted, reflect that in the metric.
@@ -375,6 +377,13 @@ pub async fn main_task(
 
     // shutdown background tasks
     if mgr.conf.is_wal_backup_enabled() {
+        if let Some(backup_task) = mgr.backup_task.take() {
+            // If we fell through here, then the timeline is shutting down. This is important
+            // because otherwise joining on the wal_backup handle might hang.
+            assert!(mgr.tli.cancel.is_cancelled());
+
+            backup_task.join().await;
+        }
         wal_backup::update_task(&mut mgr, false, &last_state).await;
     }
 
@@ -442,10 +451,18 @@ impl Manager {
     /// Get a WalResidentTimeline.
     /// Manager code must use this function instead of one from `Timeline`
     /// directly, because it will deadlock.
-    pub(crate) fn wal_resident_timeline(&mut self) -> WalResidentTimeline {
+    ///
+    /// This function is fallible because the guard may not be created if the timeline is
+    /// shutting down.
+    pub(crate) fn wal_resident_timeline(&mut self) -> anyhow::Result<WalResidentTimeline> {
         assert!(!self.is_offloaded);
-        let guard = self.access_service.create_guard();
-        WalResidentTimeline::new(self.tli.clone(), guard)
+        let guard = self.access_service.create_guard(
+            self.tli
+                .gate
+                .enter()
+                .map_err(|_| anyhow::anyhow!("Timeline shutting down"))?,
+        );
+        Ok(WalResidentTimeline::new(self.tli.clone(), guard))
     }
 
     /// Get a snapshot of the timeline state.
@@ -559,6 +576,11 @@ impl Manager {
 
         if removal_horizon_segno > self.last_removed_segno {
             // we need to remove WAL
+            let Ok(timeline_gate_guard) = self.tli.gate.enter() else {
+                tracing::info!("Timeline shutdown, not spawning WAL removal task");
+                return;
+            };
+
             let remover = match self.tli.read_shared_state().await.sk {
                 StateSK::Loaded(ref sk) => {
                     crate::wal_storage::Storage::remove_up_to(&sk.wal_store, removal_horizon_segno)
@@ -573,6 +595,8 @@ impl Manager {
 
             self.wal_removal_task = Some(tokio::spawn(
                 async move {
+                    let _timeline_gate_guard = timeline_gate_guard;
+
                     remover.await?;
                     Ok(removal_horizon_segno)
                 }
@@ -619,10 +643,15 @@ impl Manager {
             return;
         }
 
+        let Ok(resident) = self.wal_resident_timeline() else {
+            // Shutting down
+            return;
+        };
+
         // Get WalResidentTimeline and start partial backup task.
         let cancel = CancellationToken::new();
         let handle = tokio::spawn(wal_backup_partial::main_task(
-            self.wal_resident_timeline(),
+            resident,
             self.conf.clone(),
             self.global_rate_limiter.clone(),
             cancel.clone(),
@@ -664,7 +693,7 @@ impl Manager {
             self.partial_backup_task = None;
         }
 
-        let tli = self.wal_resident_timeline();
+        let tli = self.wal_resident_timeline()?;
         let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await;
         // Reset might fail e.g. when cfile is already reset but s3 removal
         // failed, so set manager state to None beforehand. In any case caller
@@ -688,7 +717,12 @@ impl Manager {
                 let guard = if self.is_offloaded {
                     Err(anyhow::anyhow!("timeline is offloaded, can't get a guard"))
                 } else {
-                    Ok(self.access_service.create_guard())
+                    match self.tli.gate.enter() {
+                        Ok(gate_guard) => Ok(self.access_service.create_guard(gate_guard)),
+                        Err(_) => Err(anyhow::anyhow!(
+                            "timeline is shutting down, can't get a guard"
+                        )),
+                    }
                 };
 
                 if tx.send(guard).is_err() {
@@ -699,7 +733,10 @@ impl Manager {
                 let result = if self.is_offloaded {
                     None
                 } else {
-                    Some(self.access_service.create_guard())
+                    match self.tli.gate.enter() {
+                        Ok(gate_guard) => Some(self.access_service.create_guard(gate_guard)),
+                        Err(_) => None,
+                    }
                 };
 
                 if tx.send(result).is_err() {
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index 33d94da034..067945fd5f 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -457,10 +457,12 @@ impl GlobalTimelines {
             Ok(timeline) => {
                 let was_active = timeline.broker_active.load(Ordering::Relaxed);
 
+                info!("deleting timeline {}, only_local={}", ttid, only_local);
+                timeline.shutdown().await;
+
                 // Take a lock and finish the deletion holding this mutex.
                 let mut shared_state = timeline.write_shared_state().await;
 
-                info!("deleting timeline {}, only_local={}", ttid, only_local);
                 let dir_existed = timeline.delete(&mut shared_state, only_local).await?;
 
                 Ok(TimelineDeleteForceResult {
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 6c87e5a926..34b5dbeaa1 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -25,7 +25,6 @@ use tokio::fs::File;
 use tokio::select;
 use tokio::sync::mpsc::{self, Receiver, Sender};
 use tokio::sync::{watch, OnceCell};
-use tokio::time::sleep;
 use tracing::*;
 
 use utils::{id::TenantTimelineId, lsn::Lsn};
@@ -46,6 +45,14 @@ pub struct WalBackupTaskHandle {
     handle: JoinHandle<()>,
 }
 
+impl WalBackupTaskHandle {
+    pub(crate) async fn join(self) {
+        if let Err(e) = self.handle.await {
+            error!("WAL backup task panicked: {}", e);
+        }
+    }
+}
+
 /// Do we have anything to upload to S3, i.e. should safekeepers run backup activity?
 pub(crate) fn is_wal_backup_required(
     wal_seg_size: usize,
@@ -74,11 +81,12 @@ pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &St
 
             let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
 
-            let async_task = backup_task_main(
-                mgr.wal_resident_timeline(),
-                mgr.conf.backup_parallel_jobs,
-                shutdown_rx,
-            );
+            let Ok(resident) = mgr.wal_resident_timeline() else {
+                info!("Timeline shut down");
+                return;
+            };
+
+            let async_task = backup_task_main(resident, mgr.conf.backup_parallel_jobs, shutdown_rx);
 
             let handle = if mgr.conf.current_thread_runtime {
                 tokio::spawn(async_task)
@@ -108,9 +116,7 @@ async fn shut_down_task(entry: &mut Option<WalBackupTaskHandle>) {
         // Tell the task to shutdown. Error means task exited earlier, that's ok.
         let _ = wb_handle.shutdown_tx.send(()).await;
         // Await the task itself. TODO: restart panicked tasks earlier.
-        if let Err(e) = wb_handle.handle.await {
-            warn!("WAL backup task panicked: {}", e);
-        }
+        wb_handle.join().await;
     }
 }
 
@@ -214,6 +220,7 @@ async fn backup_task_main(
     let _guard = WAL_BACKUP_TASKS.guard();
     info!("started");
 
+    let cancel = tli.tli.cancel.clone();
     let mut wb = WalBackupTask {
         wal_seg_size: tli.get_wal_seg_size().await,
         commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
@@ -230,25 +237,34 @@ async fn backup_task_main(
         _ = wb.run() => {}
         _ = shutdown_rx.recv() => {
             canceled = true;
+        },
+        _ = cancel.cancelled() => {
+            canceled = true;
         }
     }
     info!("task {}", if canceled { "canceled" } else { "terminated" });
 }
 
 impl WalBackupTask {
+    /// This function must be called from a select! that also respects self.timeline's
+    /// cancellation token.  This is done in [`backup_task_main`].
+    ///
+    /// The future returned by this function is safe to drop at any time because it
+    /// does not write to local disk.
     async fn run(&mut self) {
         let mut backup_lsn = Lsn(0);
 
         let mut retry_attempt = 0u32;
         // offload loop
-        loop {
+        while !self.timeline.cancel.is_cancelled() {
             if retry_attempt == 0 {
                 // wait for new WAL to arrive
                 if let Err(e) = self.commit_lsn_watch_rx.changed().await {
-                    // should never happen, as we hold Arc to timeline.
+                    // should never happen, as we hold Arc to timeline and transmitter's lifetime
+                    // is within Timeline's
                     error!("commit_lsn watch shut down: {:?}", e);
                     return;
-                }
+                };
             } else {
                 // or just sleep if we errored previously
                 let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS;
@@ -256,7 +272,7 @@ impl WalBackupTask {
                 {
                     retry_delay = min(retry_delay, backoff_delay);
                 }
-                sleep(Duration::from_millis(retry_delay)).await;
+                tokio::time::sleep(Duration::from_millis(retry_delay)).await;
             }
 
             let commit_lsn = *self.commit_lsn_watch_rx.borrow();
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 0676b3dd9a..6eaaa3c37f 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1784,6 +1784,89 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
             cur.execute("INSERT INTO t (key) VALUES (123)")
 
 
+def test_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder):
+    """
+    Test deleting timelines on a safekeeper while they're under load.
+
+    This should not happen under normal operation, but it can happen if
+    there is some rogue compute/pageserver that is writing/reading to a
+    safekeeper that we're migrating a timeline away from, or if the timeline
+    is being deleted while such a rogue client is running.
+    """
+    neon_env_builder.auth_enabled = True
+    env = neon_env_builder.init_start()
+
+    # Create two endpoints that will generate load
+    timeline_id_a = env.create_branch("deleteme_a")
+    timeline_id_b = env.create_branch("deleteme_b")
+
+    endpoint_a = env.endpoints.create("deleteme_a")
+    endpoint_a.start()
+    endpoint_b = env.endpoints.create("deleteme_b")
+    endpoint_b.start()
+
+    # Get tenant and timeline IDs
+    tenant_id = env.initial_tenant
+
+    # Start generating load on both timelines
+    def generate_load(endpoint: Endpoint):
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)")
+                while True:
+                    try:
+                        cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'")
+                    except:  # noqa
+                        # Ignore errors since timeline may be deleted
+                        break
+
+    t_a = threading.Thread(target=generate_load, args=(endpoint_a,))
+    t_b = threading.Thread(target=generate_load, args=(endpoint_b,))
+    try:
+        t_a.start()
+        t_b.start()
+
+        # Let the load run for a bit
+        log.info("Warming up...")
+        time.sleep(2)
+
+        # Safekeeper errors will propagate to the pageserver: it is correct that these are
+        # logged at error severity because they indicate the pageserver is trying to read
+        # a timeline that it shouldn't.
+        env.pageserver.allowed_errors.extend(
+            [
+                ".*Timeline.*was cancelled.*",
+                ".*Timeline.*was not found.*",
+            ]
+        )
+
+        # Try deleting timelines while under load
+        sk = env.safekeepers[0]
+        sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id))
+
+        # Delete first timeline
+        log.info(f"Deleting {timeline_id_a}...")
+        assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"]
+
+        # Delete second timeline
+        log.info(f"Deleting {timeline_id_b}...")
+        assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"]
+
+        # Verify timelines are gone from disk
+        sk_data_dir = sk.data_dir
+        assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists()
+        # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists()
+
+    finally:
+        log.info("Stopping endpoints...")
+        # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang
+        endpoint_a.stop(mode="immediate")
+        endpoint_b.stop(mode="immediate")
+        log.info("Joining threads...")
+        t_a.join()
+        t_b.join()
+
+
 # Basic pull_timeline test.
 # When live_sk_change is False, compute is restarted to change set of
 # safekeepers; otherwise it is live reload.

From 94e4a0e2a0d43e066bd006a68eb147333ab0d074 Mon Sep 17 00:00:00 2001
From: Fedor Dikarev <fedor@neon.tech>
Date: Wed, 20 Nov 2024 13:04:14 +0100
Subject: [PATCH 07/51] update macos version for runner (#9817)

Closes: https://github.com/neondatabase/neon/issues/9816

Run MacOs builds on `macos-15`.
As `pkg-config` is bundled in runner image, don't install it with `brew`
---
 .github/workflows/neon_extra_builds.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index e827539c80..092831adb9 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -38,7 +38,7 @@ jobs:
       contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
       github.ref_name == 'main'
     timeout-minutes: 90
-    runs-on: macos-14
+    runs-on: macos-15
 
     env:
       # Use release build only, to have less debug info around
@@ -52,7 +52,7 @@ jobs:
           submodules: true
 
       - name: Install macOS postgres dependencies
-        run: brew install flex bison openssl protobuf icu4c pkg-config
+        run: brew install flex bison openssl protobuf icu4c
 
       - name: Set pg 14 revision for caching
         id: pg_v14_rev

From 46beecacce50bf1d113dbb6f31fe2283a598adf7 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 20 Nov 2024 12:23:41 +0000
Subject: [PATCH 08/51] CI(benchmarking): route test failures to
 on-call-qa-staging-stream (#9813)

## Problem

We want to keep `#on-call-staging-stream` channel close to the prod one
and redirect notifications from failing benchmarks to another channel
for investigation.

## Summary of changes
- Send notifications regarding failures in `benchmarking` job to
`#on-call-staging-stream`
- Send notifications regarding failures in `periodic_pagebench` job to
`#on-call-staging-stream`
---
 .github/workflows/benchmarking.yml       | 12 ++++++------
 .github/workflows/periodic_pagebench.yml |  8 ++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 0289f552f9..acea859b4d 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -158,7 +158,7 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: |
           Periodic perf testing: ${{ job.status }}
           <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
@@ -506,7 +506,7 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: |
           Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }}
           <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
@@ -643,7 +643,7 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: |
           Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }}
           <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
@@ -759,7 +759,7 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: |
           Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }}
           <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
@@ -874,7 +874,7 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: |
           Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
           <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
@@ -974,7 +974,7 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: |
           Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
           <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml
index 615937b5a1..1cce348ae2 100644
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -72,7 +72,7 @@ jobs:
           echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
         fi
 
-    - name: Start Bench with run_id   
+    - name: Start Bench with run_id
       run: |
         curl -k -X 'POST' \
         "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
@@ -116,7 +116,7 @@ jobs:
         -H 'accept: application/gzip' \
         -H "Authorization: Bearer $API_KEY" \
         --output "test_log_${GITHUB_RUN_ID}.gz"
-    
+
     - name: Unzip Test Log and Print it into this job's log
       if: always() && steps.poll_step.outputs.too_many_runs != 'true'
       run: |
@@ -134,13 +134,13 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
     - name: Cleanup Test Resources
-      if: always() 
+      if: always()
       run: |
         curl -k -X 'POST' \
         "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \

From 899933e159b56d8cfe92995befff8b37d6eb55b8 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 20 Nov 2024 12:48:21 +0000
Subject: [PATCH 09/51] scan_log_for_errors: check that regex is correct
 (#9815)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

I've noticed that we have 2 flaky tests which failed with error:
```
re.error: missing ), unterminated subpattern at position 21
```

- `test_timeline_archival_chaos` — has been already fixed
- `test_sharded_tad_interleaved_after_partial_success` — I didn't manage
to find the incorrect regex

[Internal link](https://neonprod.grafana.net/goto/yfmVHV7NR?orgId=1)

## Summary of changes
- Wrap `re.match` in `try..except` block and print incorrect regex
---
 test_runner/fixtures/pageserver/allowed_errors.py | 10 ++++++++--
 test_runner/fixtures/utils.py                     | 10 ++++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index d05704c8e0..5059039678 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -25,8 +25,14 @@ def scan_pageserver_log_for_errors(
 
             # It's an ERROR or WARN. Is it in the allow-list?
             for a in allowed_errors:
-                if re.match(a, line):
-                    break
+                try:
+                    if re.match(a, line):
+                        break
+                # We can switch `re.error` with `re.PatternError` after 3.13
+                # https://docs.python.org/3/library/re.html#re.PatternError
+                except re.error:
+                    print(f"Invalid regex: '{a}'", file=sys.stderr)
+                    raise
             else:
                 errors.append((lineno, line))
     return errors
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 96a651f0db..bb45385ea6 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -495,8 +495,14 @@ def scan_log_for_errors(input: Iterable[str], allowed_errors: list[str]) -> list
 
             # It's an ERROR or WARN. Is it in the allow-list?
             for a in allowed_errors:
-                if re.match(a, line):
-                    break
+                try:
+                    if re.match(a, line):
+                        break
+                # We can switch `re.error` with `re.PatternError` after 3.13
+                # https://docs.python.org/3/library/re.html#re.PatternError
+                except re.error:
+                    log.error(f"Invalid regex: '{a}'")
+                    raise
             else:
                 errors.append((lineno, line))
     return errors

From bf7d859a8bdb26a6ac4ce1c17fec948d7bcecdcb Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Wed, 20 Nov 2024 13:50:36 +0100
Subject: [PATCH 10/51] proxy: Rename RequestMonitoring to RequestContext
 (#9805)

## Problem

It is called context/ctx everywhere and the Monitoring suffix needlessly
confuses with proper monitoring code.

## Summary of changes

* Rename RequestMonitoring to RequestContext
* Rename RequestMonitoringInner to RequestContextInner
---
 proxy/src/auth/backend/classic.rs          |  4 +--
 proxy/src/auth/backend/console_redirect.rs |  8 +++---
 proxy/src/auth/backend/hacks.rs            |  6 ++--
 proxy/src/auth/backend/jwt.rs              | 26 +++++++++---------
 proxy/src/auth/backend/local.rs            |  4 +--
 proxy/src/auth/backend/mod.rs              | 32 +++++++++++-----------
 proxy/src/auth/credentials.rs              | 30 ++++++++++----------
 proxy/src/auth/flow.rs                     |  4 +--
 proxy/src/bin/pg_sni_router.rs             |  8 +++---
 proxy/src/cache/endpoints.rs               |  4 +--
 proxy/src/compute.rs                       |  4 +--
 proxy/src/console_redirect_proxy.rs        |  6 ++--
 proxy/src/context/mod.rs                   | 22 +++++++--------
 proxy/src/context/parquet.rs               |  6 ++--
 proxy/src/control_plane/client/mock.rs     | 10 +++----
 proxy/src/control_plane/client/mod.rs      | 12 ++++----
 proxy/src/control_plane/client/neon.rs     | 16 +++++------
 proxy/src/control_plane/mod.rs             | 12 ++++----
 proxy/src/proxy/connect_compute.rs         | 10 +++----
 proxy/src/proxy/handshake.rs               |  4 +--
 proxy/src/proxy/mod.rs                     |  6 ++--
 proxy/src/proxy/tests/mitm.rs              |  2 +-
 proxy/src/proxy/tests/mod.rs               | 27 +++++++++---------
 proxy/src/proxy/wake_compute.rs            |  4 +--
 proxy/src/serverless/backend.rs            | 16 +++++------
 proxy/src/serverless/conn_pool.rs          |  4 +--
 proxy/src/serverless/conn_pool_lib.rs      |  4 +--
 proxy/src/serverless/http_conn_pool.rs     |  6 ++--
 proxy/src/serverless/local_conn_pool.rs    |  6 ++--
 proxy/src/serverless/mod.rs                |  6 ++--
 proxy/src/serverless/sql_over_http.rs      | 12 ++++----
 proxy/src/serverless/websocket.rs          |  4 +--
 32 files changed, 162 insertions(+), 163 deletions(-)

diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index 87a02133c8..491b272ac4 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -5,13 +5,13 @@ use super::{ComputeCredentials, ComputeUserInfo};
 use crate::auth::backend::ComputeCredentialKeys;
 use crate::auth::{self, AuthFlow};
 use crate::config::AuthenticationConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::AuthSecret;
 use crate::stream::{PqStream, Stream};
 use crate::{compute, sasl};
 
 pub(super) async fn authenticate(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     creds: ComputeUserInfo,
     client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     config: &'static AuthenticationConfig,
diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs
index e25dc3d45e..5772471486 100644
--- a/proxy/src/auth/backend/console_redirect.rs
+++ b/proxy/src/auth/backend/console_redirect.rs
@@ -8,7 +8,7 @@ use tracing::{info, info_span};
 use super::ComputeCredentialKeys;
 use crate::cache::Cached;
 use crate::config::AuthenticationConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
 use crate::error::{ReportableError, UserFacingError};
 use crate::proxy::connect_compute::ComputeConnectBackend;
@@ -71,7 +71,7 @@ impl ConsoleRedirectBackend {
 
     pub(crate) async fn authenticate(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         auth_config: &'static AuthenticationConfig,
         client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
     ) -> auth::Result<ConsoleRedirectNodeInfo> {
@@ -87,7 +87,7 @@ pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo);
 impl ComputeConnectBackend for ConsoleRedirectNodeInfo {
     async fn wake_compute(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
     ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
         Ok(Cached::new_uncached(self.0.clone()))
     }
@@ -98,7 +98,7 @@ impl ComputeConnectBackend for ConsoleRedirectNodeInfo {
 }
 
 async fn authenticate(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     auth_config: &'static AuthenticationConfig,
     link_uri: &reqwest::Url,
     client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index e651df1d34..3316543022 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -4,7 +4,7 @@ use tracing::{debug, info};
 use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
 use crate::auth::{self, AuthFlow};
 use crate::config::AuthenticationConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::AuthSecret;
 use crate::intern::EndpointIdInt;
 use crate::sasl;
@@ -15,7 +15,7 @@ use crate::stream::{self, Stream};
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
 pub(crate) async fn authenticate_cleartext(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     secret: AuthSecret,
@@ -57,7 +57,7 @@ pub(crate) async fn authenticate_cleartext(
 /// Similar to [`authenticate_cleartext`], but there's a specific password format,
 /// and passwords are not yet validated (we don't know how to validate them!)
 pub(crate) async fn password_hack_no_authentication(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     info: ComputeUserInfoNoEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
 ) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index bfc674139b..f721d81aa2 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -17,7 +17,7 @@ use thiserror::Error;
 use tokio::time::Instant;
 
 use crate::auth::backend::ComputeCredentialKeys;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::errors::GetEndpointJwksError;
 use crate::http::read_body_with_limit;
 use crate::intern::RoleNameInt;
@@ -39,7 +39,7 @@ const JWKS_FETCH_RETRIES: u32 = 3;
 pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
     fn fetch_auth_rules(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> impl Future<Output = Result<Vec<AuthRule>, FetchAuthRulesError>> + Send;
 }
@@ -144,7 +144,7 @@ impl JwkCacheEntryLock {
     async fn renew_jwks<F: FetchAuthRules>(
         &self,
         _permit: JwkRenewalPermit<'_>,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         client: &reqwest_middleware::ClientWithMiddleware,
         endpoint: EndpointId,
         auth_rules: &F,
@@ -261,7 +261,7 @@ impl JwkCacheEntryLock {
 
     async fn get_or_update_jwk_cache<F: FetchAuthRules>(
         self: &Arc<Self>,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         client: &reqwest_middleware::ClientWithMiddleware,
         endpoint: EndpointId,
         fetch: &F,
@@ -314,7 +314,7 @@ impl JwkCacheEntryLock {
 
     async fn check_jwt<F: FetchAuthRules>(
         self: &Arc<Self>,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         jwt: &str,
         client: &reqwest_middleware::ClientWithMiddleware,
         endpoint: EndpointId,
@@ -409,7 +409,7 @@ impl JwkCacheEntryLock {
 impl JwkCache {
     pub(crate) async fn check_jwt<F: FetchAuthRules>(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
         role_name: &RoleName,
         fetch: &F,
@@ -941,7 +941,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
     impl FetchAuthRules for Fetch {
         async fn fetch_auth_rules(
             &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &RequestContext,
             _endpoint: EndpointId,
         ) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
             Ok(self.0.clone())
@@ -1039,7 +1039,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
             for token in &tokens {
                 jwk_cache
                     .check_jwt(
-                        &RequestMonitoring::test(),
+                        &RequestContext::test(),
                         endpoint.clone(),
                         role,
                         &fetch,
@@ -1097,7 +1097,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
 
         jwk_cache
             .check_jwt(
-                &RequestMonitoring::test(),
+                &RequestContext::test(),
                 endpoint.clone(),
                 &role_name,
                 &fetch,
@@ -1136,7 +1136,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
 
         let ep = EndpointId::from("ep");
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let err = jwk_cache
             .check_jwt(&ctx, ep, &role, &fetch, &bad_jwt)
             .await
@@ -1175,7 +1175,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
         // this role_name is not accepted
         let bad_role_name = RoleName::from("cloud_admin");
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let err = jwk_cache
             .check_jwt(&ctx, ep, &bad_role_name, &fetch, &jwt)
             .await
@@ -1268,7 +1268,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
 
         let ep = EndpointId::from("ep");
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         for test in table {
             let jwt = new_custom_ec_jwt("1".into(), &key, test.body);
 
@@ -1336,7 +1336,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
 
         jwk_cache
             .check_jwt(
-                &RequestMonitoring::test(),
+                &RequestContext::test(),
                 endpoint.clone(),
                 &role_name,
                 &fetch,
diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs
index f9cb085daf..32e0f53615 100644
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -7,7 +7,7 @@ use super::jwt::{AuthRule, FetchAuthRules};
 use crate::auth::backend::jwt::FetchAuthRulesError;
 use crate::compute::ConnCfg;
 use crate::compute_ctl::ComputeCtlApi;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo};
 use crate::control_plane::NodeInfo;
 use crate::http;
@@ -56,7 +56,7 @@ pub static JWKS_ROLE_MAP: ArcSwapOption<EndpointJwksResponse> = ArcSwapOption::c
 impl FetchAuthRules for StaticAuthRules {
     async fn fetch_auth_rules(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
         _endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
         let mappings = JWKS_ROLE_MAP.load();
diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs
index 83c72e7be0..57ecd5e499 100644
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -20,7 +20,7 @@ use crate::auth::credentials::check_peer_addr_is_in_list;
 use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint};
 use crate::cache::Cached;
 use crate::config::AuthenticationConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::client::ControlPlaneClient;
 use crate::control_plane::errors::GetAuthInfoError;
 use crate::control_plane::{
@@ -210,7 +210,7 @@ impl RateBucketInfo {
 impl AuthenticationConfig {
     pub(crate) fn check_rate_limit(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         secret: AuthSecret,
         endpoint: &EndpointId,
         is_cleartext: bool,
@@ -265,7 +265,7 @@ impl AuthenticationConfig {
 ///
 /// All authentication flows will emit an AuthenticationOk message if successful.
 async fn auth_quirks(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     api: &impl control_plane::ControlPlaneApi,
     user_info: ComputeUserInfoMaybeEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -343,7 +343,7 @@ async fn auth_quirks(
 }
 
 async fn authenticate_with_secret(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     secret: AuthSecret,
     info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -396,7 +396,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
     #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
     pub(crate) async fn authenticate(
         self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
         allow_cleartext: bool,
         config: &'static AuthenticationConfig,
@@ -436,7 +436,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
 impl Backend<'_, ComputeUserInfo> {
     pub(crate) async fn get_role_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         match self {
             Self::ControlPlane(api, user_info) => api.get_role_secret(ctx, user_info).await,
@@ -446,7 +446,7 @@ impl Backend<'_, ComputeUserInfo> {
 
     pub(crate) async fn get_allowed_ips_and_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
         match self {
             Self::ControlPlane(api, user_info) => {
@@ -461,7 +461,7 @@ impl Backend<'_, ComputeUserInfo> {
 impl ComputeConnectBackend for Backend<'_, ComputeCredentials> {
     async fn wake_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
     ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
         match self {
             Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await,
@@ -497,7 +497,7 @@ mod tests {
     use crate::auth::backend::MaskedIp;
     use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern};
     use crate::config::AuthenticationConfig;
-    use crate::context::RequestMonitoring;
+    use crate::context::RequestContext;
     use crate::control_plane::{self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret};
     use crate::proxy::NeonOptions;
     use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo};
@@ -513,7 +513,7 @@ mod tests {
     impl control_plane::ControlPlaneApi for Auth {
         async fn get_role_secret(
             &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &RequestContext,
             _user_info: &super::ComputeUserInfo,
         ) -> Result<CachedRoleSecret, control_plane::errors::GetAuthInfoError> {
             Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
@@ -521,7 +521,7 @@ mod tests {
 
         async fn get_allowed_ips_and_secret(
             &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &RequestContext,
             _user_info: &super::ComputeUserInfo,
         ) -> Result<
             (CachedAllowedIps, Option<CachedRoleSecret>),
@@ -535,7 +535,7 @@ mod tests {
 
         async fn get_endpoint_jwks(
             &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &RequestContext,
             _endpoint: crate::types::EndpointId,
         ) -> Result<Vec<super::jwt::AuthRule>, control_plane::errors::GetEndpointJwksError>
         {
@@ -544,7 +544,7 @@ mod tests {
 
         async fn wake_compute(
             &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &RequestContext,
             _user_info: &super::ComputeUserInfo,
         ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
             unimplemented!()
@@ -623,7 +623,7 @@ mod tests {
         let (mut client, server) = tokio::io::duplex(1024);
         let mut stream = PqStream::new(Stream::from_raw(server));
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let api = Auth {
             ips: vec![],
             secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -700,7 +700,7 @@ mod tests {
         let (mut client, server) = tokio::io::duplex(1024);
         let mut stream = PqStream::new(Stream::from_raw(server));
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let api = Auth {
             ips: vec![],
             secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -752,7 +752,7 @@ mod tests {
         let (mut client, server) = tokio::io::duplex(1024);
         let mut stream = PqStream::new(Stream::from_raw(server));
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let api = Auth {
             ips: vec![],
             secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index dab9007400..f6bce9f2d8 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -10,7 +10,7 @@ use thiserror::Error;
 use tracing::{debug, warn};
 
 use crate::auth::password_hack::parse_endpoint_param;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::error::{ReportableError, UserFacingError};
 use crate::metrics::{Metrics, SniKind};
 use crate::proxy::NeonOptions;
@@ -86,7 +86,7 @@ pub(crate) fn endpoint_sni(
 
 impl ComputeUserInfoMaybeEndpoint {
     pub(crate) fn parse(
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         params: &StartupMessageParams,
         sni: Option<&str>,
         common_names: Option<&HashSet<String>>,
@@ -260,7 +260,7 @@ mod tests {
     fn parse_bare_minimum() -> anyhow::Result<()> {
         // According to postgresql, only `user` should be required.
         let options = StartupMessageParams::new([("user", "john_doe")]);
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id, None);
@@ -275,7 +275,7 @@ mod tests {
             ("database", "world"), // should be ignored
             ("foo", "bar"),        // should be ignored
         ]);
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id, None);
@@ -290,7 +290,7 @@ mod tests {
         let sni = Some("foo.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.user, "john_doe");
@@ -307,7 +307,7 @@ mod tests {
             ("options", "-ckey=1 project=bar -c geqo=off"),
         ]);
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
@@ -322,7 +322,7 @@ mod tests {
             ("options", "-ckey=1 endpoint=bar -c geqo=off"),
         ]);
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
@@ -340,7 +340,7 @@ mod tests {
             ),
         ]);
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert!(user_info.endpoint_id.is_none());
@@ -355,7 +355,7 @@ mod tests {
             ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
         ]);
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert!(user_info.endpoint_id.is_none());
@@ -370,7 +370,7 @@ mod tests {
         let sni = Some("baz.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.user, "john_doe");
@@ -385,14 +385,14 @@ mod tests {
 
         let common_names = Some(["a.com".into(), "b.com".into()].into());
         let sni = Some("p1.a.com");
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
 
         let common_names = Some(["a.com".into(), "b.com".into()].into());
         let sni = Some("p1.b.com");
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
@@ -408,7 +408,7 @@ mod tests {
         let sni = Some("second.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
             .expect_err("should fail");
         match err {
@@ -427,7 +427,7 @@ mod tests {
         let sni = Some("project.localhost");
         let common_names = Some(["example.com".into()].into());
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
             .expect_err("should fail");
         match err {
@@ -447,7 +447,7 @@ mod tests {
 
         let sni = Some("project.localhost");
         let common_names = Some(["localhost".into()].into());
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index 1740b59b14..9c6ce151cb 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -11,7 +11,7 @@ use tracing::info;
 use super::backend::ComputeCredentialKeys;
 use super::{AuthError, PasswordHackPayload};
 use crate::config::TlsServerEndPoint;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::AuthSecret;
 use crate::intern::EndpointIdInt;
 use crate::sasl;
@@ -32,7 +32,7 @@ pub(crate) struct Begin;
 /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
 pub(crate) struct Scram<'a>(
     pub(crate) &'a scram::ServerSecret,
-    pub(crate) &'a RequestMonitoring,
+    pub(crate) &'a RequestContext,
 );
 
 impl AuthMethod for Scram<'_> {
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index ef5b5e8509..623a0fd3b2 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -11,7 +11,7 @@ use futures::future::Either;
 use futures::TryFutureExt;
 use itertools::Itertools;
 use proxy::config::TlsServerEndPoint;
-use proxy::context::RequestMonitoring;
+use proxy::context::RequestContext;
 use proxy::metrics::{Metrics, ThreadPoolMetrics};
 use proxy::protocol2::ConnectionInfo;
 use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource};
@@ -177,7 +177,7 @@ async fn task_main(
                     .context("failed to set socket option")?;
 
                 info!(%peer_addr, "serving");
-                let ctx = RequestMonitoring::new(
+                let ctx = RequestContext::new(
                     session_id,
                     ConnectionInfo {
                         addr: peer_addr,
@@ -208,7 +208,7 @@ async fn task_main(
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 
 async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     raw_stream: S,
     tls_config: Arc<rustls::ServerConfig>,
     tls_server_end_point: TlsServerEndPoint,
@@ -259,7 +259,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
 }
 
 async fn handle_client(
-    ctx: RequestMonitoring,
+    ctx: RequestContext,
     dest_suffix: Arc<String>,
     tls_config: Arc<rustls::ServerConfig>,
     tls_server_end_point: TlsServerEndPoint,
diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs
index 07769e053c..20db1fbb14 100644
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -11,7 +11,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::info;
 
 use crate::config::EndpointCacheConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
 use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
 use crate::rate_limiter::GlobalRateLimiter;
@@ -75,7 +75,7 @@ impl EndpointsCache {
         }
     }
 
-    pub(crate) fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
+    pub(crate) fn is_valid(&self, ctx: &RequestContext, endpoint: &EndpointId) -> bool {
         if !self.ready.load(Ordering::Acquire) {
             // the endpoint cache is not yet fully initialised.
             return true;
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index b8876b44eb..e7fbe9ab47 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -18,7 +18,7 @@ use tracing::{debug, error, info, warn};
 
 use crate::auth::parse_endpoint_param;
 use crate::cancellation::CancelClosure;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::client::ApiLockError;
 use crate::control_plane::errors::WakeComputeError;
 use crate::control_plane::messages::MetricsAuxInfo;
@@ -286,7 +286,7 @@ impl ConnCfg {
     /// Connect to a corresponding compute node.
     pub(crate) async fn connect(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         allow_self_signed_compute: bool,
         aux: MetricsAuxInfo,
         timeout: Duration,
diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs
index 8e71f552a5..c88b2936db 100644
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -8,7 +8,7 @@ use tracing::{debug, error, info, Instrument};
 use crate::auth::backend::ConsoleRedirectBackend;
 use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal};
 use crate::config::{ProxyConfig, ProxyProtocolV2};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::error::ReportableError;
 use crate::metrics::{Metrics, NumClientConnectionsGuard};
 use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo};
@@ -82,7 +82,7 @@ pub async fn task_main(
                 }
             };
 
-            let ctx = RequestMonitoring::new(
+            let ctx = RequestContext::new(
                 session_id,
                 peer_addr,
                 crate::metrics::Protocol::Tcp,
@@ -141,7 +141,7 @@ pub async fn task_main(
 pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     config: &'static ProxyConfig,
     backend: &'static ConsoleRedirectBackend,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     cancellation_handler: Arc<CancellationHandlerMain>,
     stream: S,
     conn_gauge: NumClientConnectionsGuard<'static>,
diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs
index d057ee0bfd..6d2d2d51ce 100644
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -32,15 +32,15 @@ pub(crate) static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<Reques
 ///
 /// This data should **not** be used for connection logic, only for observability and limiting purposes.
 /// All connection logic should instead use strongly typed state machines, not a bunch of Options.
-pub struct RequestMonitoring(
+pub struct RequestContext(
     /// To allow easier use of the ctx object, we have interior mutability.
     /// I would typically use a RefCell but that would break the `Send` requirements
     /// so we need something with thread-safety. `TryLock` is a cheap alternative
     /// that offers similar semantics to a `RefCell` but with synchronisation.
-    TryLock<RequestMonitoringInner>,
+    TryLock<RequestContextInner>,
 );
 
-struct RequestMonitoringInner {
+struct RequestContextInner {
     pub(crate) conn_info: ConnectionInfo,
     pub(crate) session_id: Uuid,
     pub(crate) protocol: Protocol,
@@ -81,10 +81,10 @@ pub(crate) enum AuthMethod {
     Cleartext,
 }
 
-impl Clone for RequestMonitoring {
+impl Clone for RequestContext {
     fn clone(&self) -> Self {
         let inner = self.0.try_lock().expect("should not deadlock");
-        let new = RequestMonitoringInner {
+        let new = RequestContextInner {
             conn_info: inner.conn_info.clone(),
             session_id: inner.session_id,
             protocol: inner.protocol,
@@ -115,7 +115,7 @@ impl Clone for RequestMonitoring {
     }
 }
 
-impl RequestMonitoring {
+impl RequestContext {
     pub fn new(
         session_id: Uuid,
         conn_info: ConnectionInfo,
@@ -132,7 +132,7 @@ impl RequestMonitoring {
             role = tracing::field::Empty,
         );
 
-        let inner = RequestMonitoringInner {
+        let inner = RequestContextInner {
             conn_info,
             session_id,
             protocol,
@@ -168,7 +168,7 @@ impl RequestMonitoring {
         let ip = IpAddr::from([127, 0, 0, 1]);
         let addr = SocketAddr::new(ip, 5432);
         let conn_info = ConnectionInfo { addr, extra: None };
-        RequestMonitoring::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test")
+        RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test")
     }
 
     pub(crate) fn console_application_name(&self) -> String {
@@ -325,7 +325,7 @@ impl RequestMonitoring {
 }
 
 pub(crate) struct LatencyTimerPause<'a> {
-    ctx: &'a RequestMonitoring,
+    ctx: &'a RequestContext,
     start: tokio::time::Instant,
     waiting_for: Waiting,
 }
@@ -341,7 +341,7 @@ impl Drop for LatencyTimerPause<'_> {
     }
 }
 
-impl RequestMonitoringInner {
+impl RequestContextInner {
     fn set_cold_start_info(&mut self, info: ColdStartInfo) {
         self.cold_start_info = info;
         self.latency_timer.cold_start_info(info);
@@ -430,7 +430,7 @@ impl RequestMonitoringInner {
     }
 }
 
-impl Drop for RequestMonitoringInner {
+impl Drop for RequestContextInner {
     fn drop(&mut self) {
         if self.sender.is_some() {
             self.log_connect();
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 4112de646f..9bf3a275bb 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -20,7 +20,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, Span};
 use utils::backoff;
 
-use super::{RequestMonitoringInner, LOG_CHAN};
+use super::{RequestContextInner, LOG_CHAN};
 use crate::config::remote_storage_from_toml;
 use crate::context::LOG_CHAN_DISCONNECT;
 
@@ -117,8 +117,8 @@ impl serde::Serialize for Options<'_> {
     }
 }
 
-impl From<&RequestMonitoringInner> for RequestData {
-    fn from(value: &RequestMonitoringInner) -> Self {
+impl From<&RequestContextInner> for RequestData {
+    fn from(value: &RequestContextInner) -> Self {
         Self {
             session_id: value.session_id,
             peer_addr: value.conn_info.addr.ip().to_string(),
diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs
index fd333d2aac..500acad50f 100644
--- a/proxy/src/control_plane/client/mock.rs
+++ b/proxy/src/control_plane/client/mock.rs
@@ -13,7 +13,7 @@ use crate::auth::backend::jwt::AuthRule;
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::IpPattern;
 use crate::cache::Cached;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::client::{CachedAllowedIps, CachedRoleSecret};
 use crate::control_plane::errors::{
     ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
@@ -206,7 +206,7 @@ impl super::ControlPlaneApi for MockControlPlane {
     #[tracing::instrument(skip_all)]
     async fn get_role_secret(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         Ok(CachedRoleSecret::new_uncached(
@@ -216,7 +216,7 @@ impl super::ControlPlaneApi for MockControlPlane {
 
     async fn get_allowed_ips_and_secret(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
         Ok((
@@ -229,7 +229,7 @@ impl super::ControlPlaneApi for MockControlPlane {
 
     async fn get_endpoint_jwks(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
         self.do_get_endpoint_jwks(endpoint).await
@@ -238,7 +238,7 @@ impl super::ControlPlaneApi for MockControlPlane {
     #[tracing::instrument(skip_all)]
     async fn wake_compute(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
         _user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
         self.do_wake_compute().map_ok(Cached::new_uncached).await
diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs
index 50903e2f1e..f8f74372f0 100644
--- a/proxy/src/control_plane/client/mod.rs
+++ b/proxy/src/control_plane/client/mod.rs
@@ -15,7 +15,7 @@ use crate::auth::backend::ComputeUserInfo;
 use crate::cache::endpoints::EndpointsCache;
 use crate::cache::project_info::ProjectInfoCacheImpl;
 use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::{
     errors, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache,
 };
@@ -41,7 +41,7 @@ pub enum ControlPlaneClient {
 impl ControlPlaneApi for ControlPlaneClient {
     async fn get_role_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
         match self {
@@ -57,7 +57,7 @@ impl ControlPlaneApi for ControlPlaneClient {
 
     async fn get_allowed_ips_and_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
         match self {
@@ -71,7 +71,7 @@ impl ControlPlaneApi for ControlPlaneClient {
 
     async fn get_endpoint_jwks(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
         match self {
@@ -85,7 +85,7 @@ impl ControlPlaneApi for ControlPlaneClient {
 
     async fn wake_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, errors::WakeComputeError> {
         match self {
@@ -271,7 +271,7 @@ impl WakeComputePermit {
 impl FetchAuthRules for ControlPlaneClient {
     async fn fetch_auth_rules(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
         self.get_endpoint_jwks(ctx, endpoint)
diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs
index 8f4ae13f33..53f9234926 100644
--- a/proxy/src/control_plane/client/neon.rs
+++ b/proxy/src/control_plane/client/neon.rs
@@ -14,7 +14,7 @@ use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeComput
 use crate::auth::backend::jwt::AuthRule;
 use crate::auth::backend::ComputeUserInfo;
 use crate::cache::Cached;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::caches::ApiCaches;
 use crate::control_plane::errors::{
     ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
@@ -65,7 +65,7 @@ impl NeonControlPlaneClient {
 
     async fn do_get_auth_info(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<AuthInfo, GetAuthInfoError> {
         if !self
@@ -141,7 +141,7 @@ impl NeonControlPlaneClient {
 
     async fn do_get_endpoint_jwks(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
         if !self
@@ -200,7 +200,7 @@ impl NeonControlPlaneClient {
 
     async fn do_wake_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<NodeInfo, WakeComputeError> {
         let request_id = ctx.session_id().to_string();
@@ -263,7 +263,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
     #[tracing::instrument(skip_all)]
     async fn get_role_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         let normalized_ep = &user_info.endpoint.normalize();
@@ -297,7 +297,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
 
     async fn get_allowed_ips_and_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
         let normalized_ep = &user_info.endpoint.normalize();
@@ -339,7 +339,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
     #[tracing::instrument(skip_all)]
     async fn get_endpoint_jwks(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
         self.do_get_endpoint_jwks(ctx, endpoint).await
@@ -348,7 +348,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
     #[tracing::instrument(skip_all)]
     async fn wake_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
         let key = user_info.endpoint_cache_key();
diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs
index 70607ac0d0..41972e4e44 100644
--- a/proxy/src/control_plane/mod.rs
+++ b/proxy/src/control_plane/mod.rs
@@ -17,7 +17,7 @@ use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
 use crate::auth::IpPattern;
 use crate::cache::project_info::ProjectInfoCacheImpl;
 use crate::cache::{Cached, TimedLru};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo};
 use crate::intern::ProjectIdInt;
 use crate::types::{EndpointCacheKey, EndpointId};
@@ -75,7 +75,7 @@ pub(crate) struct NodeInfo {
 impl NodeInfo {
     pub(crate) async fn connect(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         timeout: Duration,
     ) -> Result<compute::PostgresConnection, compute::ConnectionError> {
         self.config
@@ -116,26 +116,26 @@ pub(crate) trait ControlPlaneApi {
     /// We still have to mock the scram to avoid leaking information that user doesn't exist.
     async fn get_role_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
 
     async fn get_allowed_ips_and_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
 
     async fn get_endpoint_jwks(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError>;
 
     /// Wake up the compute node and return the corresponding connection info.
     async fn wake_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, errors::WakeComputeError>;
 }
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index 659b7afa68..b30aec09c1 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -7,7 +7,7 @@ use super::retry::ShouldRetryWakeCompute;
 use crate::auth::backend::ComputeCredentialKeys;
 use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT};
 use crate::config::RetryConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::errors::WakeComputeError;
 use crate::control_plane::locks::ApiLocks;
 use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
@@ -47,7 +47,7 @@ pub(crate) trait ConnectMechanism {
     type Error: From<Self::ConnectError>;
     async fn connect_once(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         node_info: &control_plane::CachedNodeInfo,
         timeout: time::Duration,
     ) -> Result<Self::Connection, Self::ConnectError>;
@@ -59,7 +59,7 @@ pub(crate) trait ConnectMechanism {
 pub(crate) trait ComputeConnectBackend {
     async fn wake_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
     ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError>;
 
     fn get_keys(&self) -> &ComputeCredentialKeys;
@@ -82,7 +82,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
     #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
     async fn connect_once(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         node_info: &control_plane::CachedNodeInfo,
         timeout: time::Duration,
     ) -> Result<PostgresConnection, Self::Error> {
@@ -99,7 +99,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
 /// Try to connect to the compute node, retrying if necessary.
 #[tracing::instrument(skip_all)]
 pub(crate) async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     mechanism: &M,
     user_info: &B,
     allow_self_signed_compute: bool,
diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs
index a67f1b8112..3ada3a9995 100644
--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -9,7 +9,7 @@ use tracing::{info, warn};
 
 use crate::auth::endpoint_sni;
 use crate::config::{TlsConfig, PG_ALPN_PROTOCOL};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::error::ReportableError;
 use crate::metrics::Metrics;
 use crate::proxy::ERR_INSECURE_CONNECTION;
@@ -66,7 +66,7 @@ pub(crate) enum HandshakeData<S> {
 /// we also take an extra care of propagating only the select handshake errors to client.
 #[tracing::instrument(skip_all)]
 pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     stream: S,
     mut tls: Option<&TlsConfig>,
     record_handshake_error: bool,
diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs
index 17721c23d5..4be4006d15 100644
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -25,7 +25,7 @@ use self::connect_compute::{connect_to_compute, TcpMechanism};
 use self::passthrough::ProxyPassthrough;
 use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal};
 use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::error::ReportableError;
 use crate::metrics::{Metrics, NumClientConnectionsGuard};
 use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo};
@@ -117,7 +117,7 @@ pub async fn task_main(
                 }
             };
 
-            let ctx = RequestMonitoring::new(
+            let ctx = RequestContext::new(
                 session_id,
                 conn_info,
                 crate::metrics::Protocol::Tcp,
@@ -247,7 +247,7 @@ impl ReportableError for ClientRequestError {
 pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     config: &'static ProxyConfig,
     auth_backend: &'static auth::Backend<'static, ()>,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     cancellation_handler: Arc<CancellationHandlerMain>,
     stream: S,
     mode: ClientMode,
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index df9f79a7e3..fe211adfeb 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -36,7 +36,7 @@ async fn proxy_mitm(
         // begin handshake with end_server
         let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await;
         let (end_client, startup) = match handshake(
-            &RequestMonitoring::test(),
+            &RequestContext::test(),
             client1,
             Some(&server_config1),
             false,
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs
index be821925b5..3de8ca8736 100644
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -162,7 +162,7 @@ impl TestAuth for Scram {
         stream: &mut PqStream<Stream<S>>,
     ) -> anyhow::Result<()> {
         let outcome = auth::AuthFlow::new(stream)
-            .begin(auth::Scram(&self.0, &RequestMonitoring::test()))
+            .begin(auth::Scram(&self.0, &RequestContext::test()))
             .await?
             .authenticate()
             .await?;
@@ -182,11 +182,10 @@ async fn dummy_proxy(
     auth: impl TestAuth + Send,
 ) -> anyhow::Result<()> {
     let (client, _) = read_proxy_protocol(client).await?;
-    let mut stream =
-        match handshake(&RequestMonitoring::test(), client, tls.as_ref(), false).await? {
-            HandshakeData::Startup(stream, _) => stream,
-            HandshakeData::Cancel(_) => bail!("cancellation not supported"),
-        };
+    let mut stream = match handshake(&RequestContext::test(), client, tls.as_ref(), false).await? {
+        HandshakeData::Startup(stream, _) => stream,
+        HandshakeData::Cancel(_) => bail!("cancellation not supported"),
+    };
 
     auth.authenticate(&mut stream).await?;
 
@@ -466,7 +465,7 @@ impl ConnectMechanism for TestConnectMechanism {
 
     async fn connect_once(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
         _node_info: &control_plane::CachedNodeInfo,
         _timeout: std::time::Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
@@ -581,7 +580,7 @@ fn helper_create_connect_info(
 async fn connect_to_compute_success() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism = TestConnectMechanism::new(vec![Wake, Connect]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -599,7 +598,7 @@ async fn connect_to_compute_success() {
 async fn connect_to_compute_retry() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -618,7 +617,7 @@ async fn connect_to_compute_retry() {
 async fn connect_to_compute_non_retry_1() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -637,7 +636,7 @@ async fn connect_to_compute_non_retry_1() {
 async fn connect_to_compute_non_retry_2() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -657,7 +656,7 @@ async fn connect_to_compute_non_retry_3() {
     let _ = env_logger::try_init();
     tokio::time::pause();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism =
         TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]);
     let user_info = helper_create_connect_info(&mechanism);
@@ -689,7 +688,7 @@ async fn connect_to_compute_non_retry_3() {
 async fn wake_retry() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -708,7 +707,7 @@ async fn wake_retry() {
 async fn wake_non_retry() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs
index f9f46bb66c..d09e0b1f41 100644
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -2,7 +2,7 @@ use tracing::{error, info, warn};
 
 use super::connect_compute::ComputeConnectBackend;
 use crate::config::RetryConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::errors::WakeComputeError;
 use crate::control_plane::CachedNodeInfo;
 use crate::error::ReportableError;
@@ -13,7 +13,7 @@ use crate::proxy::retry::{retry_after, should_retry};
 
 pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
     num_retries: &mut u32,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     api: &B,
     config: RetryConfig,
 ) -> Result<CachedNodeInfo, WakeComputeError> {
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 5e9fd151ae..d9dcf6fbb7 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -23,7 +23,7 @@ use crate::compute_ctl::{
     ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest,
 };
 use crate::config::ProxyConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::client::ApiLockError;
 use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError};
 use crate::control_plane::locks::ApiLocks;
@@ -48,7 +48,7 @@ pub(crate) struct PoolingBackend {
 impl PoolingBackend {
     pub(crate) async fn authenticate_with_password(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
         password: &[u8],
     ) -> Result<ComputeCredentials, AuthError> {
@@ -110,7 +110,7 @@ impl PoolingBackend {
 
     pub(crate) async fn authenticate_with_jwt(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
         jwt: String,
     ) -> Result<ComputeCredentials, AuthError> {
@@ -161,7 +161,7 @@ impl PoolingBackend {
     #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
     pub(crate) async fn connect_to_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         conn_info: ConnInfo,
         keys: ComputeCredentials,
         force_new: bool,
@@ -201,7 +201,7 @@ impl PoolingBackend {
     #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
     pub(crate) async fn connect_to_local_proxy(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         conn_info: ConnInfo,
     ) -> Result<http_conn_pool::Client<Send>, HttpConnError> {
         info!("pool: looking for an existing connection");
@@ -249,7 +249,7 @@ impl PoolingBackend {
     #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
     pub(crate) async fn connect_to_local_postgres(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         conn_info: ConnInfo,
     ) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
         if let Some(client) = self.local_pool.get(ctx, &conn_info)? {
@@ -490,7 +490,7 @@ impl ConnectMechanism for TokioMechanism {
 
     async fn connect_once(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         node_info: &CachedNodeInfo,
         timeout: Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
@@ -540,7 +540,7 @@ impl ConnectMechanism for HyperMechanism {
 
     async fn connect_once(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         node_info: &CachedNodeInfo,
         timeout: Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 1845603bf7..07ba1ae9af 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -21,7 +21,7 @@ use {
 use super::conn_pool_lib::{
     Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, GlobalConnPool,
 };
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::messages::MetricsAuxInfo;
 use crate::metrics::Metrics;
 
@@ -53,7 +53,7 @@ impl fmt::Display for ConnInfo {
 
 pub(crate) fn poll_client<C: ClientInnerExt>(
     global_pool: Arc<GlobalConnPool<C>>,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     conn_info: ConnInfo,
     client: C,
     mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs
index 61c39c32c9..fe3c422c3b 100644
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -15,7 +15,7 @@ use super::conn_pool::ClientDataRemote;
 use super::http_conn_pool::ClientDataHttp;
 use super::local_conn_pool::ClientDataLocal;
 use crate::auth::backend::ComputeUserInfo;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
 use crate::types::{DbName, EndpointCacheKey, RoleName};
@@ -380,7 +380,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
 
     pub(crate) fn get(
         self: &Arc<Self>,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         conn_info: &ConnInfo,
     ) -> Result<Option<Client<C>>, HttpConnError> {
         let mut client: Option<ClientInnerCommon<C>> = None;
diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs
index a1d4473b01..bc86c4b1cd 100644
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -12,7 +12,7 @@ use tracing::{debug, error, info, info_span, Instrument};
 
 use super::backend::HttpConnError;
 use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
 use crate::types::EndpointCacheKey;
@@ -212,7 +212,7 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
     #[expect(unused_results)]
     pub(crate) fn get(
         self: &Arc<Self>,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         conn_info: &ConnInfo,
     ) -> Result<Option<Client<C>>, HttpConnError> {
         let result: Result<Option<Client<C>>, HttpConnError>;
@@ -280,7 +280,7 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
 
 pub(crate) fn poll_http2_client(
     global_pool: Arc<GlobalConnPool<Send>>,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     conn_info: &ConnInfo,
     client: Send,
     connection: Connect,
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index 99d4329f88..cadcbd7530 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -36,7 +36,7 @@ use super::conn_pool_lib::{
     Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, DbUserConn,
     EndpointConnPool,
 };
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::Metrics;
 
@@ -88,7 +88,7 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
 
     pub(crate) fn get(
         self: &Arc<Self>,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         conn_info: &ConnInfo,
     ) -> Result<Option<Client<C>>, HttpConnError> {
         let client = self
@@ -159,7 +159,7 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
 #[allow(clippy::too_many_arguments)]
 pub(crate) fn poll_client<C: ClientInnerExt>(
     global_pool: Arc<LocalConnPool<C>>,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     conn_info: ConnInfo,
     client: C,
     mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs
index cf758855fa..59247f03bf 100644
--- a/proxy/src/serverless/mod.rs
+++ b/proxy/src/serverless/mod.rs
@@ -45,7 +45,7 @@ use utils::http::error::ApiError;
 
 use crate::cancellation::CancellationHandlerMain;
 use crate::config::{ProxyConfig, ProxyProtocolV2};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::metrics::Metrics;
 use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo};
 use crate::proxy::run_until_cancelled;
@@ -423,7 +423,7 @@ async fn request_handler(
     if config.http_config.accept_websockets
         && framed_websockets::upgrade::is_upgrade_request(&request)
     {
-        let ctx = RequestMonitoring::new(
+        let ctx = RequestContext::new(
             session_id,
             conn_info,
             crate::metrics::Protocol::Ws,
@@ -458,7 +458,7 @@ async fn request_handler(
         // Return the response so the spawned future can continue.
         Ok(response.map(|b| b.map_err(|x| match x {}).boxed()))
     } else if request.uri().path() == "/sql" && *request.method() == Method::POST {
-        let ctx = RequestMonitoring::new(
+        let ctx = RequestContext::new(
             session_id,
             conn_info,
             crate::metrics::Protocol::Http,
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index f0975617d4..36d8595902 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -34,7 +34,7 @@ use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError};
 use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
 use crate::auth::{endpoint_sni, ComputeUserInfoParseError};
 use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use crate::metrics::{HttpDirection, Metrics};
 use crate::proxy::{run_until_cancelled, NeonOptions};
@@ -133,7 +133,7 @@ impl UserFacingError for ConnInfoError {
 
 fn get_conn_info(
     config: &'static AuthenticationConfig,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     headers: &HeaderMap,
     tls: Option<&TlsConfig>,
 ) -> Result<ConnInfoWithAuth, ConnInfoError> {
@@ -240,7 +240,7 @@ fn get_conn_info(
 
 pub(crate) async fn handle(
     config: &'static ProxyConfig,
-    ctx: RequestMonitoring,
+    ctx: RequestContext,
     request: Request<Incoming>,
     backend: Arc<PoolingBackend>,
     cancel: CancellationToken,
@@ -516,7 +516,7 @@ fn map_isolation_level_to_headers(level: IsolationLevel) -> Option<HeaderValue>
 async fn handle_inner(
     cancel: CancellationToken,
     config: &'static ProxyConfig,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     request: Request<Incoming>,
     backend: Arc<PoolingBackend>,
 ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
@@ -562,7 +562,7 @@ async fn handle_inner(
 async fn handle_db_inner(
     cancel: CancellationToken,
     config: &'static ProxyConfig,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     request: Request<Incoming>,
     conn_info: ConnInfo,
     auth: AuthData,
@@ -733,7 +733,7 @@ pub(crate) fn uuid_to_header_value(id: Uuid) -> HeaderValue {
 }
 
 async fn handle_auth_broker_inner(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     request: Request<Incoming>,
     conn_info: ConnInfo,
     jwt: String,
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index ba36116c2c..4088fea835 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -14,7 +14,7 @@ use tracing::warn;
 
 use crate::cancellation::CancellationHandlerMain;
 use crate::config::ProxyConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::error::{io_error, ReportableError};
 use crate::metrics::Metrics;
 use crate::proxy::{handle_client, ClientMode, ErrorSource};
@@ -126,7 +126,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
 pub(crate) async fn serve_websocket(
     config: &'static ProxyConfig,
     auth_backend: &'static crate::auth::Backend<'static, ()>,
-    ctx: RequestMonitoring,
+    ctx: RequestContext,
     websocket: OnUpgrade,
     cancellation_handler: Arc<CancellationHandlerMain>,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,

From 593e35027a088cadeb74b8c6e6f08877495986b3 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 20 Nov 2024 14:57:59 +0000
Subject: [PATCH 11/51] tests: use fewer pageservers in
 test_sharding_split_smoke (#9804)

## Problem

This test uses a gratuitous number of pageservers (16). This works fine
when there are plenty of system resources, but causes issues on test
runners that have limited resources and run many tests concurrently.

Related: https://github.com/neondatabase/neon/issues/9802

## Summary of changes

- Split from 2 shards to 4, instead of 4 to 8
- Don't give every shard a separate pageserver, let two locations share
each pageserver.

Net result is 4 pageservers instead of 16
---
 test_runner/regress/test_sharding.py | 39 ++++++++++------------------
 1 file changed, 14 insertions(+), 25 deletions(-)

diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 0a4a53356d..84737fc81e 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -515,11 +515,12 @@ def test_sharding_split_smoke(
 
     """
 
-    # We will start with 4 shards and split into 8, then migrate all those
-    # 8 shards onto separate pageservers
-    shard_count = 4
-    split_shard_count = 8
-    neon_env_builder.num_pageservers = split_shard_count * 2
+    # Shard count we start with
+    shard_count = 2
+    # Shard count we split into
+    split_shard_count = 4
+    # We will have 2 shards per pageserver once done (including secondaries)
+    neon_env_builder.num_pageservers = split_shard_count
 
     # 1MiB stripes: enable getting some meaningful data distribution without
     # writing large quantities of data in this test.  The stripe size is given
@@ -591,7 +592,7 @@ def test_sharding_split_smoke(
 
     workload.validate()
 
-    assert len(pre_split_pageserver_ids) == 4
+    assert len(pre_split_pageserver_ids) == shard_count
 
     def shards_on_disk(shard_ids):
         for pageserver in env.pageservers:
@@ -654,9 +655,9 @@ def test_sharding_split_smoke(
     # - shard_count reconciles for the original setup of the tenant
     # - shard_count reconciles for detaching the original secondary locations during split
     # - split_shard_count reconciles during shard splitting, for setting up secondaries.
-    # - shard_count of the child shards will need to fail over to their secondaries
-    # - shard_count of the child shard secondary locations will get moved to emptier nodes
-    expect_reconciles = shard_count * 2 + split_shard_count + shard_count * 2
+    # - split_shard_count/2 of the child shards will need to fail over to their secondaries (since we have 8 shards and 4 pageservers, only 4 will move)
+    expect_reconciles = shard_count * 2 + split_shard_count + split_shard_count / 2
+
     reconcile_ok = env.storage_controller.get_metric_value(
         "storage_controller_reconcile_complete_total", filter={"status": "ok"}
     )
@@ -720,22 +721,10 @@ def test_sharding_split_smoke(
     # dominated by shard count.
     log.info(f"total: {total}")
     assert total == {
-        1: 1,
-        2: 1,
-        3: 1,
-        4: 1,
-        5: 1,
-        6: 1,
-        7: 1,
-        8: 1,
-        9: 1,
-        10: 1,
-        11: 1,
-        12: 1,
-        13: 1,
-        14: 1,
-        15: 1,
-        16: 1,
+        1: 2,
+        2: 2,
+        3: 2,
+        4: 2,
     }
 
     # The controller is not required to lay out the attached locations in any particular way, but

From 67f5f83edcf50e14fb269cd8919bbda3601fdaf0 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 20 Nov 2024 15:56:14 +0000
Subject: [PATCH 12/51] pageserver: avoid reading SLRU blocks for GC on shards
 >0  (#9423)

## Problem

SLRU blocks, which can add up to several gigabytes, are currently
ingested by all shards, multiplying their capacity cost by the shard
count and slowing down ingest. We do this because all shards need the
SLRU pages to do timestamp->LSN lookup for GC.

Related: https://github.com/neondatabase/neon/issues/7512

## Summary of changes

- On non-zero shards, learn the GC offset from shard 0's index instead
of calculating it.
- Add a test `test_sharding_gc` that exercises this
- Do GC in test_pg_regress as a general smoke test that GC functions run
(e.g. this would fail if we were using SLRUs we didn't have)

In this PR we are still ingesting SLRUs everywhere, but not using them
any more. Part 2 PR (https://github.com/neondatabase/neon/pull/9786)
makes the change to not store them at all.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 .../src/tenant/remote_timeline_client.rs      |  23 ++++
 pageserver/src/tenant/timeline.rs             | 116 +++++++++++++-----
 test_runner/fixtures/remote_storage.py        |  16 ++-
 test_runner/regress/test_pg_regress.py        |   4 +-
 test_runner/regress/test_sharding.py          | 110 ++++++++++++++++-
 5 files changed, 228 insertions(+), 41 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 94f42c7827..b910a40547 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -197,6 +197,7 @@ use utils::backoff::{
     self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
 use utils::pausable_failpoint;
+use utils::shard::ShardNumber;
 
 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
@@ -2231,6 +2232,28 @@ impl RemoteTimelineClient {
             UploadQueue::Initialized(x) => x.no_pending_work(),
         }
     }
+
+    /// 'foreign' in the sense that it does not belong to this tenant shard.  This method
+    /// is used during GC for other shards to get the index of shard zero.
+    pub(crate) async fn download_foreign_index(
+        &self,
+        shard_number: ShardNumber,
+        cancel: &CancellationToken,
+    ) -> Result<(IndexPart, Generation, std::time::SystemTime), DownloadError> {
+        let foreign_shard_id = TenantShardId {
+            shard_number,
+            shard_count: self.tenant_shard_id.shard_count,
+            tenant_id: self.tenant_shard_id.tenant_id,
+        };
+        download_index_part(
+            &self.storage_impl,
+            &foreign_shard_id,
+            &self.timeline_id,
+            Generation::MAX,
+            cancel,
+        )
+        .await
+    }
 }
 
 pub(crate) struct UploadQueueAccessor<'a> {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0eb3de21e9..a4289a222f 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -38,6 +38,7 @@ use pageserver_api::{
     shard::{ShardIdentity, ShardNumber, TenantShardId},
 };
 use rand::Rng;
+use remote_storage::DownloadError;
 use serde_with::serde_as;
 use storage_broker::BrokerClientChannel;
 use tokio::{
@@ -4821,6 +4822,86 @@ impl Timeline {
         Ok(())
     }
 
+    async fn find_gc_time_cutoff(
+        &self,
+        pitr: Duration,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<Option<Lsn>, PageReconstructError> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+        if self.shard_identity.is_shard_zero() {
+            // Shard Zero has SLRU data and can calculate the PITR time -> LSN mapping itself
+            let now = SystemTime::now();
+            let time_range = if pitr == Duration::ZERO {
+                humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
+            } else {
+                pitr
+            };
+
+            // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case)
+            let time_cutoff = now.checked_sub(time_range).unwrap_or(now);
+            let timestamp = to_pg_timestamp(time_cutoff);
+
+            let time_cutoff = match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? {
+                LsnForTimestamp::Present(lsn) => Some(lsn),
+                LsnForTimestamp::Future(lsn) => {
+                    // The timestamp is in the future. That sounds impossible,
+                    // but what it really means is that there hasn't been
+                    // any commits since the cutoff timestamp.
+                    //
+                    // In this case we should use the LSN of the most recent commit,
+                    // which is implicitly the last LSN in the log.
+                    debug!("future({})", lsn);
+                    Some(self.get_last_record_lsn())
+                }
+                LsnForTimestamp::Past(lsn) => {
+                    debug!("past({})", lsn);
+                    None
+                }
+                LsnForTimestamp::NoData(lsn) => {
+                    debug!("nodata({})", lsn);
+                    None
+                }
+            };
+            Ok(time_cutoff)
+        } else {
+            // Shards other than shard zero cannot do timestamp->lsn lookups, and must instead learn their GC cutoff
+            // from shard zero's index.  The index doesn't explicitly tell us the time cutoff, but we may assume that
+            // the point up to which shard zero's last_gc_cutoff has advanced will either be the time cutoff, or a
+            // space cutoff that we would also have respected ourselves.
+            match self
+                .remote_client
+                .download_foreign_index(ShardNumber(0), cancel)
+                .await
+            {
+                Ok((index_part, index_generation, _index_mtime)) => {
+                    tracing::info!("GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}",
+                        index_part.metadata.latest_gc_cutoff_lsn());
+                    Ok(Some(index_part.metadata.latest_gc_cutoff_lsn()))
+                }
+                Err(DownloadError::NotFound) => {
+                    // This is unexpected, because during timeline creations shard zero persists to remote
+                    // storage before other shards are called, and during timeline deletion non-zeroth shards are
+                    // deleted before the zeroth one.  However, it should be harmless: if we somehow end up in this
+                    // state, then shard zero should _eventually_ write an index when it GCs.
+                    tracing::warn!("GC couldn't find shard zero's index for timeline");
+                    Ok(None)
+                }
+                Err(e) => {
+                    // TODO: this function should return a different error type than page reconstruct error
+                    Err(PageReconstructError::Other(anyhow::anyhow!(e)))
+                }
+            }
+
+            // TODO: after reading shard zero's GC cutoff, we should validate its generation with the storage
+            // controller.  Otherwise, it is possible that we see the GC cutoff go backwards while shard zero
+            // is going through a migration if we read the old location's index and it has GC'd ahead of the
+            // new location.  This is legal in principle, but problematic in practice because it might result
+            // in a timeline creation succeeding on shard zero ('s new location) but then failing on other shards
+            // because they have GC'd past the branch point.
+        }
+    }
+
     /// Find the Lsns above which layer files need to be retained on
     /// garbage collection.
     ///
@@ -4863,40 +4944,7 @@ impl Timeline {
         // - if PITR interval is set, then this is our cutoff.
         // - if PITR interval is not set, then we do a lookup
         //   based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
-        let time_cutoff = {
-            let now = SystemTime::now();
-            let time_range = if pitr == Duration::ZERO {
-                humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
-            } else {
-                pitr
-            };
-
-            // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case)
-            let time_cutoff = now.checked_sub(time_range).unwrap_or(now);
-            let timestamp = to_pg_timestamp(time_cutoff);
-
-            match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? {
-                LsnForTimestamp::Present(lsn) => Some(lsn),
-                LsnForTimestamp::Future(lsn) => {
-                    // The timestamp is in the future. That sounds impossible,
-                    // but what it really means is that there hasn't been
-                    // any commits since the cutoff timestamp.
-                    //
-                    // In this case we should use the LSN of the most recent commit,
-                    // which is implicitly the last LSN in the log.
-                    debug!("future({})", lsn);
-                    Some(self.get_last_record_lsn())
-                }
-                LsnForTimestamp::Past(lsn) => {
-                    debug!("past({})", lsn);
-                    None
-                }
-                LsnForTimestamp::NoData(lsn) => {
-                    debug!("nodata({})", lsn);
-                    None
-                }
-            }
-        };
+        let time_cutoff = self.find_gc_time_cutoff(pitr, cancel, ctx).await?;
 
         Ok(match (pitr, time_cutoff) {
             (Duration::ZERO, Some(time_cutoff)) => {
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index 7024953661..c630ea98b4 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -77,14 +77,16 @@ class MockS3Server:
 class LocalFsStorage:
     root: Path
 
-    def tenant_path(self, tenant_id: TenantId) -> Path:
+    def tenant_path(self, tenant_id: Union[TenantId, TenantShardId]) -> Path:
         return self.root / "tenants" / str(tenant_id)
 
-    def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
+    def timeline_path(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ) -> Path:
         return self.tenant_path(tenant_id) / "timelines" / str(timeline_id)
 
     def timeline_latest_generation(
-        self, tenant_id: TenantId, timeline_id: TimelineId
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
     ) -> Optional[int]:
         timeline_files = os.listdir(self.timeline_path(tenant_id, timeline_id))
         index_parts = [f for f in timeline_files if f.startswith("index_part")]
@@ -102,7 +104,9 @@ class LocalFsStorage:
             raise RuntimeError(f"No index_part found for {tenant_id}/{timeline_id}")
         return generations[-1]
 
-    def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
+    def index_path(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ) -> Path:
         latest_gen = self.timeline_latest_generation(tenant_id, timeline_id)
         if latest_gen is None:
             filename = TIMELINE_INDEX_PART_FILE_NAME
@@ -126,7 +130,9 @@ class LocalFsStorage:
         filename = f"{local_name}-{generation:08x}"
         return self.timeline_path(tenant_id, timeline_id) / filename
 
-    def index_content(self, tenant_id: TenantId, timeline_id: TimelineId) -> Any:
+    def index_content(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ) -> Any:
         with self.index_path(tenant_id, timeline_id).open("r") as f:
             return json.load(f)
 
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index f4698191eb..6a5e388c53 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -110,13 +110,15 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End
 
     check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)
 
-    # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create.
+    # Ensure that compaction/GC works, on a timeline containing all the diversity that postgres regression tests create.
     # There should have been compactions mid-test as well, this final check is in addition those.
     for shard, pageserver in tenant_get_shards(env, env.initial_tenant):
         pageserver.http_client().timeline_checkpoint(
             shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True
         )
 
+        pageserver.http_client().timeline_gc(shard, env.initial_timeline, None)
+
 
 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 84737fc81e..3194fe6ec4 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -19,7 +19,7 @@ from fixtures.neon_fixtures import (
     wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty
-from fixtures.remote_storage import s3_storage
+from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, s3_storage
 from fixtures.utils import skip_in_debug_build, wait_until
 from fixtures.workload import Workload
 from pytest_httpserver import HTTPServer
@@ -1674,3 +1674,111 @@ def test_top_tenants(neon_env_builder: NeonEnvBuilder):
     )
     assert len(top["shards"]) == n_tenants - 4
     assert set(i["id"] for i in top["shards"]) == set(str(i[0]) for i in tenants[4:])
+
+
+def test_sharding_gc(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Exercise GC in a sharded tenant: because only shard 0 holds SLRU content, it acts as
+    the "leader" for GC, and other shards read its index to learn what LSN they should
+    GC up to.
+    """
+
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+
+    TENANT_CONF = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": 128 * 1024,
+        "compaction_threshold": 1,
+        "compaction_target_size": 128 * 1024,
+        # A short PITR horizon, so that we won't have to sleep too long in the test to wait for it to
+        # happen.
+        "pitr_interval": "1s",
+        # disable background compaction and GC. We invoke it manually when we want it to happen.
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # Disable automatic creation of image layers, as we will create them explicitly when we want them
+        "image_creation_threshold": 9999,
+        "image_layer_creation_check_threshold": 0,
+        "lsn_lease_length": "0s",
+    }
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=shard_count, initial_tenant_conf=TENANT_CONF
+    )
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # Create a branch and write some data
+    workload = Workload(env, tenant_id, timeline_id)
+    initial_lsn = Lsn(workload.endpoint().safe_psql("SELECT pg_current_wal_lsn()")[0][0])
+    log.info(f"Started at LSN: {initial_lsn}")
+
+    workload.init()
+
+    # Write enough data to generate multiple layers
+    for _i in range(10):
+        last_lsn = workload.write_rows(32)
+
+    assert last_lsn > initial_lsn
+
+    log.info(f"Wrote up to last LSN: {last_lsn}")
+
+    # Do full image layer generation. When we subsequently wait for PITR, all historic deltas
+    # should be GC-able
+    for shard_number in range(shard_count):
+        shard = TenantShardId(tenant_id, shard_number, shard_count)
+        env.get_tenant_pageserver(shard).http_client().timeline_compact(
+            shard, timeline_id, force_image_layer_creation=True
+        )
+
+    workload.churn_rows(32)
+
+    time.sleep(5)
+
+    # Invoke GC on a non-zero shard and verify its GC cutoff LSN does not advance
+    shard_one = TenantShardId(tenant_id, 1, shard_count)
+    env.get_tenant_pageserver(shard_one).http_client().timeline_gc(
+        shard_one, timeline_id, gc_horizon=None
+    )
+
+    # Check shard 1's index - GC cutoff LSN should not have advanced
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+    shard_1_index = env.pageserver_remote_storage.index_content(
+        tenant_id=shard_one, timeline_id=timeline_id
+    )
+    shard_1_gc_cutoff_lsn = Lsn(shard_1_index["metadata_bytes"]["latest_gc_cutoff_lsn"])
+    log.info(f"Shard 1 cutoff LSN: {shard_1_gc_cutoff_lsn}")
+    assert shard_1_gc_cutoff_lsn <= last_lsn
+
+    shard_zero = TenantShardId(tenant_id, 0, shard_count)
+    env.get_tenant_pageserver(shard_zero).http_client().timeline_gc(
+        shard_zero, timeline_id, gc_horizon=None
+    )
+
+    # TODO: observe that GC LSN of shard 0 has moved forward in remote storage
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+    shard_0_index = env.pageserver_remote_storage.index_content(
+        tenant_id=shard_zero, timeline_id=timeline_id
+    )
+    shard_0_gc_cutoff_lsn = Lsn(shard_0_index["metadata_bytes"]["latest_gc_cutoff_lsn"])
+    log.info(f"Shard 0 cutoff LSN: {shard_0_gc_cutoff_lsn}")
+    assert shard_0_gc_cutoff_lsn >= last_lsn
+
+    # Invoke GC on all other shards and verify their GC cutoff LSNs
+    for shard_number in range(1, shard_count):
+        shard = TenantShardId(tenant_id, shard_number, shard_count)
+        env.get_tenant_pageserver(shard).http_client().timeline_gc(
+            shard, timeline_id, gc_horizon=None
+        )
+
+        # Verify GC cutoff LSN advanced to match shard 0
+        shard_index = env.pageserver_remote_storage.index_content(
+            tenant_id=shard, timeline_id=timeline_id
+        )
+        shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"])
+        log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}")
+        assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn

From 5ff2f1ee7d2b4b2b6dfb017dfecd5ae3f59cb404 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 20 Nov 2024 17:31:55 +0000
Subject: [PATCH 13/51] pageserver: enable compaction to proceed while
 live-migrating (#5397)

## Problem

Long ago, in #5299 the tenant states for migration are added, but
respected only in a coarse-grained way: when hinted not to do deletions,
tenants will just avoid doing all GC or compaction.

Skipping compaction is not necessary for AttachedMulti, as we will soon
become the primary attached location, and it is not a waste of resources
to proceed with compaction. Instead, per the RFC
https://github.com/neondatabase/neon/pull/5029/files), deletions should
be queued up in this state, and executed later when we switch to
AttachedSingle.

Avoiding compaction in AttachedMulti can have an operational impact if a
tenant is under significant write load, as a long-running migration can
result in a large accumulation of delta layers with commensurate impact
on read latency.

Closes: https://github.com/neondatabase/neon/issues/5396

## Summary of changes

- Add a 'config' part to RemoteTimelineClient so that it can be aware of
the mode of the tenant it belongs to, and wire this through for
construction + updates
- Add a special buffer for delayed deletions, and when in AttachedMulti
route deletions here instead of into the main remote client queue. This
is drained when transitioning to AttachedSingle. If the tenant is
detached or our process dies before then, then these objects are leaked.
- As a quality of life improvement, also use the remote timeline
client's knowledge of the tenant state to avoid submitting remote
consistent LSN updates for validation when in AttachedStale (as we know
these will fail)

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 pageserver/src/tenant.rs                      |  25 +++-
 .../src/tenant/remote_timeline_client.rs      | 122 ++++++++++++++++--
 pageserver/src/tenant/timeline.rs             |   9 +-
 pageserver/src/tenant/timeline/delete.rs      |   2 +-
 pageserver/src/tenant/upload_queue.rs         |   7 +-
 .../regress/test_pageserver_secondary.py      |  24 ++++
 6 files changed, 167 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 8e9e3890ba..2e5f69e3c9 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -189,6 +189,7 @@ pub struct TenantSharedResources {
 /// A [`Tenant`] is really an _attached_ tenant.  The configuration
 /// for an attached tenant is a subset of the [`LocationConf`], represented
 /// in this struct.
+#[derive(Clone)]
 pub(super) struct AttachedTenantConf {
     tenant_conf: TenantConfOpt,
     location: AttachedLocationConfig,
@@ -1807,6 +1808,7 @@ impl Tenant {
             self.tenant_shard_id,
             timeline_id,
             self.generation,
+            &self.tenant_conf.load().location,
         )
     }
 
@@ -2527,6 +2529,10 @@ impl Tenant {
         {
             let conf = self.tenant_conf.load();
 
+            // If we may not delete layers, then simply skip GC.  Even though a tenant
+            // in AttachedMulti state could do GC and just enqueue the blocked deletions,
+            // the only advantage to doing it is to perhaps shrink the LayerMap metadata
+            // a bit sooner than we would achieve by waiting for AttachedSingle status.
             if !conf.location.may_delete_layers_hint() {
                 info!("Skipping GC in location state {:?}", conf.location);
                 return Ok(GcResult::default());
@@ -2568,7 +2574,14 @@ impl Tenant {
 
         {
             let conf = self.tenant_conf.load();
-            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
+
+            // Note that compaction usually requires deletions, but we don't respect
+            // may_delete_layers_hint here: that is because tenants in AttachedMulti
+            // should proceed with compaction even if they can't do deletion, to avoid
+            // accumulating dangerously deep stacks of L0 layers.  Deletions will be
+            // enqueued inside RemoteTimelineClient, and executed layer if/when we transition
+            // to AttachedSingle state.
+            if !conf.location.may_upload_layers_hint() {
                 info!("Skipping compaction in location state {:?}", conf.location);
                 return Ok(false);
             }
@@ -3446,6 +3459,7 @@ impl Tenant {
         // this race is not possible if both request types come from the storage
         // controller (as they should!) because an exclusive op lock is required
         // on the storage controller side.
+
         self.tenant_conf.rcu(|inner| {
             Arc::new(AttachedTenantConf {
                 tenant_conf: new_tenant_conf.clone(),
@@ -3455,20 +3469,22 @@ impl Tenant {
             })
         });
 
+        let updated = self.tenant_conf.load().clone();
+
         self.tenant_conf_updated(&new_tenant_conf);
         // Don't hold self.timelines.lock() during the notifies.
         // There's no risk of deadlock right now, but there could be if we consolidate
         // mutexes in struct Timeline in the future.
         let timelines = self.list_timelines();
         for timeline in timelines {
-            timeline.tenant_conf_updated(&new_tenant_conf);
+            timeline.tenant_conf_updated(&updated);
         }
     }
 
     pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
         let new_tenant_conf = new_conf.tenant_conf.clone();
 
-        self.tenant_conf.store(Arc::new(new_conf));
+        self.tenant_conf.store(Arc::new(new_conf.clone()));
 
         self.tenant_conf_updated(&new_tenant_conf);
         // Don't hold self.timelines.lock() during the notifies.
@@ -3476,7 +3492,7 @@ impl Tenant {
         // mutexes in struct Timeline in the future.
         let timelines = self.list_timelines();
         for timeline in timelines {
-            timeline.tenant_conf_updated(&new_tenant_conf);
+            timeline.tenant_conf_updated(&new_conf);
         }
     }
 
@@ -4544,6 +4560,7 @@ impl Tenant {
             self.tenant_shard_id,
             timeline_id,
             self.generation,
+            &self.tenant_conf.load().location,
         )
     }
 
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index b910a40547..377bc23542 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -241,6 +241,7 @@ use utils::id::{TenantId, TimelineId};
 
 use self::index::IndexPart;
 
+use super::config::AttachedLocationConfig;
 use super::metadata::MetadataUpdate;
 use super::storage_layer::{Layer, LayerName, ResidentLayer};
 use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
@@ -302,6 +303,36 @@ pub enum WaitCompletionError {
 #[derive(Debug, thiserror::Error)]
 #[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
 pub struct UploadQueueNotReadyError;
+/// Behavioral modes that enable seamless live migration.
+///
+/// See docs/rfcs/028-pageserver-migration.md to understand how these fit in.
+struct RemoteTimelineClientConfig {
+    /// If this is false, then update to remote_consistent_lsn are dropped rather
+    /// than being submitted to DeletionQueue for validation.  This behavior is
+    /// used when a tenant attachment is known to have a stale generation number,
+    /// such that validation attempts will always fail.  This is not necessary
+    /// for correctness, but avoids spamming error statistics with failed validations
+    /// when doing migrations of tenants.
+    process_remote_consistent_lsn_updates: bool,
+
+    /// If this is true, then object deletions are held in a buffer in RemoteTimelineClient
+    /// rather than being submitted to the DeletionQueue.  This behavior is used when a tenant
+    /// is known to be multi-attached, in order to avoid disrupting other attached tenants
+    /// whose generations' metadata refers to the deleted objects.
+    block_deletions: bool,
+}
+
+/// RemoteTimelineClientConfig's state is entirely driven by LocationConf, but we do
+/// not carry the entire LocationConf structure: it's much more than we need.  The From
+/// impl extracts the subset of the LocationConf that is interesting to RemoteTimelineClient.
+impl From<&AttachedLocationConfig> for RemoteTimelineClientConfig {
+    fn from(lc: &AttachedLocationConfig) -> Self {
+        Self {
+            block_deletions: !lc.may_delete_layers_hint(),
+            process_remote_consistent_lsn_updates: lc.may_upload_layers_hint(),
+        }
+    }
+}
 
 /// A client for accessing a timeline's data in remote storage.
 ///
@@ -322,7 +353,7 @@ pub struct UploadQueueNotReadyError;
 /// in the index part file, whenever timeline metadata is uploaded.
 ///
 /// Downloads are not queued, they are performed immediately.
-pub struct RemoteTimelineClient {
+pub(crate) struct RemoteTimelineClient {
     conf: &'static PageServerConf,
 
     runtime: tokio::runtime::Handle,
@@ -339,6 +370,9 @@ pub struct RemoteTimelineClient {
 
     deletion_queue_client: DeletionQueueClient,
 
+    /// Subset of tenant configuration used to control upload behaviors during migrations
+    config: std::sync::RwLock<RemoteTimelineClientConfig>,
+
     cancel: CancellationToken,
 }
 
@@ -349,13 +383,14 @@ impl RemoteTimelineClient {
     /// Note: the caller must initialize the upload queue before any uploads can be scheduled,
     /// by calling init_upload_queue.
     ///
-    pub fn new(
+    pub(crate) fn new(
         remote_storage: GenericRemoteStorage,
         deletion_queue_client: DeletionQueueClient,
         conf: &'static PageServerConf,
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         generation: Generation,
+        location_conf: &AttachedLocationConfig,
     ) -> RemoteTimelineClient {
         RemoteTimelineClient {
             conf,
@@ -375,6 +410,7 @@ impl RemoteTimelineClient {
                 &tenant_shard_id,
                 &timeline_id,
             )),
+            config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(location_conf)),
             cancel: CancellationToken::new(),
         }
     }
@@ -430,6 +466,43 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Notify this client of a change to its parent tenant's config, as this may cause us to
+    /// take action (unblocking deletions when transitioning from AttachedMulti to AttachedSingle)
+    pub(super) fn update_config(&self, location_conf: &AttachedLocationConfig) {
+        let new_conf = RemoteTimelineClientConfig::from(location_conf);
+        let unblocked = !new_conf.block_deletions;
+
+        // Update config before draining deletions, so that we don't race with more being
+        // inserted.  This can result in deletions happening our of order, but that does not
+        // violate any invariants: deletions only need to be ordered relative to upload of the index
+        // that dereferences the deleted objects, and we are not changing that order.
+        *self.config.write().unwrap() = new_conf;
+
+        if unblocked {
+            // If we may now delete layers, drain any that were blocked in our old
+            // configuration state
+            let mut queue_locked = self.upload_queue.lock().unwrap();
+
+            if let Ok(queue) = queue_locked.initialized_mut() {
+                let blocked_deletions = std::mem::take(&mut queue.blocked_deletions);
+                for d in blocked_deletions {
+                    if let Err(e) = self.deletion_queue_client.push_layers_sync(
+                        self.tenant_shard_id,
+                        self.timeline_id,
+                        self.generation,
+                        d.layers,
+                    ) {
+                        // This could happen if the pageserver is shut down while a tenant
+                        // is transitioning from a deletion-blocked state: we will leak some
+                        // S3 objects in this case.
+                        warn!("Failed to drain blocked deletions: {}", e);
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
     /// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise.
     pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
         match &mut *self.upload_queue.lock().unwrap() {
@@ -1913,16 +1986,24 @@ impl RemoteTimelineClient {
                     res
                 }
                 UploadOp::Delete(delete) => {
-                    pausable_failpoint!("before-delete-layer-pausable");
-                    self.deletion_queue_client
-                        .push_layers(
-                            self.tenant_shard_id,
-                            self.timeline_id,
-                            self.generation,
-                            delete.layers.clone(),
-                        )
-                        .await
-                        .map_err(|e| anyhow::anyhow!(e))
+                    if self.config.read().unwrap().block_deletions {
+                        let mut queue_locked = self.upload_queue.lock().unwrap();
+                        if let Ok(queue) = queue_locked.initialized_mut() {
+                            queue.blocked_deletions.push(delete.clone());
+                        }
+                        Ok(())
+                    } else {
+                        pausable_failpoint!("before-delete-layer-pausable");
+                        self.deletion_queue_client
+                            .push_layers(
+                                self.tenant_shard_id,
+                                self.timeline_id,
+                                self.generation,
+                                delete.layers.clone(),
+                            )
+                            .await
+                            .map_err(|e| anyhow::anyhow!(e))
+                    }
                 }
                 unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => {
                     // unreachable. Barrier operations are handled synchronously in
@@ -2029,8 +2110,16 @@ impl RemoteTimelineClient {
                         // Legacy mode: skip validating generation
                         upload_queue.visible_remote_consistent_lsn.store(lsn);
                         None
-                    } else {
+                    } else if self
+                        .config
+                        .read()
+                        .unwrap()
+                        .process_remote_consistent_lsn_updates
+                    {
                         Some((lsn, upload_queue.visible_remote_consistent_lsn.clone()))
+                    } else {
+                        // Our config disables remote_consistent_lsn updates: drop it.
+                        None
                     }
                 }
                 UploadOp::Delete(_) => {
@@ -2167,6 +2256,7 @@ impl RemoteTimelineClient {
                         queued_operations: VecDeque::default(),
                         #[cfg(feature = "testing")]
                         dangling_files: HashMap::default(),
+                        blocked_deletions: Vec::new(),
                         shutting_down: false,
                         shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
                     };
@@ -2402,6 +2492,7 @@ mod tests {
     use crate::{
         context::RequestContext,
         tenant::{
+            config::AttachmentMode,
             harness::{TenantHarness, TIMELINE_ID},
             storage_layer::layer::local_layer_path,
             Tenant, Timeline,
@@ -2487,6 +2578,10 @@ mod tests {
 
         /// Construct a RemoteTimelineClient in an arbitrary generation
         fn build_client(&self, generation: Generation) -> Arc<RemoteTimelineClient> {
+            let location_conf = AttachedLocationConfig {
+                generation,
+                attach_mode: AttachmentMode::Single,
+            };
             Arc::new(RemoteTimelineClient {
                 conf: self.harness.conf,
                 runtime: tokio::runtime::Handle::current(),
@@ -2500,6 +2595,7 @@ mod tests {
                     &self.harness.tenant_shard_id,
                     &TIMELINE_ID,
                 )),
+                config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(&location_conf)),
                 cancel: CancellationToken::new(),
             })
         }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index a4289a222f..95864af4d0 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -273,7 +273,7 @@ pub struct Timeline {
 
     /// Remote storage client.
     /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
-    pub remote_client: Arc<RemoteTimelineClient>,
+    pub(crate) remote_client: Arc<RemoteTimelineClient>,
 
     // What page versions do we hold in the repository? If we get a
     // request > last_record_lsn, we need to wait until we receive all
@@ -2172,14 +2172,14 @@ impl Timeline {
             )
     }
 
-    pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
+    pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) {
         // NB: Most tenant conf options are read by background loops, so,
         // changes will automatically be picked up.
 
         // The threshold is embedded in the metric. So, we need to update it.
         {
             let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
-                new_conf,
+                &new_conf.tenant_conf,
                 &self.conf.default_tenant_conf,
             );
 
@@ -2187,6 +2187,9 @@ impl Timeline {
             let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug());
 
             let timeline_id_str = self.timeline_id.to_string();
+
+            self.remote_client.update_config(&new_conf.location);
+
             self.metrics
                 .evictions_with_low_residence_duration
                 .write()
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 13a8dfa51a..67fc710c44 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -283,7 +283,7 @@ impl DeleteTimelineFlow {
 
     /// Shortcut to create Timeline in stopping state and spawn deletion task.
     #[instrument(skip_all, fields(%timeline_id))]
-    pub async fn resume_deletion(
+    pub(crate) async fn resume_deletion(
         tenant: Arc<Tenant>,
         timeline_id: TimelineId,
         local_metadata: &TimelineMetadata,
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 592f41cb21..f14bf2f8c3 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -88,6 +88,9 @@ pub(crate) struct UploadQueueInitialized {
     #[cfg(feature = "testing")]
     pub(crate) dangling_files: HashMap<LayerName, Generation>,
 
+    /// Deletions that are blocked by the tenant configuration
+    pub(crate) blocked_deletions: Vec<Delete>,
+
     /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`.
     pub(crate) shutting_down: bool,
 
@@ -180,6 +183,7 @@ impl UploadQueue {
             queued_operations: VecDeque::new(),
             #[cfg(feature = "testing")]
             dangling_files: HashMap::new(),
+            blocked_deletions: Vec::new(),
             shutting_down: false,
             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
         };
@@ -220,6 +224,7 @@ impl UploadQueue {
             queued_operations: VecDeque::new(),
             #[cfg(feature = "testing")]
             dangling_files: HashMap::new(),
+            blocked_deletions: Vec::new(),
             shutting_down: false,
             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
         };
@@ -270,7 +275,7 @@ pub(crate) struct UploadTask {
 
 /// A deletion of some layers within the lifetime of a timeline.  This is not used
 /// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub(crate) struct Delete {
     pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>,
 }
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index d4aef96735..12134048e6 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -365,6 +365,19 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
     workload.validate(pageserver_a.id)
     workload.validate(pageserver_b.id)
 
+    # Force compaction on destination pageserver
+    pageserver_b.http_client().timeline_compact(tenant_id, timeline_id, force_l0_compaction=True)
+
+    # Destination pageserver is in AttachedMulti, it should have generated deletions but
+    # not enqueued them yet.
+    # Check deletion metrics via prometheus - should be 0 since we're in AttachedMulti
+    assert (
+        pageserver_b.http_client().get_metric_value(
+            "pageserver_deletion_queue_submitted_total",
+        )
+        == 0
+    )
+
     # Revert the origin to secondary
     log.info("Setting origin to Secondary")
     pageserver_a.tenant_location_configure(
@@ -389,6 +402,17 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
         },
     )
 
+    # Transition to AttachedSingle should have drained deletions generated by doing a compaction
+    # while in AttachedMulti.
+    def blocked_deletions_drained():
+        submitted = pageserver_b.http_client().get_metric_value(
+            "pageserver_deletion_queue_submitted_total"
+        )
+        assert submitted is not None
+        assert submitted > 0
+
+    wait_until(10, 0.1, blocked_deletions_drained)
+
     workload.churn_rows(64, pageserver_b.id)
     workload.validate(pageserver_b.id)
     del workload

From f36f0068b83bd536d33c49b238d964dcd96c9479 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 20 Nov 2024 17:50:39 +0000
Subject: [PATCH 14/51] chore(proxy): demote more logs during successful
 connection attempts (#9828)

Follow up to #9803

See https://github.com/neondatabase/cloud/issues/14378

In collaboration with @cloneable and @awarus, we sifted through logs and
simply demoted some logs to debug. This is not at all finished and there
are more logs to review, but we ran out of time in the session we
organised. In any slightly more nuanced cases, we didn't touch the log,
instead leaving a TODO comment.

I've also slightly refactored the sql-over-http body read/length reject
code. I can split that into a separate PR. It just felt natural after I
switched to `read_body_with_limit` as we discussed during the meet.
---
 libs/pq_proto/src/lib.rs                      |  1 +
 proxy/src/bin/local_proxy.rs                  |  2 +-
 proxy/src/bin/proxy.rs                        |  2 +-
 proxy/src/config.rs                           |  2 +-
 proxy/src/control_plane/client/neon.rs        |  1 +
 proxy/src/http/mod.rs                         | 10 +--
 proxy/src/proxy/connect_compute.rs            | 15 +++--
 proxy/src/proxy/copy_bidirectional.rs         |  2 +
 proxy/src/proxy/handshake.rs                  | 10 ++-
 proxy/src/proxy/mod.rs                        |  2 +-
 proxy/src/proxy/passthrough.rs                |  4 +-
 proxy/src/proxy/wake_compute.rs               |  7 +-
 proxy/src/rate_limiter/limit_algorithm.rs     |  6 +-
 .../src/rate_limiter/limit_algorithm/aimd.rs  | 24 ++++---
 proxy/src/redis/cancellation_publisher.rs     |  3 +-
 proxy/src/serverless/backend.rs               |  8 +--
 proxy/src/serverless/conn_pool.rs             |  2 +-
 proxy/src/serverless/conn_pool_lib.rs         |  4 +-
 proxy/src/serverless/http_conn_pool.rs        |  2 +-
 proxy/src/serverless/local_conn_pool.rs       |  4 +-
 proxy/src/serverless/sql_over_http.rs         | 66 +++++++++++--------
 21 files changed, 104 insertions(+), 73 deletions(-)

diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index 9ffaaba584..b9e5387d86 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -184,6 +184,7 @@ pub struct CancelKeyData {
 
 impl fmt::Display for CancelKeyData {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // TODO: this is producing strange results, with 0xffffffff........ always in the logs.
         let hi = (self.backend_pid as u64) << 32;
         let lo = self.cancel_key as u64;
         let id = hi | lo;
diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
index c4ec1300f2..968682cf0f 100644
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -111,7 +111,7 @@ struct SqlOverHttpArgs {
     sql_over_http_cancel_set_shards: usize,
 
     #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
-    sql_over_http_max_request_size_bytes: u64,
+    sql_over_http_max_request_size_bytes: usize,
 
     #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
     sql_over_http_max_response_size_bytes: usize,
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 232721338d..45fbe4a398 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -276,7 +276,7 @@ struct SqlOverHttpArgs {
     sql_over_http_cancel_set_shards: usize,
 
     #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
-    sql_over_http_max_request_size_bytes: u64,
+    sql_over_http_max_request_size_bytes: usize,
 
     #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
     sql_over_http_max_response_size_bytes: usize,
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index b048c9d389..8bc8e3f96f 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -64,7 +64,7 @@ pub struct HttpConfig {
     pub pool_options: GlobalConnPoolOptions,
     pub cancel_set: CancelSet,
     pub client_conn_threshold: u64,
-    pub max_request_size_bytes: u64,
+    pub max_request_size_bytes: usize,
     pub max_response_size_bytes: usize,
 }
 
diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs
index 53f9234926..757ea6720a 100644
--- a/proxy/src/control_plane/client/neon.rs
+++ b/proxy/src/control_plane/client/neon.rs
@@ -380,6 +380,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
         // after getting back a permit - it's possible the cache was filled
         // double check
         if permit.should_check_cache() {
+            // TODO: if there is something in the cache, mark the permit as success.
             check_cache!();
         }
 
diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs
index b1642cedb3..ed88c77256 100644
--- a/proxy/src/http/mod.rs
+++ b/proxy/src/http/mod.rs
@@ -122,18 +122,18 @@ impl Endpoint {
 }
 
 #[derive(Error, Debug)]
-pub(crate) enum ReadBodyError {
+pub(crate) enum ReadBodyError<E> {
     #[error("Content length exceeds limit of {limit} bytes")]
     BodyTooLarge { limit: usize },
 
     #[error(transparent)]
-    Read(#[from] reqwest::Error),
+    Read(#[from] E),
 }
 
-pub(crate) async fn read_body_with_limit(
-    mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
+pub(crate) async fn read_body_with_limit<E>(
+    mut b: impl Body<Data = Bytes, Error = E> + Unpin,
     limit: usize,
-) -> Result<Vec<u8>, ReadBodyError> {
+) -> Result<Vec<u8>, ReadBodyError<E>> {
     // We could use `b.limited().collect().await.to_bytes()` here
     // but this ends up being slightly more efficient as far as I can tell.
 
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index b30aec09c1..2e759b0894 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -117,7 +117,6 @@ where
     node_info.set_keys(user_info.get_keys());
     node_info.allow_self_signed_compute = allow_self_signed_compute;
     mechanism.update_connect_config(&mut node_info.config);
-    let retry_type = RetryType::ConnectToCompute;
 
     // try once
     let err = match mechanism
@@ -129,7 +128,7 @@ where
             Metrics::get().proxy.retries_metric.observe(
                 RetriesMetricGroup {
                     outcome: ConnectOutcome::Success,
-                    retry_type,
+                    retry_type: RetryType::ConnectToCompute,
                 },
                 num_retries.into(),
             );
@@ -147,7 +146,7 @@ where
             Metrics::get().proxy.retries_metric.observe(
                 RetriesMetricGroup {
                     outcome: ConnectOutcome::Failed,
-                    retry_type,
+                    retry_type: RetryType::ConnectToCompute,
                 },
                 num_retries.into(),
             );
@@ -156,8 +155,9 @@ where
         node_info
     } else {
         // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
-        info!("compute node's state has likely changed; requesting a wake-up");
+        debug!("compute node's state has likely changed; requesting a wake-up");
         let old_node_info = invalidate_cache(node_info);
+        // TODO: increment num_retries?
         let mut node_info =
             wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?;
         node_info.reuse_settings(old_node_info);
@@ -169,7 +169,7 @@ where
     // now that we have a new node, try connect to it repeatedly.
     // this can error for a few reasons, for instance:
     // * DNS connection settings haven't quite propagated yet
-    info!("wake_compute success. attempting to connect");
+    debug!("wake_compute success. attempting to connect");
     num_retries = 1;
     loop {
         match mechanism
@@ -181,10 +181,11 @@ where
                 Metrics::get().proxy.retries_metric.observe(
                     RetriesMetricGroup {
                         outcome: ConnectOutcome::Success,
-                        retry_type,
+                        retry_type: RetryType::ConnectToCompute,
                     },
                     num_retries.into(),
                 );
+                // TODO: is this necessary? We have a metric.
                 info!(?num_retries, "connected to compute node after");
                 return Ok(res);
             }
@@ -194,7 +195,7 @@ where
                     Metrics::get().proxy.retries_metric.observe(
                         RetriesMetricGroup {
                             outcome: ConnectOutcome::Failed,
-                            retry_type,
+                            retry_type: RetryType::ConnectToCompute,
                         },
                         num_retries.into(),
                     );
diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs
index 91a3ceff75..4e4af88634 100644
--- a/proxy/src/proxy/copy_bidirectional.rs
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -87,6 +87,8 @@ where
             transfer_one_direction(cx, &mut compute_to_client, compute, client)
                 .map_err(ErrorSource::from_compute)?;
 
+        // TODO: 1 info log, with a enum label for close direction.
+
         // Early termination checks from compute to client.
         if let TransferState::Done(_) = compute_to_client {
             if let TransferState::Running(buf) = &client_to_compute {
diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs
index 3ada3a9995..e27c211932 100644
--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -5,7 +5,7 @@ use pq_proto::{
 };
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{info, warn};
+use tracing::{debug, info, warn};
 
 use crate::auth::endpoint_sni;
 use crate::config::{TlsConfig, PG_ALPN_PROTOCOL};
@@ -199,6 +199,8 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                         .await?;
                 }
 
+                // This log highlights the start of the connection.
+                // This contains useful information for debugging, not logged elsewhere, like role name and endpoint id.
                 info!(
                     ?version,
                     ?params,
@@ -211,7 +213,7 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
             FeStartupPacket::StartupMessage { params, version }
                 if version.major() == 3 && version > PG_PROTOCOL_LATEST =>
             {
-                warn!(?version, "unsupported minor version");
+                debug!(?version, "unsupported minor version");
 
                 // no protocol extensions are supported.
                 // <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/backend/tcop/backend_startup.c#L744-L753>
@@ -233,14 +235,16 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
 
                 info!(
                     ?version,
+                    ?params,
                     session_type = "normal",
                     "successful handshake; unsupported minor version requested"
                 );
                 break Ok(HandshakeData::Startup(stream, params));
             }
-            FeStartupPacket::StartupMessage { version, .. } => {
+            FeStartupPacket::StartupMessage { version, params } => {
                 warn!(
                     ?version,
+                    ?params,
                     session_type = "normal",
                     "unsuccessful handshake; unsupported version"
                 );
diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs
index 4be4006d15..9415b54a4a 100644
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -254,7 +254,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     conn_gauge: NumClientConnectionsGuard<'static>,
 ) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
-    info!(
+    debug!(
         protocol = %ctx.protocol(),
         "handling interactive connection from client"
     );
diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs
index e3b4730982..5e07c8eeae 100644
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -1,5 +1,5 @@
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::info;
+use tracing::debug;
 use utils::measured_stream::MeasuredStream;
 
 use super::copy_bidirectional::ErrorSource;
@@ -45,7 +45,7 @@ pub(crate) async fn proxy_pass(
     );
 
     // Starting from here we only proxy the client's traffic.
-    info!("performing the proxy pass...");
+    debug!("performing the proxy pass...");
     let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute(
         &mut client,
         &mut compute,
diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs
index d09e0b1f41..8a672d48dc 100644
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -17,7 +17,6 @@ pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
     api: &B,
     config: RetryConfig,
 ) -> Result<CachedNodeInfo, WakeComputeError> {
-    let retry_type = RetryType::WakeCompute;
     loop {
         match api.wake_compute(ctx).await {
             Err(e) if !should_retry(&e, *num_retries, config) => {
@@ -26,7 +25,7 @@ pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
                 Metrics::get().proxy.retries_metric.observe(
                     RetriesMetricGroup {
                         outcome: ConnectOutcome::Failed,
-                        retry_type,
+                        retry_type: RetryType::WakeCompute,
                     },
                     (*num_retries).into(),
                 );
@@ -40,10 +39,12 @@ pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
                 Metrics::get().proxy.retries_metric.observe(
                     RetriesMetricGroup {
                         outcome: ConnectOutcome::Success,
-                        retry_type,
+                        retry_type: RetryType::WakeCompute,
                     },
                     (*num_retries).into(),
                 );
+                // TODO: is this necessary? We have a metric.
+                // TODO: this log line is misleading as "wake_compute" might return cached (and stale) info.
                 info!(?num_retries, "compute node woken up after");
                 return Ok(n);
             }
diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs
index 16c398f303..b74a9ab17e 100644
--- a/proxy/src/rate_limiter/limit_algorithm.rs
+++ b/proxy/src/rate_limiter/limit_algorithm.rs
@@ -195,7 +195,11 @@ impl DynamicLimiter {
     ///
     /// Set the outcome to `None` to ignore the job.
     fn release_inner(&self, start: Instant, outcome: Option<Outcome>) {
-        tracing::info!("outcome is {:?}", outcome);
+        if outcome.is_none() {
+            tracing::warn!("outcome is {:?}", outcome);
+        } else {
+            tracing::debug!("outcome is {:?}", outcome);
+        }
         if self.config.initial_limit == 0 {
             return;
         }
diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
index 5332a5184f..3000cc4c2a 100644
--- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs
+++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
@@ -31,26 +31,32 @@ impl LimitAlgorithm for Aimd {
 
                 if utilisation > self.utilisation {
                     let limit = old_limit + self.inc;
-                    let increased_limit = limit.clamp(self.min, self.max);
-                    if increased_limit > old_limit {
-                        tracing::info!(increased_limit, "limit increased");
+                    let new_limit = limit.clamp(self.min, self.max);
+                    if new_limit > old_limit {
+                        tracing::info!(old_limit, new_limit, "limit increased");
+                    } else {
+                        tracing::debug!(old_limit, new_limit, "limit clamped at max");
                     }
 
-                    increased_limit
+                    new_limit
                 } else {
                     old_limit
                 }
             }
             Outcome::Overload => {
-                let limit = old_limit as f32 * self.dec;
+                let new_limit = old_limit as f32 * self.dec;
 
                 // Floor instead of round, so the limit reduces even with small numbers.
                 // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1
-                let limit = limit.floor() as usize;
+                let new_limit = new_limit.floor() as usize;
 
-                let limit = limit.clamp(self.min, self.max);
-                tracing::info!(limit, "limit decreased");
-                limit
+                let new_limit = new_limit.clamp(self.min, self.max);
+                if new_limit < old_limit {
+                    tracing::info!(old_limit, new_limit, "limit decreased");
+                } else {
+                    tracing::debug!(old_limit, new_limit, "limit clamped at min");
+                }
+                new_limit
             }
         }
     }
diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs
index 0000246971..7392b0d316 100644
--- a/proxy/src/redis/cancellation_publisher.rs
+++ b/proxy/src/redis/cancellation_publisher.rs
@@ -121,6 +121,7 @@ impl RedisPublisherClient {
         cancel_key_data: CancelKeyData,
         session_id: Uuid,
     ) -> anyhow::Result<()> {
+        // TODO: review redundant error duplication logs.
         if !self.limiter.check() {
             tracing::info!("Rate limit exceeded. Skipping cancellation message");
             return Err(anyhow::anyhow!("Rate limit exceeded"));
@@ -146,7 +147,7 @@ impl CancellationPublisherMut for RedisPublisherClient {
         tracing::info!("publishing cancellation key to Redis");
         match self.try_publish_internal(cancel_key_data, session_id).await {
             Ok(()) => {
-                tracing::info!("cancellation key successfuly published to Redis");
+                tracing::debug!("cancellation key successfuly published to Redis");
                 Ok(())
             }
             Err(e) => {
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index d9dcf6fbb7..7df978f84c 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -167,10 +167,10 @@ impl PoolingBackend {
         force_new: bool,
     ) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
         let maybe_client = if force_new {
-            info!("pool: pool is disabled");
+            debug!("pool: pool is disabled");
             None
         } else {
-            info!("pool: looking for an existing connection");
+            debug!("pool: looking for an existing connection");
             self.pool.get(ctx, &conn_info)?
         };
 
@@ -204,14 +204,14 @@ impl PoolingBackend {
         ctx: &RequestContext,
         conn_info: ConnInfo,
     ) -> Result<http_conn_pool::Client<Send>, HttpConnError> {
-        info!("pool: looking for an existing connection");
+        debug!("pool: looking for an existing connection");
         if let Ok(Some(client)) = self.http_conn_pool.get(ctx, &conn_info) {
             return Ok(client);
         }
 
         let conn_id = uuid::Uuid::new_v4();
         tracing::Span::current().record("conn_id", display(conn_id));
-        info!(%conn_id, "pool: opening a new connection '{conn_info}'");
+        debug!(%conn_id, "pool: opening a new connection '{conn_info}'");
         let backend = self.auth_backend.as_ref().map(|()| ComputeCredentials {
             info: ComputeUserInfo {
                 user: conn_info.user_info.user.clone(),
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 07ba1ae9af..f716326a68 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -243,7 +243,7 @@ mod tests {
             },
             cancel_set: CancelSet::new(0),
             client_conn_threshold: u64::MAX,
-            max_request_size_bytes: u64::MAX,
+            max_request_size_bytes: usize::MAX,
             max_response_size_bytes: usize::MAX,
         }));
         let pool = GlobalConnPool::new(config);
diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs
index fe3c422c3b..c5db025870 100644
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -232,7 +232,7 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
 
         // do logging outside of the mutex
         if returned {
-            info!(%conn_id, "{pool_name}: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
+            debug!(%conn_id, "{pool_name}: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
         } else {
             info!(%conn_id, "{pool_name}: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
         }
@@ -409,7 +409,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
                 "pid",
                 tracing::field::display(client.inner.get_process_id()),
             );
-            info!(
+            debug!(
                 cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
                 "pool: reusing connection '{conn_info}'"
             );
diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs
index bc86c4b1cd..e9455420c0 100644
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -227,7 +227,7 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
         };
 
         tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id));
-        info!(
+        debug!(
             cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
             "pool: reusing connection '{conn_info}'"
         );
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index cadcbd7530..310af08221 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -29,7 +29,7 @@ use tokio_postgres::tls::NoTlsStream;
 use tokio_postgres::types::ToSql;
 use tokio_postgres::{AsyncMessage, Socket};
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, info_span, warn, Instrument};
+use tracing::{debug, error, info, info_span, warn, Instrument};
 
 use super::backend::HttpConnError;
 use super::conn_pool_lib::{
@@ -110,7 +110,7 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
                 "pid",
                 tracing::field::display(client.inner.get_process_id()),
             );
-            info!(
+            debug!(
                 cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
                 "local_pool: reusing connection '{conn_info}'"
             );
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 36d8595902..ab75086884 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -8,7 +8,7 @@ use http::header::AUTHORIZATION;
 use http::Method;
 use http_body_util::combinators::BoxBody;
 use http_body_util::{BodyExt, Full};
-use hyper::body::{Body, Incoming};
+use hyper::body::Incoming;
 use hyper::http::{HeaderName, HeaderValue};
 use hyper::{header, HeaderMap, Request, Response, StatusCode};
 use pq_proto::StartupMessageParamsBuilder;
@@ -18,7 +18,7 @@ use tokio::time;
 use tokio_postgres::error::{DbError, ErrorPosition, SqlState};
 use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction};
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info};
+use tracing::{debug, error, info};
 use typed_json::json;
 use url::Url;
 use urlencoding;
@@ -36,6 +36,7 @@ use crate::auth::{endpoint_sni, ComputeUserInfoParseError};
 use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig};
 use crate::context::RequestContext;
 use crate::error::{ErrorKind, ReportableError, UserFacingError};
+use crate::http::{read_body_with_limit, ReadBodyError};
 use crate::metrics::{HttpDirection, Metrics};
 use crate::proxy::{run_until_cancelled, NeonOptions};
 use crate::serverless::backend::HttpConnError;
@@ -357,8 +358,6 @@ pub(crate) enum SqlOverHttpError {
     ConnectCompute(#[from] HttpConnError),
     #[error("{0}")]
     ConnInfo(#[from] ConnInfoError),
-    #[error("request is too large (max is {0} bytes)")]
-    RequestTooLarge(u64),
     #[error("response is too large (max is {0} bytes)")]
     ResponseTooLarge(usize),
     #[error("invalid isolation level")]
@@ -377,7 +376,6 @@ impl ReportableError for SqlOverHttpError {
             SqlOverHttpError::ReadPayload(e) => e.get_error_kind(),
             SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(),
             SqlOverHttpError::ConnInfo(e) => e.get_error_kind(),
-            SqlOverHttpError::RequestTooLarge(_) => ErrorKind::User,
             SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User,
             SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User,
             SqlOverHttpError::Postgres(p) => p.get_error_kind(),
@@ -393,7 +391,6 @@ impl UserFacingError for SqlOverHttpError {
             SqlOverHttpError::ReadPayload(p) => p.to_string(),
             SqlOverHttpError::ConnectCompute(c) => c.to_string_client(),
             SqlOverHttpError::ConnInfo(c) => c.to_string_client(),
-            SqlOverHttpError::RequestTooLarge(_) => self.to_string(),
             SqlOverHttpError::ResponseTooLarge(_) => self.to_string(),
             SqlOverHttpError::InvalidIsolationLevel => self.to_string(),
             SqlOverHttpError::Postgres(p) => p.to_string(),
@@ -406,13 +403,12 @@ impl UserFacingError for SqlOverHttpError {
 impl HttpCodeError for SqlOverHttpError {
     fn get_http_status_code(&self) -> StatusCode {
         match self {
-            SqlOverHttpError::ReadPayload(_) => StatusCode::BAD_REQUEST,
+            SqlOverHttpError::ReadPayload(e) => e.get_http_status_code(),
             SqlOverHttpError::ConnectCompute(h) => match h.get_error_kind() {
                 ErrorKind::User => StatusCode::BAD_REQUEST,
                 _ => StatusCode::INTERNAL_SERVER_ERROR,
             },
             SqlOverHttpError::ConnInfo(_) => StatusCode::BAD_REQUEST,
-            SqlOverHttpError::RequestTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE,
             SqlOverHttpError::ResponseTooLarge(_) => StatusCode::INSUFFICIENT_STORAGE,
             SqlOverHttpError::InvalidIsolationLevel => StatusCode::BAD_REQUEST,
             SqlOverHttpError::Postgres(_) => StatusCode::BAD_REQUEST,
@@ -426,19 +422,41 @@ impl HttpCodeError for SqlOverHttpError {
 pub(crate) enum ReadPayloadError {
     #[error("could not read the HTTP request body: {0}")]
     Read(#[from] hyper::Error),
+    #[error("request is too large (max is {limit} bytes)")]
+    BodyTooLarge { limit: usize },
     #[error("could not parse the HTTP request body: {0}")]
     Parse(#[from] serde_json::Error),
 }
 
+impl From<ReadBodyError<hyper::Error>> for ReadPayloadError {
+    fn from(value: ReadBodyError<hyper::Error>) -> Self {
+        match value {
+            ReadBodyError::BodyTooLarge { limit } => Self::BodyTooLarge { limit },
+            ReadBodyError::Read(e) => Self::Read(e),
+        }
+    }
+}
+
 impl ReportableError for ReadPayloadError {
     fn get_error_kind(&self) -> ErrorKind {
         match self {
             ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect,
+            ReadPayloadError::BodyTooLarge { .. } => ErrorKind::User,
             ReadPayloadError::Parse(_) => ErrorKind::User,
         }
     }
 }
 
+impl HttpCodeError for ReadPayloadError {
+    fn get_http_status_code(&self) -> StatusCode {
+        match self {
+            ReadPayloadError::Read(_) => StatusCode::BAD_REQUEST,
+            ReadPayloadError::BodyTooLarge { .. } => StatusCode::PAYLOAD_TOO_LARGE,
+            ReadPayloadError::Parse(_) => StatusCode::BAD_REQUEST,
+        }
+    }
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum SqlOverHttpCancel {
     #[error("query was cancelled")]
@@ -580,28 +598,20 @@ async fn handle_db_inner(
 
     let parsed_headers = HttpHeaders::try_parse(headers)?;
 
-    let request_content_length = match request.body().size_hint().upper() {
-        Some(v) => v,
-        None => config.http_config.max_request_size_bytes + 1,
-    };
-    info!(request_content_length, "request size in bytes");
-    Metrics::get()
-        .proxy
-        .http_conn_content_length_bytes
-        .observe(HttpDirection::Request, request_content_length as f64);
-
-    // we don't have a streaming request support yet so this is to prevent OOM
-    // from a malicious user sending an extremely large request body
-    if request_content_length > config.http_config.max_request_size_bytes {
-        return Err(SqlOverHttpError::RequestTooLarge(
-            config.http_config.max_request_size_bytes,
-        ));
-    }
-
     let fetch_and_process_request = Box::pin(
         async {
-            let body = request.into_body().collect().await?.to_bytes();
-            info!(length = body.len(), "request payload read");
+            let body = read_body_with_limit(
+                request.into_body(),
+                config.http_config.max_request_size_bytes,
+            )
+            .await?;
+
+            Metrics::get()
+                .proxy
+                .http_conn_content_length_bytes
+                .observe(HttpDirection::Request, body.len() as f64);
+
+            debug!(length = body.len(), "request payload read");
             let payload: Payload = serde_json::from_slice(&body)?;
             Ok::<Payload, ReadPayloadError>(payload) // Adjust error type accordingly
         }

From ee26f09e45e72eab940e7721ba9f0a674e84b827 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 20 Nov 2024 18:33:05 +0000
Subject: [PATCH 15/51] pageserver: remove shard split hard link assertion
 (#9829)

## Problem

We were hitting this assertion in debug mode tests sometimes.

This case was being hit when the parent shard has no resident layers.
For instance, this is the case on split retry where the previous attempt
shut-down the parent and deleted local state for it. If the logical size
calculation does not download some layers before we get to the
hardlinking, then the assertion is hit.

## Summary of Changes

Remove the assertion. It's fine for the ancestor to not have any
resident layers at the time of the split.

Closes https://github.com/neondatabase/neon/issues/9412
---
 pageserver/src/tenant/mgr.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 4fc9d740c8..92b2200542 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1719,10 +1719,11 @@ impl TenantManager {
                     parent_layers.push(relative_path.to_owned());
                 }
             }
-            debug_assert!(
-                !parent_layers.is_empty(),
-                "shutdown cannot empty the layermap"
-            );
+
+            if parent_layers.is_empty() {
+                tracing::info!("Ancestor shard has no resident layer to hard link");
+            }
+
             (parent_timelines, parent_layers)
         };
 

From 811fab136fc82aa8b5c85f93dc00d19851c07387 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 20 Nov 2024 20:31:02 +0100
Subject: [PATCH 16/51] scrubber: allow restricting find_garbage to a partial
 tenant id prefix (#9814)

Adds support to the `find_garbage` command to restrict itself to a
partial tenant ID prefix, say `a`, and then it only traverses tenants
with IDs starting with `a`. One can now pass the `--tenant-id-prefix`
parameter.

That way, one can shard the `find_garbage` command and make it run in
parallel.

The PR also does a change of how `remote_storage` first removes trailing
`/`s, only to then add them in the listing function. It turns out that
this isn't neccessary and it prevents the prefix functionality from
working. S3 doesn't do this either.
---
 libs/remote_storage/src/azure_blob.rs   | 24 ++++++++----------------
 storage_scrubber/src/garbage.rs         | 15 ++++++++++++---
 storage_scrubber/src/main.rs            | 13 ++++++++++++-
 storage_scrubber/src/metadata_stream.rs | 13 ++++++++++++-
 4 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index f98d16789c..1c0d43d479 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -97,10 +97,7 @@ impl AzureBlobStorage {
 
     pub fn relative_path_to_name(&self, path: &RemotePath) -> String {
         assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path
-            .get_path()
-            .as_str()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let path_string = path.get_path().as_str();
         match &self.prefix_in_container {
             Some(prefix) => {
                 if prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
@@ -277,19 +274,14 @@ impl RemoteStorage for AzureBlobStorage {
         cancel: &CancellationToken,
     ) -> impl Stream<Item = Result<Listing, DownloadError>> {
         // get the passed prefix or if it is not set use prefix_in_bucket value
-        let list_prefix = prefix
-            .map(|p| self.relative_path_to_name(p))
-            .or_else(|| self.prefix_in_container.clone())
-            .map(|mut p| {
-                // required to end with a separator
-                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
-                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let list_prefix = prefix.map(|p| self.relative_path_to_name(p)).or_else(|| {
+            self.prefix_in_container.clone().map(|mut s| {
+                if !s.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                    s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                 }
-                p
-            });
+                s
+            })
+        });
 
         async_stream::stream! {
             let _permit = self.permit(RequestKind::List, cancel).await?;
diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs
index 91668a42a7..b026efbc3b 100644
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -21,7 +21,7 @@ use utils::{backoff, id::TenantId};
 use crate::{
     cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
     init_remote, list_objects_with_retries,
-    metadata_stream::{stream_tenant_timelines, stream_tenants},
+    metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix},
     BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES,
 };
 
@@ -118,9 +118,17 @@ pub async fn find_garbage(
     console_config: ConsoleConfig,
     depth: TraversingDepth,
     node_kind: NodeKind,
+    tenant_id_prefix: Option<String>,
     output_path: String,
 ) -> anyhow::Result<()> {
-    let garbage = find_garbage_inner(bucket_config, console_config, depth, node_kind).await?;
+    let garbage = find_garbage_inner(
+        bucket_config,
+        console_config,
+        depth,
+        node_kind,
+        tenant_id_prefix,
+    )
+    .await?;
     let serialized = serde_json::to_vec_pretty(&garbage)?;
 
     tokio::fs::write(&output_path, &serialized).await?;
@@ -152,6 +160,7 @@ async fn find_garbage_inner(
     console_config: ConsoleConfig,
     depth: TraversingDepth,
     node_kind: NodeKind,
+    tenant_id_prefix: Option<String>,
 ) -> anyhow::Result<GarbageList> {
     // Construct clients for S3 and for Console API
     let (remote_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
@@ -178,7 +187,7 @@ async fn find_garbage_inner(
 
     // Enumerate Tenants in S3, and check if each one exists in Console
     tracing::info!("Finding all tenants in {}...", bucket_config.desc_str());
-    let tenants = stream_tenants(&remote_client, &target);
+    let tenants = stream_tenants_maybe_prefix(&remote_client, &target, tenant_id_prefix);
     let tenants_checked = tenants.map_ok(|t| {
         let api_client = cloud_admin_api_client.clone();
         let console_cache = console_cache.clone();
diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index 0ffb570984..92979d609e 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -54,6 +54,8 @@ enum Command {
         node_kind: NodeKind,
         #[arg(short, long, default_value_t=TraversingDepth::Tenant)]
         depth: TraversingDepth,
+        #[arg(short, long, default_value=None)]
+        tenant_id_prefix: Option<String>,
         #[arg(short, long, default_value_t = String::from("garbage.json"))]
         output_path: String,
     },
@@ -209,10 +211,19 @@ async fn main() -> anyhow::Result<()> {
         Command::FindGarbage {
             node_kind,
             depth,
+            tenant_id_prefix,
             output_path,
         } => {
             let console_config = ConsoleConfig::from_env()?;
-            find_garbage(bucket_config, console_config, depth, node_kind, output_path).await
+            find_garbage(
+                bucket_config,
+                console_config,
+                depth,
+                node_kind,
+                tenant_id_prefix,
+                output_path,
+            )
+            .await
         }
         Command::PurgeGarbage {
             input_path,
diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs
index efda7c213d..47447d681c 100644
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -17,9 +17,20 @@ use utils::id::{TenantId, TimelineId};
 pub fn stream_tenants<'a>(
     remote_client: &'a GenericRemoteStorage,
     target: &'a RootTarget,
+) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
+    stream_tenants_maybe_prefix(remote_client, target, None)
+}
+/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes
+pub fn stream_tenants_maybe_prefix<'a>(
+    remote_client: &'a GenericRemoteStorage,
+    target: &'a RootTarget,
+    tenant_id_prefix: Option<String>,
 ) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
     try_stream! {
-        let tenants_target = target.tenants_root();
+        let mut tenants_target = target.tenants_root();
+        if let Some(tenant_id_prefix) = tenant_id_prefix {
+            tenants_target.prefix_in_bucket += &tenant_id_prefix;
+        }
         let mut tenants_stream =
             std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target));
         while let Some(chunk) = tenants_stream.next().await {

From 313ebfdb88b7ef5d2f75d4d4c3ccacd7250fe861 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim2404@users.noreply.github.com>
Date: Wed, 20 Nov 2024 20:36:23 +0100
Subject: [PATCH 17/51] [proxy] chore: allow bypassing empty `params` to `/sql`
 endpoint (#9827)

## Problem

```
curl -H "Neon-Connection-String: postgresql://neondb_owner:PASSWORD@ep-autumn-rain-a58lubg0.us-east-2.aws.neon.tech/neondb?sslmode=require" https://ep-autumn-rain-a58lubg0.us-east-2.aws.neon.tech/sql -d '{"query":"SELECT 1","params":[]}'
```

For such a query, I also need to send `params`. Do I really need it?

## Summary of changes
I've marked `params` as optional
---
 proxy/src/serverless/sql_over_http.rs | 61 +++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index ab75086884..1b17495c5d 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -48,6 +48,7 @@ use crate::usage_metrics::{MetricCounter, MetricCounterRecorder};
 struct QueryData {
     query: String,
     #[serde(deserialize_with = "bytes_to_pg_text")]
+    #[serde(default)]
     params: Vec<Option<String>>,
     #[serde(default)]
     array_mode: Option<bool>,
@@ -1105,3 +1106,63 @@ impl Discard<'_> {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_payload() {
+        let payload = "{\"query\":\"SELECT * FROM users WHERE name = ?\",\"params\":[\"test\"],\"arrayMode\":true}";
+        let deserialized_payload: Payload = serde_json::from_str(payload).unwrap();
+
+        match deserialized_payload {
+            Payload::Single(QueryData {
+                query,
+                params,
+                array_mode,
+            }) => {
+                assert_eq!(query, "SELECT * FROM users WHERE name = ?");
+                assert_eq!(params, vec![Some(String::from("test"))]);
+                assert!(array_mode.unwrap());
+            }
+            Payload::Batch(_) => {
+                panic!("deserialization failed: case with single query, one param, and array mode")
+            }
+        }
+
+        let payload = "{\"queries\":[{\"query\":\"SELECT * FROM users0 WHERE name = ?\",\"params\":[\"test0\"], \"arrayMode\":false},{\"query\":\"SELECT * FROM users1 WHERE name = ?\",\"params\":[\"test1\"],\"arrayMode\":true}]}";
+        let deserialized_payload: Payload = serde_json::from_str(payload).unwrap();
+
+        match deserialized_payload {
+            Payload::Batch(BatchQueryData { queries }) => {
+                assert_eq!(queries.len(), 2);
+                for (i, query) in queries.into_iter().enumerate() {
+                    assert_eq!(
+                        query.query,
+                        format!("SELECT * FROM users{i} WHERE name = ?")
+                    );
+                    assert_eq!(query.params, vec![Some(format!("test{i}"))]);
+                    assert_eq!(query.array_mode.unwrap(), i > 0);
+                }
+            }
+            Payload::Single(_) => panic!("deserialization failed: case with multiple queries"),
+        }
+
+        let payload = "{\"query\":\"SELECT 1\"}";
+        let deserialized_payload: Payload = serde_json::from_str(payload).unwrap();
+
+        match deserialized_payload {
+            Payload::Single(QueryData {
+                query,
+                params,
+                array_mode,
+            }) => {
+                assert_eq!(query, "SELECT 1");
+                assert_eq!(params, vec![]);
+                assert!(array_mode.is_none());
+            }
+            Payload::Batch(_) => panic!("deserialization failed: case with only one query"),
+        }
+    }
+}

From 2d6bf176a0698258d17def3011aedcc836a5427f Mon Sep 17 00:00:00 2001
From: Ivan Efremov <ivan@neon.tech>
Date: Wed, 20 Nov 2024 21:36:29 +0200
Subject: [PATCH 18/51] proxy: Refactor http conn pool (#9785)

- Use the same ConnPoolEntry for http connection pool.
- Rename EndpointConnPool to the HttpConnPool.
- Narrow clone bound for client

Fixes #9284
---
 proxy/src/serverless/backend.rs         |  13 +-
 proxy/src/serverless/conn_pool.rs       |   6 +-
 proxy/src/serverless/conn_pool_lib.rs   | 201 ++++++++++++++----------
 proxy/src/serverless/http_conn_pool.rs  | 201 +++++++-----------------
 proxy/src/serverless/local_conn_pool.rs |   1 +
 proxy/src/serverless/mod.rs             |   2 +-
 proxy/src/serverless/sql_over_http.rs   |   1 +
 7 files changed, 187 insertions(+), 238 deletions(-)

diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 7df978f84c..3037e20888 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -12,8 +12,8 @@ use tracing::field::display;
 use tracing::{debug, info};
 
 use super::conn_pool::poll_client;
-use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool};
-use super::http_conn_pool::{self, poll_http2_client, Send};
+use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool};
+use super::http_conn_pool::{self, poll_http2_client, HttpConnPool, Send};
 use super::local_conn_pool::{self, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION};
 use crate::auth::backend::local::StaticAuthRules;
 use crate::auth::backend::{ComputeCredentials, ComputeUserInfo};
@@ -36,9 +36,10 @@ use crate::rate_limiter::EndpointRateLimiter;
 use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX};
 
 pub(crate) struct PoolingBackend {
-    pub(crate) http_conn_pool: Arc<super::http_conn_pool::GlobalConnPool<Send>>,
+    pub(crate) http_conn_pool: Arc<GlobalConnPool<Send, HttpConnPool<Send>>>,
     pub(crate) local_pool: Arc<LocalConnPool<tokio_postgres::Client>>,
-    pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
+    pub(crate) pool:
+        Arc<GlobalConnPool<tokio_postgres::Client, EndpointConnPool<tokio_postgres::Client>>>,
 
     pub(crate) config: &'static ProxyConfig,
     pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>,
@@ -474,7 +475,7 @@ impl ShouldRetryWakeCompute for LocalProxyConnError {
 }
 
 struct TokioMechanism {
-    pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
+    pool: Arc<GlobalConnPool<tokio_postgres::Client, EndpointConnPool<tokio_postgres::Client>>>,
     conn_info: ConnInfo,
     conn_id: uuid::Uuid,
 
@@ -524,7 +525,7 @@ impl ConnectMechanism for TokioMechanism {
 }
 
 struct HyperMechanism {
-    pool: Arc<http_conn_pool::GlobalConnPool<Send>>,
+    pool: Arc<GlobalConnPool<Send, HttpConnPool<Send>>>,
     conn_info: ConnInfo,
     conn_id: uuid::Uuid,
 
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index f716326a68..bd262f45ed 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -19,7 +19,8 @@ use {
 };
 
 use super::conn_pool_lib::{
-    Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, GlobalConnPool,
+    Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, EndpointConnPool,
+    GlobalConnPool,
 };
 use crate::context::RequestContext;
 use crate::control_plane::messages::MetricsAuxInfo;
@@ -52,7 +53,7 @@ impl fmt::Display for ConnInfo {
 }
 
 pub(crate) fn poll_client<C: ClientInnerExt>(
-    global_pool: Arc<GlobalConnPool<C>>,
+    global_pool: Arc<GlobalConnPool<C, EndpointConnPool<C>>>,
     ctx: &RequestContext,
     conn_info: ConnInfo,
     client: C,
@@ -167,6 +168,7 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
     Client::new(inner, conn_info, pool_clone)
 }
 
+#[derive(Clone)]
 pub(crate) struct ClientDataRemote {
     session: tokio::sync::watch::Sender<uuid::Uuid>,
     cancel: CancellationToken,
diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs
index c5db025870..fe1d2563bc 100644
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::marker::PhantomData;
 use std::ops::Deref;
 use std::sync::atomic::{self, AtomicUsize};
 use std::sync::{Arc, Weak};
@@ -43,13 +44,14 @@ impl ConnInfo {
     }
 }
 
+#[derive(Clone)]
 pub(crate) enum ClientDataEnum {
     Remote(ClientDataRemote),
     Local(ClientDataLocal),
-    #[allow(dead_code)]
     Http(ClientDataHttp),
 }
 
+#[derive(Clone)]
 pub(crate) struct ClientInnerCommon<C: ClientInnerExt> {
     pub(crate) inner: C,
     pub(crate) aux: MetricsAuxInfo,
@@ -91,6 +93,7 @@ pub(crate) struct ConnPoolEntry<C: ClientInnerExt> {
 pub(crate) struct EndpointConnPool<C: ClientInnerExt> {
     pools: HashMap<(DbName, RoleName), DbUserConnPool<C>>,
     total_conns: usize,
+    /// max # connections per endpoint
     max_conns: usize,
     _guard: HttpEndpointPoolsGuard<'static>,
     global_connections_count: Arc<AtomicUsize>,
@@ -317,24 +320,49 @@ impl<C: ClientInnerExt> DbUserConn<C> for DbUserConnPool<C> {
     }
 }
 
-pub(crate) struct GlobalConnPool<C: ClientInnerExt> {
+pub(crate) trait EndpointConnPoolExt<C: ClientInnerExt> {
+    fn clear_closed(&mut self) -> usize;
+    fn total_conns(&self) -> usize;
+}
+
+impl<C: ClientInnerExt> EndpointConnPoolExt<C> for EndpointConnPool<C> {
+    fn clear_closed(&mut self) -> usize {
+        let mut clients_removed: usize = 0;
+        for db_pool in self.pools.values_mut() {
+            clients_removed += db_pool.clear_closed_clients(&mut self.total_conns);
+        }
+        clients_removed
+    }
+
+    fn total_conns(&self) -> usize {
+        self.total_conns
+    }
+}
+
+pub(crate) struct GlobalConnPool<C, P>
+where
+    C: ClientInnerExt,
+    P: EndpointConnPoolExt<C>,
+{
     // endpoint -> per-endpoint connection pool
     //
     // That should be a fairly conteded map, so return reference to the per-endpoint
     // pool as early as possible and release the lock.
-    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool<C>>>>,
+    pub(crate) global_pool: DashMap<EndpointCacheKey, Arc<RwLock<P>>>,
 
     /// Number of endpoint-connection pools
     ///
     /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
     /// That seems like far too much effort, so we're using a relaxed increment counter instead.
     /// It's only used for diagnostics.
-    global_pool_size: AtomicUsize,
+    pub(crate) global_pool_size: AtomicUsize,
 
     /// Total number of connections in the pool
-    global_connections_count: Arc<AtomicUsize>,
+    pub(crate) global_connections_count: Arc<AtomicUsize>,
 
-    config: &'static crate::config::HttpConfig,
+    pub(crate) config: &'static crate::config::HttpConfig,
+
+    _marker: PhantomData<C>,
 }
 
 #[derive(Debug, Clone, Copy)]
@@ -357,7 +385,11 @@ pub struct GlobalConnPoolOptions {
     pub max_total_conns: usize,
 }
 
-impl<C: ClientInnerExt> GlobalConnPool<C> {
+impl<C, P> GlobalConnPool<C, P>
+where
+    C: ClientInnerExt,
+    P: EndpointConnPoolExt<C>,
+{
     pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
         let shards = config.pool_options.pool_shards;
         Arc::new(Self {
@@ -365,6 +397,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
             global_pool_size: AtomicUsize::new(0),
             config,
             global_connections_count: Arc::new(AtomicUsize::new(0)),
+            _marker: PhantomData,
         })
     }
 
@@ -378,6 +411,80 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
         self.config.pool_options.idle_timeout
     }
 
+    pub(crate) fn shutdown(&self) {
+        // drops all strong references to endpoint-pools
+        self.global_pool.clear();
+    }
+
+    pub(crate) async fn gc_worker(&self, mut rng: impl Rng) {
+        let epoch = self.config.pool_options.gc_epoch;
+        let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
+        loop {
+            interval.tick().await;
+
+            let shard = rng.gen_range(0..self.global_pool.shards().len());
+            self.gc(shard);
+        }
+    }
+
+    pub(crate) fn gc(&self, shard: usize) {
+        debug!(shard, "pool: performing epoch reclamation");
+
+        // acquire a random shard lock
+        let mut shard = self.global_pool.shards()[shard].write();
+
+        let timer = Metrics::get()
+            .proxy
+            .http_pool_reclaimation_lag_seconds
+            .start_timer();
+        let current_len = shard.len();
+        let mut clients_removed = 0;
+        shard.retain(|endpoint, x| {
+            // if the current endpoint pool is unique (no other strong or weak references)
+            // then it is currently not in use by any connections.
+            if let Some(pool) = Arc::get_mut(x.get_mut()) {
+                let endpoints = pool.get_mut();
+                clients_removed = endpoints.clear_closed();
+
+                if endpoints.total_conns() == 0 {
+                    info!("pool: discarding pool for endpoint {endpoint}");
+                    return false;
+                }
+            }
+
+            true
+        });
+
+        let new_len = shard.len();
+        drop(shard);
+        timer.observe();
+
+        // Do logging outside of the lock.
+        if clients_removed > 0 {
+            let size = self
+                .global_connections_count
+                .fetch_sub(clients_removed, atomic::Ordering::Relaxed)
+                - clients_removed;
+            Metrics::get()
+                .proxy
+                .http_pool_opened_connections
+                .get_metric()
+                .dec_by(clients_removed as i64);
+            info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}");
+        }
+        let removed = current_len - new_len;
+
+        if removed > 0 {
+            let global_pool_size = self
+                .global_pool_size
+                .fetch_sub(removed, atomic::Ordering::Relaxed)
+                - removed;
+            info!("pool: performed global pool gc. size now {global_pool_size}");
+        }
+    }
+}
+
+impl<C: ClientInnerExt> GlobalConnPool<C, EndpointConnPool<C>> {
     pub(crate) fn get(
         self: &Arc<Self>,
         ctx: &RequestContext,
@@ -432,85 +539,6 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
         Ok(None)
     }
 
-    pub(crate) fn shutdown(&self) {
-        // drops all strong references to endpoint-pools
-        self.global_pool.clear();
-    }
-
-    pub(crate) async fn gc_worker(&self, mut rng: impl Rng) {
-        let epoch = self.config.pool_options.gc_epoch;
-        let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
-        loop {
-            interval.tick().await;
-
-            let shard = rng.gen_range(0..self.global_pool.shards().len());
-            self.gc(shard);
-        }
-    }
-
-    pub(crate) fn gc(&self, shard: usize) {
-        debug!(shard, "pool: performing epoch reclamation");
-
-        // acquire a random shard lock
-        let mut shard = self.global_pool.shards()[shard].write();
-
-        let timer = Metrics::get()
-            .proxy
-            .http_pool_reclaimation_lag_seconds
-            .start_timer();
-        let current_len = shard.len();
-        let mut clients_removed = 0;
-        shard.retain(|endpoint, x| {
-            // if the current endpoint pool is unique (no other strong or weak references)
-            // then it is currently not in use by any connections.
-            if let Some(pool) = Arc::get_mut(x.get_mut()) {
-                let EndpointConnPool {
-                    pools, total_conns, ..
-                } = pool.get_mut();
-
-                // ensure that closed clients are removed
-                for db_pool in pools.values_mut() {
-                    clients_removed += db_pool.clear_closed_clients(total_conns);
-                }
-
-                // we only remove this pool if it has no active connections
-                if *total_conns == 0 {
-                    info!("pool: discarding pool for endpoint {endpoint}");
-                    return false;
-                }
-            }
-
-            true
-        });
-
-        let new_len = shard.len();
-        drop(shard);
-        timer.observe();
-
-        // Do logging outside of the lock.
-        if clients_removed > 0 {
-            let size = self
-                .global_connections_count
-                .fetch_sub(clients_removed, atomic::Ordering::Relaxed)
-                - clients_removed;
-            Metrics::get()
-                .proxy
-                .http_pool_opened_connections
-                .get_metric()
-                .dec_by(clients_removed as i64);
-            info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}");
-        }
-        let removed = current_len - new_len;
-
-        if removed > 0 {
-            let global_pool_size = self
-                .global_pool_size
-                .fetch_sub(removed, atomic::Ordering::Relaxed)
-                - removed;
-            info!("pool: performed global pool gc. size now {global_pool_size}");
-        }
-    }
-
     pub(crate) fn get_or_create_endpoint_pool(
         self: &Arc<Self>,
         endpoint: &EndpointCacheKey,
@@ -556,7 +584,6 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
         pool
     }
 }
-
 pub(crate) struct Client<C: ClientInnerExt> {
     span: Span,
     inner: Option<ClientInnerCommon<C>>,
diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs
index e9455420c0..fde38d0de3 100644
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -2,16 +2,17 @@ use std::collections::VecDeque;
 use std::sync::atomic::{self, AtomicUsize};
 use std::sync::{Arc, Weak};
 
-use dashmap::DashMap;
 use hyper::client::conn::http2;
 use hyper_util::rt::{TokioExecutor, TokioIo};
 use parking_lot::RwLock;
-use rand::Rng;
 use tokio::net::TcpStream;
 use tracing::{debug, error, info, info_span, Instrument};
 
 use super::backend::HttpConnError;
-use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
+use super::conn_pool_lib::{
+    ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, ConnPoolEntry,
+    EndpointConnPoolExt, GlobalConnPool,
+};
 use crate::context::RequestContext;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
@@ -23,17 +24,11 @@ pub(crate) type Connect =
     http2::Connection<TokioIo<TcpStream>, hyper::body::Incoming, TokioExecutor>;
 
 #[derive(Clone)]
-pub(crate) struct ConnPoolEntry<C: ClientInnerExt + Clone> {
-    conn: C,
-    conn_id: uuid::Uuid,
-    aux: MetricsAuxInfo,
-}
-
 pub(crate) struct ClientDataHttp();
 
 // Per-endpoint connection pool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
-pub(crate) struct EndpointConnPool<C: ClientInnerExt + Clone> {
+pub(crate) struct HttpConnPool<C: ClientInnerExt + Clone> {
     // TODO(conrad):
     // either we should open more connections depending on stream count
     // (not exposed by hyper, need our own counter)
@@ -48,14 +43,19 @@ pub(crate) struct EndpointConnPool<C: ClientInnerExt + Clone> {
     global_connections_count: Arc<AtomicUsize>,
 }
 
-impl<C: ClientInnerExt + Clone> EndpointConnPool<C> {
+impl<C: ClientInnerExt + Clone> HttpConnPool<C> {
     fn get_conn_entry(&mut self) -> Option<ConnPoolEntry<C>> {
         let Self { conns, .. } = self;
 
         loop {
             let conn = conns.pop_front()?;
-            if !conn.conn.is_closed() {
-                conns.push_back(conn.clone());
+            if !conn.conn.inner.is_closed() {
+                let new_conn = ConnPoolEntry {
+                    conn: conn.conn.clone(),
+                    _last_access: std::time::Instant::now(),
+                };
+
+                conns.push_back(new_conn);
                 return Some(conn);
             }
         }
@@ -69,7 +69,7 @@ impl<C: ClientInnerExt + Clone> EndpointConnPool<C> {
         } = self;
 
         let old_len = conns.len();
-        conns.retain(|conn| conn.conn_id != conn_id);
+        conns.retain(|entry| entry.conn.conn_id != conn_id);
         let new_len = conns.len();
         let removed = old_len - new_len;
         if removed > 0 {
@@ -84,7 +84,22 @@ impl<C: ClientInnerExt + Clone> EndpointConnPool<C> {
     }
 }
 
-impl<C: ClientInnerExt + Clone> Drop for EndpointConnPool<C> {
+impl<C: ClientInnerExt + Clone> EndpointConnPoolExt<C> for HttpConnPool<C> {
+    fn clear_closed(&mut self) -> usize {
+        let Self { conns, .. } = self;
+        let old_len = conns.len();
+        conns.retain(|entry| !entry.conn.inner.is_closed());
+
+        let new_len = conns.len();
+        old_len - new_len
+    }
+
+    fn total_conns(&self) -> usize {
+        self.conns.len()
+    }
+}
+
+impl<C: ClientInnerExt + Clone> Drop for HttpConnPool<C> {
     fn drop(&mut self) {
         if !self.conns.is_empty() {
             self.global_connections_count
@@ -98,117 +113,7 @@ impl<C: ClientInnerExt + Clone> Drop for EndpointConnPool<C> {
     }
 }
 
-pub(crate) struct GlobalConnPool<C: ClientInnerExt + Clone> {
-    // endpoint -> per-endpoint connection pool
-    //
-    // That should be a fairly conteded map, so return reference to the per-endpoint
-    // pool as early as possible and release the lock.
-    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool<C>>>>,
-
-    /// Number of endpoint-connection pools
-    ///
-    /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
-    /// That seems like far too much effort, so we're using a relaxed increment counter instead.
-    /// It's only used for diagnostics.
-    global_pool_size: AtomicUsize,
-
-    /// Total number of connections in the pool
-    global_connections_count: Arc<AtomicUsize>,
-
-    config: &'static crate::config::HttpConfig,
-}
-
-impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
-    pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
-        let shards = config.pool_options.pool_shards;
-        Arc::new(Self {
-            global_pool: DashMap::with_shard_amount(shards),
-            global_pool_size: AtomicUsize::new(0),
-            config,
-            global_connections_count: Arc::new(AtomicUsize::new(0)),
-        })
-    }
-
-    pub(crate) fn shutdown(&self) {
-        // drops all strong references to endpoint-pools
-        self.global_pool.clear();
-    }
-
-    pub(crate) async fn gc_worker(&self, mut rng: impl Rng) {
-        let epoch = self.config.pool_options.gc_epoch;
-        let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
-        loop {
-            interval.tick().await;
-
-            let shard = rng.gen_range(0..self.global_pool.shards().len());
-            self.gc(shard);
-        }
-    }
-
-    fn gc(&self, shard: usize) {
-        debug!(shard, "pool: performing epoch reclamation");
-
-        // acquire a random shard lock
-        let mut shard = self.global_pool.shards()[shard].write();
-
-        let timer = Metrics::get()
-            .proxy
-            .http_pool_reclaimation_lag_seconds
-            .start_timer();
-        let current_len = shard.len();
-        let mut clients_removed = 0;
-        shard.retain(|endpoint, x| {
-            // if the current endpoint pool is unique (no other strong or weak references)
-            // then it is currently not in use by any connections.
-            if let Some(pool) = Arc::get_mut(x.get_mut()) {
-                let EndpointConnPool { conns, .. } = pool.get_mut();
-
-                let old_len = conns.len();
-
-                conns.retain(|conn| !conn.conn.is_closed());
-
-                let new_len = conns.len();
-                let removed = old_len - new_len;
-                clients_removed += removed;
-
-                // we only remove this pool if it has no active connections
-                if conns.is_empty() {
-                    info!("pool: discarding pool for endpoint {endpoint}");
-                    return false;
-                }
-            }
-
-            true
-        });
-
-        let new_len = shard.len();
-        drop(shard);
-        timer.observe();
-
-        // Do logging outside of the lock.
-        if clients_removed > 0 {
-            let size = self
-                .global_connections_count
-                .fetch_sub(clients_removed, atomic::Ordering::Relaxed)
-                - clients_removed;
-            Metrics::get()
-                .proxy
-                .http_pool_opened_connections
-                .get_metric()
-                .dec_by(clients_removed as i64);
-            info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}");
-        }
-        let removed = current_len - new_len;
-
-        if removed > 0 {
-            let global_pool_size = self
-                .global_pool_size
-                .fetch_sub(removed, atomic::Ordering::Relaxed)
-                - removed;
-            info!("pool: performed global pool gc. size now {global_pool_size}");
-        }
-    }
-
+impl<C: ClientInnerExt + Clone> GlobalConnPool<C, HttpConnPool<C>> {
     #[expect(unused_results)]
     pub(crate) fn get(
         self: &Arc<Self>,
@@ -226,27 +131,28 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
             return result;
         };
 
-        tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id));
+        tracing::Span::current().record("conn_id", tracing::field::display(client.conn.conn_id));
         debug!(
             cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
             "pool: reusing connection '{conn_info}'"
         );
         ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
         ctx.success();
-        Ok(Some(Client::new(client.conn, client.aux)))
+
+        Ok(Some(Client::new(client.conn.clone())))
     }
 
     fn get_or_create_endpoint_pool(
         self: &Arc<Self>,
         endpoint: &EndpointCacheKey,
-    ) -> Arc<RwLock<EndpointConnPool<C>>> {
+    ) -> Arc<RwLock<HttpConnPool<C>>> {
         // fast path
         if let Some(pool) = self.global_pool.get(endpoint) {
             return pool.clone();
         }
 
         // slow path
-        let new_pool = Arc::new(RwLock::new(EndpointConnPool {
+        let new_pool = Arc::new(RwLock::new(HttpConnPool {
             conns: VecDeque::new(),
             _guard: Metrics::get().proxy.http_endpoint_pools.guard(),
             global_connections_count: self.global_connections_count.clone(),
@@ -279,7 +185,7 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
 }
 
 pub(crate) fn poll_http2_client(
-    global_pool: Arc<GlobalConnPool<Send>>,
+    global_pool: Arc<GlobalConnPool<Send, HttpConnPool<Send>>>,
     ctx: &RequestContext,
     conn_info: &ConnInfo,
     client: Send,
@@ -299,11 +205,15 @@ pub(crate) fn poll_http2_client(
     let pool = match conn_info.endpoint_cache_key() {
         Some(endpoint) => {
             let pool = global_pool.get_or_create_endpoint_pool(&endpoint);
-
-            pool.write().conns.push_back(ConnPoolEntry {
-                conn: client.clone(),
-                conn_id,
+            let client = ClientInnerCommon {
+                inner: client.clone(),
                 aux: aux.clone(),
+                conn_id,
+                data: ClientDataEnum::Http(ClientDataHttp()),
+            };
+            pool.write().conns.push_back(ConnPoolEntry {
+                conn: client,
+                _last_access: std::time::Instant::now(),
             });
             Metrics::get()
                 .proxy
@@ -335,23 +245,30 @@ pub(crate) fn poll_http2_client(
         .instrument(span),
     );
 
-    Client::new(client, aux)
+    let client = ClientInnerCommon {
+        inner: client,
+        aux,
+        conn_id,
+        data: ClientDataEnum::Http(ClientDataHttp()),
+    };
+
+    Client::new(client)
 }
 
 pub(crate) struct Client<C: ClientInnerExt + Clone> {
-    pub(crate) inner: C,
-    aux: MetricsAuxInfo,
+    pub(crate) inner: ClientInnerCommon<C>,
 }
 
 impl<C: ClientInnerExt + Clone> Client<C> {
-    pub(self) fn new(inner: C, aux: MetricsAuxInfo) -> Self {
-        Self { inner, aux }
+    pub(self) fn new(inner: ClientInnerCommon<C>) -> Self {
+        Self { inner }
     }
 
     pub(crate) fn metrics(&self) -> Arc<MetricCounter> {
+        let aux = &self.inner.aux;
         USAGE_METRICS.register(Ids {
-            endpoint_id: self.aux.endpoint_id,
-            branch_id: self.aux.branch_id,
+            endpoint_id: aux.endpoint_id,
+            branch_id: aux.branch_id,
         })
     }
 }
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index 310af08221..9abe35db08 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -44,6 +44,7 @@ pub(crate) const EXT_NAME: &str = "pg_session_jwt";
 pub(crate) const EXT_VERSION: &str = "0.1.2";
 pub(crate) const EXT_SCHEMA: &str = "auth";
 
+#[derive(Clone)]
 pub(crate) struct ClientDataLocal {
     session: tokio::sync::watch::Sender<uuid::Uuid>,
     cancel: CancellationToken,
diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs
index 59247f03bf..77025f419d 100644
--- a/proxy/src/serverless/mod.rs
+++ b/proxy/src/serverless/mod.rs
@@ -88,7 +88,7 @@ pub async fn task_main(
         }
     });
 
-    let http_conn_pool = http_conn_pool::GlobalConnPool::new(&config.http_config);
+    let http_conn_pool = conn_pool_lib::GlobalConnPool::new(&config.http_config);
     {
         let http_conn_pool = Arc::clone(&http_conn_pool);
         tokio::spawn(async move {
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 1b17495c5d..03b37bccd5 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -779,6 +779,7 @@ async fn handle_auth_broker_inner(
     let _metrics = client.metrics();
 
     Ok(client
+        .inner
         .inner
         .send_request(req)
         .await

From 59c2c3f8ad551b9ca71e4c21862a63585977fbde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 21 Nov 2024 05:46:01 +0100
Subject: [PATCH 19/51] compute_ctl: print OpenTelemetry errors via tracing,
 not stdout (#9830)

Before, `OpenTelemetry` errors were printed to stdout/stderr directly,
causing one of the few log lines without a timestamp, like:

```
OpenTelemetry trace error occurred. error sending request for url (http://localhost:4318/v1/traces)
```

Now, we print:

```
2024-11-21T02:24:20.511160Z  INFO OpenTelemetry error: error sending request for url (http://localhost:4318/v1/traces)
```

I found this while investigating #9731.
---
 compute_tools/src/bin/compute_ctl.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 284db005c8..4689cc2b83 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -105,6 +105,11 @@ fn main() -> Result<()> {
 fn init() -> Result<(String, clap::ArgMatches)> {
     init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
 
+    opentelemetry::global::set_error_handler(|err| {
+        tracing::info!("OpenTelemetry error: {err}");
+    })
+    .expect("global error handler lock poisoned");
+
     let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
     thread::spawn(move || {
         for sig in signals.forever() {

From 42bda5d6323d6655dd866c01dfa680f8479d7e43 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 21 Nov 2024 08:31:24 +0000
Subject: [PATCH 20/51] pageserver: revise metrics lifetime for SecondaryTenant
 (#9818)

## Problem

We saw a scale test failure when one shard went
secondary->attached->secondary in a short period of time -- the metrics
for the shard failed a validation assertion that is meant to ensure the
size metric matches the sum of layer sizes in the SecondaryDetail
struct.

This appears to be due to two SecondaryTenants being alive at the same
time -- the first one was shut down but still had its contributions to
the metrics.

Closes: https://github.com/neondatabase/neon/issues/9628

## Summary of changes

- Refactor code for validating metrics and call it in shutdown as well
as during downloads
- Move code for dropping per-tenant secondary metrics from drop() into
shutdown(), so that once shutdown() completes it is definitely safe to
instantiate another SecondaryTenant for the same tenant.
---
 pageserver/src/tenant/secondary.rs            | 30 +++++++++++------
 pageserver/src/tenant/secondary/downloader.rs | 32 ++++++++-----------
 2 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 1331c07d05..3df89a928c 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -111,15 +111,6 @@ pub(crate) struct SecondaryTenant {
     pub(super) heatmap_total_size_metric: UIntGauge,
 }
 
-impl Drop for SecondaryTenant {
-    fn drop(&mut self) {
-        let tenant_id = self.tenant_shard_id.tenant_id.to_string();
-        let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
-        let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
-        let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
-    }
-}
-
 impl SecondaryTenant {
     pub(crate) fn new(
         tenant_shard_id: TenantShardId,
@@ -167,6 +158,13 @@ impl SecondaryTenant {
 
         // Wait for any secondary downloader work to complete
         self.gate.close().await;
+
+        self.validate_metrics();
+
+        let tenant_id = self.tenant_shard_id.tenant_id.to_string();
+        let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
+        let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+        let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
     }
 
     pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
@@ -254,6 +252,20 @@ impl SecondaryTenant {
         .await
         .expect("secondary eviction should not have panicked");
     }
+
+    /// Exhaustive check that incrementally updated metrics match the actual state.
+    #[cfg(feature = "testing")]
+    fn validate_metrics(&self) {
+        let detail = self.detail.lock().unwrap();
+        let resident_size = detail.total_resident_size();
+
+        assert_eq!(resident_size, self.resident_size_metric.get());
+    }
+
+    #[cfg(not(feature = "testing"))]
+    fn validate_metrics(&self) {
+        // No-op in non-testing builds
+    }
 }
 
 /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 82c5702686..7443261a9c 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -242,6 +242,19 @@ impl SecondaryDetail {
         }
     }
 
+    #[cfg(feature = "testing")]
+    pub(crate) fn total_resident_size(&self) -> u64 {
+        self.timelines
+            .values()
+            .map(|tl| {
+                tl.on_disk_layers
+                    .values()
+                    .map(|v| v.metadata.file_size)
+                    .sum::<u64>()
+            })
+            .sum::<u64>()
+    }
+
     pub(super) fn evict_layer(
         &mut self,
         name: LayerName,
@@ -763,24 +776,7 @@ impl<'a> TenantDownloader<'a> {
         }
 
         // Metrics consistency check in testing builds
-        if cfg!(feature = "testing") {
-            let detail = self.secondary_state.detail.lock().unwrap();
-            let resident_size = detail
-                .timelines
-                .values()
-                .map(|tl| {
-                    tl.on_disk_layers
-                        .values()
-                        .map(|v| v.metadata.file_size)
-                        .sum::<u64>()
-                })
-                .sum::<u64>();
-            assert_eq!(
-                resident_size,
-                self.secondary_state.resident_size_metric.get()
-            );
-        }
-
+        self.secondary_state.validate_metrics();
         // Only update last_etag after a full successful download: this way will not skip
         // the next download, even if the heatmap's actual etag is unchanged.
         self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {

From 0713ff31767d6318e17c2993951160379e60f7e6 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 21 Nov 2024 16:56:56 +0200
Subject: [PATCH 21/51] Bump Postgres version (#9808)

## Problem

I have made a mistake in merging Postgre PRs

## Summary of changes

Restore consistency of submodule referenced.

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/postgres-v17   | 2 +-
 vendor/revisions.json | 8 ++++----
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index aeecd27b1f..284ae56be2 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit aeecd27b1f0775b606409d1cbb9c8aa9853a82af
+Subproject commit 284ae56be2397fd3eaf20777fa220b2d0ad968f5
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 544620db4c..aed79ee87b 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 544620db4ca6945be4f1f686a7fbd2cdfb0bf96f
+Subproject commit aed79ee87b94779cc52ec13e3b74eba6ada93f05
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 3cc152ae2d..f5cfc6fa89 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 3cc152ae2d17b19679c7102486bdb94677705c02
+Subproject commit f5cfc6fa898544050e821ac688adafece1ac3cff
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index e5d795a1a0..3c15b6565f 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit e5d795a1a0c25da907176d37c905badab70e00c0
+Subproject commit 3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f
diff --git a/vendor/revisions.json b/vendor/revisions.json
index a13ef29e45..4dae88e73d 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.2",
-    "e5d795a1a0c25da907176d37c905badab70e00c0"
+    "3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f"
   ],
   "v16": [
     "16.6",
-    "3cc152ae2d17b19679c7102486bdb94677705c02"
+    "f5cfc6fa898544050e821ac688adafece1ac3cff"
   ],
   "v15": [
     "15.10",
-    "544620db4ca6945be4f1f686a7fbd2cdfb0bf96f"
+    "aed79ee87b94779cc52ec13e3b74eba6ada93f05"
   ],
   "v14": [
     "14.15",
-    "aeecd27b1f0775b606409d1cbb9c8aa9853a82af"
+    "284ae56be2397fd3eaf20777fa220b2d0ad968f5"
   ]
 }

From 8d1c44039e5609b57ac4af103b1a0ecad4dd029c Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 21 Nov 2024 16:25:31 +0000
Subject: [PATCH 22/51] Python 3.11 (#9515)

## Problem

On Debian 12 (Bookworm), Python 3.11 is the latest available version.

## Summary of changes
- Update Python to 3.11 in build-tools
- Fix ruff check / format
- Fix mypy
- Use `StrEnum` instead of pair `str`, `Enum`
- Update docs
---
 README.md                                     |   2 +-
 build-tools.Dockerfile                        |   2 +-
 docs/sourcetree.md                            |   8 +-
 poetry.lock                                   |  92 +----
 pyproject.toml                                |   7 +-
 scripts/flaky_tests.py                        |   4 +-
 scripts/force_layer_download.py               |   2 +-
 .../ingest_regress_test_result-new-format.py  |   6 +-
 test_runner/README.md                         |   2 +-
 test_runner/fixtures/auth_tokens.py           |   5 +-
 test_runner/fixtures/benchmark_fixture.py     |  33 +-
 test_runner/fixtures/common_types.py          |  11 +-
 test_runner/fixtures/compare_fixtures.py      |   2 +-
 test_runner/fixtures/compute_reconfigure.py   |   7 +-
 test_runner/fixtures/h2server.py              |   6 +-
 test_runner/fixtures/metrics.py               |  14 +-
 test_runner/fixtures/neon_api.py              |  18 +-
 test_runner/fixtures/neon_cli.py              |  69 ++--
 test_runner/fixtures/neon_fixtures.py         | 366 +++++++++---------
 .../fixtures/pageserver/common_types.py       |   4 +-
 test_runner/fixtures/pageserver/http.py       | 131 +++----
 .../fixtures/pageserver/many_tenants.py       |   3 +-
 test_runner/fixtures/pageserver/utils.py      |  34 +-
 test_runner/fixtures/parametrize.py           |  16 +-
 test_runner/fixtures/paths.py                 |  25 +-
 test_runner/fixtures/pg_version.py            |  33 +-
 test_runner/fixtures/port_distributor.py      |  18 +-
 test_runner/fixtures/remote_storage.py        |  53 ++-
 test_runner/fixtures/safekeeper/http.py       |  12 +-
 .../fixtures/storage_controller_proxy.py      |   4 +-
 test_runner/fixtures/utils.py                 |  16 +-
 test_runner/fixtures/workload.py              |  18 +-
 test_runner/performance/pageserver/util.py    |   5 +-
 test_runner/performance/test_copy.py          |   4 +-
 .../performance/test_physical_replication.py  |   4 +-
 .../test_storage_controller_scale.py          |   6 +-
 .../performance/test_wal_backpressure.py      |   3 +-
 .../regress/test_attach_tenant_config.py      |   3 +-
 test_runner/regress/test_compaction.py        |  11 +-
 test_runner/regress/test_compatibility.py     |   7 +-
 test_runner/regress/test_compute_metrics.py   |  33 +-
 test_runner/regress/test_ddl_forwarding.py    |   8 +-
 .../regress/test_disk_usage_eviction.py       |   3 +-
 .../regress/test_ingestion_layer_size.py      |  12 +-
 test_runner/regress/test_lsn_mapping.py       |  10 +-
 .../regress/test_ondemand_slru_download.py    |   6 +-
 test_runner/regress/test_pageserver_api.py    |   4 +-
 .../regress/test_pageserver_generations.py    |  13 +-
 .../regress/test_pageserver_layer_rolling.py  |   7 +-
 .../regress/test_pageserver_restart.py        |   3 +-
 .../regress/test_pageserver_secondary.py      |   6 +-
 test_runner/regress/test_pg_regress.py        |  10 +-
 test_runner/regress/test_proxy.py             |  10 +-
 test_runner/regress/test_readonly_node.py     |   3 +-
 test_runner/regress/test_remote_storage.py    |   8 +-
 test_runner/regress/test_s3_restore.py        |   6 +-
 test_runner/regress/test_sharding.py          |  15 +-
 test_runner/regress/test_sni_router.py        |  12 +-
 .../regress/test_storage_controller.py        |  14 +-
 test_runner/regress/test_storage_scrubber.py  |  14 +-
 test_runner/regress/test_tenant_detach.py     |  11 +-
 test_runner/regress/test_tenant_relocation.py |   4 +-
 test_runner/regress/test_timeline_archive.py  |   3 +-
 .../regress/test_timeline_detach_ancestor.py  |   3 +-
 .../regress/test_timeline_gc_blocking.py      |   6 +-
 test_runner/regress/test_timeline_size.py     |   3 +-
 test_runner/regress/test_wal_acceptor.py      |  22 +-
 .../regress/test_wal_acceptor_async.py        |  11 +-
 68 files changed, 567 insertions(+), 759 deletions(-)

diff --git a/README.md b/README.md
index e68ef70bdf..1417d6b9e7 100644
--- a/README.md
+++ b/README.md
@@ -132,7 +132,7 @@ make -j`sysctl -n hw.logicalcpu` -s
 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
 
 To run the integration tests or Python scripts (not required to use the code), install
-Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
+Python (3.11 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
 
 
 #### Running neon database
diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile
index c1190b13f4..24e5bbf46f 100644
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -234,7 +234,7 @@ USER nonroot:nonroot
 WORKDIR /home/nonroot
 
 # Python
-ENV PYTHON_VERSION=3.9.19 \
+ENV PYTHON_VERSION=3.11.10 \
     PYENV_ROOT=/home/nonroot/.pyenv \
     PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
 RUN set -e \
diff --git a/docs/sourcetree.md b/docs/sourcetree.md
index 3732bfdab2..1f7e913c07 100644
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -113,21 +113,21 @@ so manual installation of dependencies is not recommended.
 A single virtual environment with all dependencies is described in the single `Pipfile`.
 
 ### Prerequisites
-- Install Python 3.9 (the minimal supported version) or greater.
+- Install Python 3.11 (the minimal supported version) or greater.
     - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesn't work as expected.
-    - If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.:
+    - If you have some trouble with other version you can resolve it by installing Python 3.11 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.:
       ```bash
       # In Ubuntu
       sudo add-apt-repository ppa:deadsnakes/ppa
       sudo apt update
-      sudo apt install python3.9
+      sudo apt install python3.11
       ```
 - Install `poetry`
     - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation).
 - Install dependencies via `./scripts/pysync`.
     - Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile))
       so if you have different version some linting tools can yield different result locally vs in the CI.
-    - You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.9`.
+    - You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.11`.
       This may also disable the `The currently activated Python version X.Y.Z is not supported by the project` warning.
 
 Run `poetry shell` to activate the virtual environment.
diff --git a/poetry.lock b/poetry.lock
index 6171f92391..e38fc15eb7 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
 
 [[package]]
 name = "aiohappyeyeballs"
@@ -114,7 +114,6 @@ files = [
 [package.dependencies]
 aiohappyeyeballs = ">=2.3.0"
 aiosignal = ">=1.1.2"
-async-timeout = {version = ">=4.0,<6.0", markers = "python_version < \"3.11\""}
 attrs = ">=17.3.0"
 frozenlist = ">=1.1.1"
 multidict = ">=4.5,<7.0"
@@ -219,10 +218,8 @@ files = [
 ]
 
 [package.dependencies]
-exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
 idna = ">=2.8"
 sniffio = ">=1.1"
-typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""}
 
 [package.extras]
 doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
@@ -737,10 +734,7 @@ files = [
 [package.dependencies]
 jmespath = ">=0.7.1,<2.0.0"
 python-dateutil = ">=2.1,<3.0.0"
-urllib3 = [
-    {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""},
-    {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""},
-]
+urllib3 = {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}
 
 [package.extras]
 crt = ["awscrt (==0.19.19)"]
@@ -1069,20 +1063,6 @@ docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"]
 ssh = ["paramiko (>=2.4.3)"]
 websockets = ["websocket-client (>=1.3.0)"]
 
-[[package]]
-name = "exceptiongroup"
-version = "1.1.1"
-description = "Backport of PEP 654 (exception groups)"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"},
-    {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"},
-]
-
-[package.extras]
-test = ["pytest (>=6)"]
-
 [[package]]
 name = "execnet"
 version = "1.9.0"
@@ -1110,7 +1090,6 @@ files = [
 
 [package.dependencies]
 click = ">=8.0"
-importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""}
 itsdangerous = ">=2.0"
 Jinja2 = ">=3.0"
 Werkzeug = ">=2.2.2"
@@ -1319,25 +1298,6 @@ files = [
     {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
 ]
 
-[[package]]
-name = "importlib-metadata"
-version = "4.12.0"
-description = "Read metadata from Python packages"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "importlib_metadata-4.12.0-py3-none-any.whl", hash = "sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23"},
-    {file = "importlib_metadata-4.12.0.tar.gz", hash = "sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670"},
-]
-
-[package.dependencies]
-zipp = ">=0.5"
-
-[package.extras]
-docs = ["jaraco.packaging (>=9)", "rst.linker (>=1.9)", "sphinx"]
-perf = ["ipython"]
-testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"]
-
 [[package]]
 name = "iniconfig"
 version = "1.1.1"
@@ -1933,7 +1893,6 @@ files = [
 
 [package.dependencies]
 mypy-extensions = ">=1.0.0"
-tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
 typing-extensions = ">=3.10"
 
 [package.extras]
@@ -2514,11 +2473,9 @@ files = [
 
 [package.dependencies]
 colorama = {version = "*", markers = "sys_platform == \"win32\""}
-exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
 iniconfig = "*"
 packaging = "*"
 pluggy = ">=0.12,<2.0"
-tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
 
 [package.extras]
 testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
@@ -2581,10 +2538,7 @@ files = [
 ]
 
 [package.dependencies]
-pytest = [
-    {version = ">=5.0", markers = "python_version < \"3.10\""},
-    {version = ">=6.2.4", markers = "python_version >= \"3.10\""},
-]
+pytest = {version = ">=6.2.4", markers = "python_version >= \"3.10\""}
 
 [[package]]
 name = "pytest-repeat"
@@ -3092,17 +3046,6 @@ files = [
     {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
 ]
 
-[[package]]
-name = "tomli"
-version = "2.0.1"
-description = "A lil' TOML parser"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
-    {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
-]
-
 [[package]]
 name = "types-jwcrypto"
 version = "1.5.0.20240925"
@@ -3359,16 +3302,6 @@ files = [
     {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
     {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
     {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
-    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
-    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -3523,21 +3456,6 @@ idna = ">=2.0"
 multidict = ">=4.0"
 propcache = ">=0.2.0"
 
-[[package]]
-name = "zipp"
-version = "3.19.1"
-description = "Backport of pathlib-compatible object wrapper for zip files"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"},
-    {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"},
-]
-
-[package.extras]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
-
 [[package]]
 name = "zstandard"
 version = "0.21.0"
@@ -3598,5 +3516,5 @@ cffi = ["cffi (>=1.11)"]
 
 [metadata]
 lock-version = "2.0"
-python-versions = "^3.9"
-content-hash = "8cb9c38d83eec441391c0528ac2fbefde18c734373b2399e07c69382044e8ced"
+python-versions = "^3.11"
+content-hash = "5a9b8c8d409acb840c0a94dcdec6aac9777ccec443d74c78dbd511fa223cd6f6"
diff --git a/pyproject.toml b/pyproject.toml
index 197946fff8..60c6839bc7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ authors = []
 package-mode = false
 
 [tool.poetry.dependencies]
-python = "^3.9"
+python = "^3.11"
 pytest = "^7.4.4"
 psycopg2-binary = "^2.9.10"
 typing-extensions = "^4.6.1"
@@ -89,7 +89,7 @@ module = [
 ignore_missing_imports = true
 
 [tool.ruff]
-target-version = "py39"
+target-version = "py311"
 extend-exclude = [
     "vendor/",
     "target/",
@@ -108,6 +108,3 @@ select = [
     "B", # bugbear
     "UP", # pyupgrade
 ]
-
-[tool.ruff.lint.pyupgrade]
-keep-runtime-typing = true # Remove this stanza when we require Python 3.10
diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py
index 9312f8b3e7..3fb668ed2d 100755
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -14,7 +14,7 @@ import psycopg2.extras
 import toml
 
 if TYPE_CHECKING:
-    from typing import Any, Optional
+    from typing import Any
 
 FLAKY_TESTS_QUERY = """
     SELECT
@@ -65,7 +65,7 @@ def main(args: argparse.Namespace):
         pageserver_virtual_file_io_engine_parameter = ""
 
     # re-use existing records of flaky tests from before parametrization by compaction_algorithm
-    def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]:
+    def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None:
         """Duplicated from parametrize.py"""
         toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM")
         if toml_table is None:
diff --git a/scripts/force_layer_download.py b/scripts/force_layer_download.py
index a4fd3f6132..6dbac08f3c 100644
--- a/scripts/force_layer_download.py
+++ b/scripts/force_layer_download.py
@@ -196,7 +196,7 @@ async def main_impl(args, report_out, client: Client):
             gathered = await asyncio.gather(*get_timeline_id_coros, return_exceptions=True)
             assert len(tenant_ids) == len(gathered)
             tenant_and_timline_ids = []
-            for tid, tlids in zip(tenant_ids, gathered):
+            for tid, tlids in zip(tenant_ids, gathered, strict=False):
                 for tlid in tlids:
                     tenant_and_timline_ids.append((tid, tlid))
         elif len(comps) == 1:
diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py
index e0dd0a7189..c99cfa2b01 100644
--- a/scripts/ingest_regress_test_result-new-format.py
+++ b/scripts/ingest_regress_test_result-new-format.py
@@ -11,7 +11,7 @@ import re
 import sys
 from contextlib import contextmanager
 from dataclasses import dataclass
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from pathlib import Path
 
 import backoff
@@ -140,8 +140,8 @@ def ingest_test_result(
             suite=labels["suite"],
             name=unparametrized_name,
             status=test["status"],
-            started_at=datetime.fromtimestamp(test["time"]["start"] / 1000, tz=timezone.utc),
-            stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=timezone.utc),
+            started_at=datetime.fromtimestamp(test["time"]["start"] / 1000, tz=UTC),
+            stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=UTC),
             duration=test["time"]["duration"],
             flaky=test["flaky"] or test["retriesStatusChange"],
             arch=arch,
diff --git a/test_runner/README.md b/test_runner/README.md
index 55d8d2faa9..f342ef8aaa 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -113,7 +113,7 @@ The test suite has a Python enum with equal name but different meaning:
 
 ```python
 @enum.unique
-class RemoteStorageKind(str, enum.Enum):
+class RemoteStorageKind(StrEnum):
     LOCAL_FS = "local_fs"
     MOCK_S3 = "mock_s3"
     REAL_S3 = "real_s3"
diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py
index be16be81de..8382ce20b3 100644
--- a/test_runner/fixtures/auth_tokens.py
+++ b/test_runner/fixtures/auth_tokens.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from enum import Enum
+from enum import StrEnum
 from typing import Any
 
 import jwt
@@ -37,8 +37,7 @@ class AuthKeys:
         return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id))
 
 
-# TODO: Replace with `StrEnum` when we upgrade to python 3.11
-class TokenScope(str, Enum):
+class TokenScope(StrEnum):
     ADMIN = "admin"
     PAGE_SERVER_API = "pageserverapi"
     GENERATIONS_API = "generations_api"
diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py
index 8e68775471..bb8e75902e 100644
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -9,6 +9,7 @@ import re
 import timeit
 from contextlib import contextmanager
 from datetime import datetime
+from enum import StrEnum
 from pathlib import Path
 from typing import TYPE_CHECKING
 
@@ -24,8 +25,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonPageserver
 
 if TYPE_CHECKING:
-    from collections.abc import Iterator, Mapping
-    from typing import Callable, Optional
+    from collections.abc import Callable, Iterator, Mapping
 
 
 """
@@ -61,7 +61,7 @@ class PgBenchRunResult:
     number_of_threads: int
     number_of_transactions_actually_processed: int
     latency_average: float
-    latency_stddev: Optional[float]
+    latency_stddev: float | None
     tps: float
     run_duration: float
     run_start_timestamp: int
@@ -171,14 +171,14 @@ _PGBENCH_INIT_EXTRACTORS: Mapping[str, re.Pattern[str]] = {
 
 @dataclasses.dataclass
 class PgBenchInitResult:
-    total: Optional[float]
-    drop_tables: Optional[float]
-    create_tables: Optional[float]
-    client_side_generate: Optional[float]
-    server_side_generate: Optional[float]
-    vacuum: Optional[float]
-    primary_keys: Optional[float]
-    foreign_keys: Optional[float]
+    total: float | None
+    drop_tables: float | None
+    create_tables: float | None
+    client_side_generate: float | None
+    server_side_generate: float | None
+    vacuum: float | None
+    primary_keys: float | None
+    foreign_keys: float | None
     duration: float
     start_timestamp: int
     end_timestamp: int
@@ -196,7 +196,7 @@ class PgBenchInitResult:
 
         last_line = stderr.splitlines()[-1]
 
-        timings: dict[str, Optional[float]] = {}
+        timings: dict[str, float | None] = {}
         last_line_items = re.split(r"\(|\)|,", last_line)
         for item in last_line_items:
             for key, regex in _PGBENCH_INIT_EXTRACTORS.items():
@@ -227,7 +227,7 @@ class PgBenchInitResult:
 
 
 @enum.unique
-class MetricReport(str, enum.Enum):  # str is a hack to make it json serializable
+class MetricReport(StrEnum):  # str is a hack to make it json serializable
     # this means that this is a constant test parameter
     # like number of transactions, or number of clients
     TEST_PARAM = "test_param"
@@ -256,9 +256,8 @@ class NeonBenchmarker:
         metric_value: float,
         unit: str,
         report: MetricReport,
-        labels: Optional[
-            dict[str, str]
-        ] = None,  # use this to associate additional key/value pairs in json format for associated Neon object IDs like project ID with the metric
+        # use this to associate additional key/value pairs in json format for associated Neon object IDs like project ID with the metric
+        labels: dict[str, str] | None = None,
     ):
         """
         Record a benchmark result.
@@ -412,7 +411,7 @@ class NeonBenchmarker:
         self,
         pageserver: NeonPageserver,
         metric_name: str,
-        label_filters: Optional[dict[str, str]] = None,
+        label_filters: dict[str, str] | None = None,
     ) -> int:
         """Fetch the value of given int counter from pageserver metrics."""
         all_metrics = pageserver.http_client().get_metrics()
diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py
index 0ea7148f50..212ed9207f 100644
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -2,14 +2,14 @@ from __future__ import annotations
 
 import random
 from dataclasses import dataclass
-from enum import Enum
+from enum import StrEnum
 from functools import total_ordering
 from typing import TYPE_CHECKING, TypeVar
 
 from typing_extensions import override
 
 if TYPE_CHECKING:
-    from typing import Any, Union
+    from typing import Any
 
     T = TypeVar("T", bound="Id")
 
@@ -24,7 +24,7 @@ class Lsn:
     representation is like "1/0123abcd". See also pg_lsn datatype in Postgres
     """
 
-    def __init__(self, x: Union[int, str]):
+    def __init__(self, x: int | str):
         if isinstance(x, int):
             self.lsn_int = x
         else:
@@ -67,7 +67,7 @@ class Lsn:
             return NotImplemented
         return self.lsn_int - other.lsn_int
 
-    def __add__(self, other: Union[int, Lsn]) -> Lsn:
+    def __add__(self, other: int | Lsn) -> Lsn:
         if isinstance(other, int):
             return Lsn(self.lsn_int + other)
         elif isinstance(other, Lsn):
@@ -249,7 +249,6 @@ class TenantShardId:
         return hash(self._tuple())
 
 
-# TODO: Replace with `StrEnum` when we upgrade to python 3.11
-class TimelineArchivalState(str, Enum):
+class TimelineArchivalState(StrEnum):
     ARCHIVED = "Archived"
     UNARCHIVED = "Unarchived"
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 85b6e7a3b8..c0892399bd 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -99,7 +99,7 @@ class PgCompare(ABC):
                 assert row is not None
                 assert len(row) == len(pg_stat.columns)
 
-                for col, val in zip(pg_stat.columns, row):
+                for col, val in zip(pg_stat.columns, row, strict=False):
                     results[f"{pg_stat.table}.{col}"] = int(val)
 
         return results
diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py
index 6354b7f833..4175f67ecb 100644
--- a/test_runner/fixtures/compute_reconfigure.py
+++ b/test_runner/fixtures/compute_reconfigure.py
@@ -12,7 +12,8 @@ from fixtures.common_types import TenantId
 from fixtures.log_helper import log
 
 if TYPE_CHECKING:
-    from typing import Any, Callable, Optional
+    from collections.abc import Callable
+    from typing import Any
 
 
 class ComputeReconfigure:
@@ -20,12 +21,12 @@ class ComputeReconfigure:
         self.server = server
         self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach"
         self.workloads: dict[TenantId, Any] = {}
-        self.on_notify: Optional[Callable[[Any], None]] = None
+        self.on_notify: Callable[[Any], None] | None = None
 
     def register_workload(self, workload: Any):
         self.workloads[workload.tenant_id] = workload
 
-    def register_on_notify(self, fn: Optional[Callable[[Any], None]]):
+    def register_on_notify(self, fn: Callable[[Any], None] | None):
         """
         Add some extra work during a notification, like sleeping to slow things down, or
         logging what was notified.
diff --git a/test_runner/fixtures/h2server.py b/test_runner/fixtures/h2server.py
index e890b2bcf1..3e35af3b5b 100644
--- a/test_runner/fixtures/h2server.py
+++ b/test_runner/fixtures/h2server.py
@@ -31,7 +31,7 @@ from h2.settings import SettingCodes
 from typing_extensions import override
 
 if TYPE_CHECKING:
-    from typing import Any, Optional
+    from typing import Any
 
 
 RequestData = collections.namedtuple("RequestData", ["headers", "data"])
@@ -49,7 +49,7 @@ class H2Protocol(asyncio.Protocol):
     def __init__(self):
         config = H2Configuration(client_side=False, header_encoding="utf-8")
         self.conn = H2Connection(config=config)
-        self.transport: Optional[asyncio.Transport] = None
+        self.transport: asyncio.Transport | None = None
         self.stream_data: dict[int, RequestData] = {}
         self.flow_control_futures: dict[int, asyncio.Future[Any]] = {}
 
@@ -61,7 +61,7 @@ class H2Protocol(asyncio.Protocol):
         self.transport.write(self.conn.data_to_send())
 
     @override
-    def connection_lost(self, exc: Optional[Exception]):
+    def connection_lost(self, exc: Exception | None):
         for future in self.flow_control_futures.values():
             future.cancel()
         self.flow_control_futures = {}
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 39c8f70a9c..330f007a77 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -1,16 +1,12 @@
 from __future__ import annotations
 
 from collections import defaultdict
-from typing import TYPE_CHECKING
 
 from prometheus_client.parser import text_string_to_metric_families
 from prometheus_client.samples import Sample
 
 from fixtures.log_helper import log
 
-if TYPE_CHECKING:
-    from typing import Optional
-
 
 class Metrics:
     metrics: dict[str, list[Sample]]
@@ -20,7 +16,7 @@ class Metrics:
         self.metrics = defaultdict(list)
         self.name = name
 
-    def query_all(self, name: str, filter: Optional[dict[str, str]] = None) -> list[Sample]:
+    def query_all(self, name: str, filter: dict[str, str] | None = None) -> list[Sample]:
         filter = filter or {}
         res: list[Sample] = []
 
@@ -32,7 +28,7 @@ class Metrics:
                 pass
         return res
 
-    def query_one(self, name: str, filter: Optional[dict[str, str]] = None) -> Sample:
+    def query_one(self, name: str, filter: dict[str, str] | None = None) -> Sample:
         res = self.query_all(name, filter or {})
         assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}"
         return res[0]
@@ -47,9 +43,7 @@ class MetricsGetter:
     def get_metrics(self) -> Metrics:
         raise NotImplementedError()
 
-    def get_metric_value(
-        self, name: str, filter: Optional[dict[str, str]] = None
-    ) -> Optional[float]:
+    def get_metric_value(self, name: str, filter: dict[str, str] | None = None) -> float | None:
         metrics = self.get_metrics()
         results = metrics.query_all(name, filter=filter)
         if not results:
@@ -59,7 +53,7 @@ class MetricsGetter:
         return results[0].value
 
     def get_metrics_values(
-        self, names: list[str], filter: Optional[dict[str, str]] = None, absence_ok: bool = False
+        self, names: list[str], filter: dict[str, str] | None = None, absence_ok: bool = False
     ) -> dict[str, float]:
         """
         When fetching multiple named metrics, it is more efficient to use this
diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py
index 9de6681beb..df80f0683c 100644
--- a/test_runner/fixtures/neon_api.py
+++ b/test_runner/fixtures/neon_api.py
@@ -8,7 +8,7 @@ import requests
 from fixtures.log_helper import log
 
 if TYPE_CHECKING:
-    from typing import Any, Literal, Optional
+    from typing import Any, Literal
 
     from fixtures.pg_version import PgVersion
 
@@ -40,11 +40,11 @@ class NeonAPI:
 
     def create_project(
         self,
-        pg_version: Optional[PgVersion] = None,
-        name: Optional[str] = None,
-        branch_name: Optional[str] = None,
-        branch_role_name: Optional[str] = None,
-        branch_database_name: Optional[str] = None,
+        pg_version: PgVersion | None = None,
+        name: str | None = None,
+        branch_name: str | None = None,
+        branch_role_name: str | None = None,
+        branch_database_name: str | None = None,
     ) -> dict[str, Any]:
         data: dict[str, Any] = {
             "project": {
@@ -179,8 +179,8 @@ class NeonAPI:
     def get_connection_uri(
         self,
         project_id: str,
-        branch_id: Optional[str] = None,
-        endpoint_id: Optional[str] = None,
+        branch_id: str | None = None,
+        endpoint_id: str | None = None,
         database_name: str = "neondb",
         role_name: str = "neondb_owner",
         pooled: bool = True,
@@ -249,7 +249,7 @@ class NeonAPI:
 
 @final
 class NeonApiEndpoint:
-    def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]):
+    def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: str | None):
         self.neon_api = neon_api
         self.project_id: str
         self.endpoint_id: str
diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index d220ea57a2..03a02f51fd 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -20,7 +20,6 @@ from fixtures.pg_version import PgVersion
 if TYPE_CHECKING:
     from typing import (
         Any,
-        Optional,
         TypeVar,
         cast,
     )
@@ -36,7 +35,7 @@ class AbstractNeonCli:
     Do not use directly, use specific subclasses instead.
     """
 
-    def __init__(self, extra_env: Optional[dict[str, str]], binpath: Path):
+    def __init__(self, extra_env: dict[str, str] | None, binpath: Path):
         self.extra_env = extra_env
         self.binpath = binpath
 
@@ -45,7 +44,7 @@ class AbstractNeonCli:
     def raw_cli(
         self,
         arguments: list[str],
-        extra_env_vars: Optional[dict[str, str]] = None,
+        extra_env_vars: dict[str, str] | None = None,
         check_return_code=True,
         timeout=None,
     ) -> subprocess.CompletedProcess[str]:
@@ -173,7 +172,7 @@ class NeonLocalCli(AbstractNeonCli):
 
     def __init__(
         self,
-        extra_env: Optional[dict[str, str]],
+        extra_env: dict[str, str] | None,
         binpath: Path,
         repo_dir: Path,
         pg_distrib_dir: Path,
@@ -195,10 +194,10 @@ class NeonLocalCli(AbstractNeonCli):
         tenant_id: TenantId,
         timeline_id: TimelineId,
         pg_version: PgVersion,
-        conf: Optional[dict[str, Any]] = None,
-        shard_count: Optional[int] = None,
-        shard_stripe_size: Optional[int] = None,
-        placement_policy: Optional[str] = None,
+        conf: dict[str, Any] | None = None,
+        shard_count: int | None = None,
+        shard_stripe_size: int | None = None,
+        placement_policy: str | None = None,
         set_default: bool = False,
     ):
         """
@@ -302,8 +301,8 @@ class NeonLocalCli(AbstractNeonCli):
         tenant_id: TenantId,
         timeline_id: TimelineId,
         new_branch_name,
-        ancestor_branch_name: Optional[str] = None,
-        ancestor_start_lsn: Optional[Lsn] = None,
+        ancestor_branch_name: str | None = None,
+        ancestor_start_lsn: Lsn | None = None,
     ):
         cmd = [
             "timeline",
@@ -331,8 +330,8 @@ class NeonLocalCli(AbstractNeonCli):
         base_lsn: Lsn,
         base_tarfile: Path,
         pg_version: PgVersion,
-        end_lsn: Optional[Lsn] = None,
-        wal_tarfile: Optional[Path] = None,
+        end_lsn: Lsn | None = None,
+        wal_tarfile: Path | None = None,
     ):
         cmd = [
             "timeline",
@@ -380,7 +379,7 @@ class NeonLocalCli(AbstractNeonCli):
     def init(
         self,
         init_config: dict[str, Any],
-        force: Optional[str] = None,
+        force: str | None = None,
     ) -> subprocess.CompletedProcess[str]:
         with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile:
             init_config_tmpfile.write(toml.dumps(init_config))
@@ -400,9 +399,9 @@ class NeonLocalCli(AbstractNeonCli):
 
     def storage_controller_start(
         self,
-        timeout_in_seconds: Optional[int] = None,
-        instance_id: Optional[int] = None,
-        base_port: Optional[int] = None,
+        timeout_in_seconds: int | None = None,
+        instance_id: int | None = None,
+        base_port: int | None = None,
     ):
         cmd = ["storage_controller", "start"]
         if timeout_in_seconds is not None:
@@ -413,7 +412,7 @@ class NeonLocalCli(AbstractNeonCli):
             cmd.append(f"--base-port={base_port}")
         return self.raw_cli(cmd)
 
-    def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None):
+    def storage_controller_stop(self, immediate: bool, instance_id: int | None = None):
         cmd = ["storage_controller", "stop"]
         if immediate:
             cmd.extend(["-m", "immediate"])
@@ -424,8 +423,8 @@ class NeonLocalCli(AbstractNeonCli):
     def pageserver_start(
         self,
         id: int,
-        extra_env_vars: Optional[dict[str, str]] = None,
-        timeout_in_seconds: Optional[int] = None,
+        extra_env_vars: dict[str, str] | None = None,
+        timeout_in_seconds: int | None = None,
     ) -> subprocess.CompletedProcess[str]:
         start_args = ["pageserver", "start", f"--id={id}"]
         if timeout_in_seconds is not None:
@@ -442,9 +441,9 @@ class NeonLocalCli(AbstractNeonCli):
     def safekeeper_start(
         self,
         id: int,
-        extra_opts: Optional[list[str]] = None,
-        extra_env_vars: Optional[dict[str, str]] = None,
-        timeout_in_seconds: Optional[int] = None,
+        extra_opts: list[str] | None = None,
+        extra_env_vars: dict[str, str] | None = None,
+        timeout_in_seconds: int | None = None,
     ) -> subprocess.CompletedProcess[str]:
         if extra_opts is not None:
             extra_opts = [f"-e={opt}" for opt in extra_opts]
@@ -457,7 +456,7 @@ class NeonLocalCli(AbstractNeonCli):
         )
 
     def safekeeper_stop(
-        self, id: Optional[int] = None, immediate=False
+        self, id: int | None = None, immediate=False
     ) -> subprocess.CompletedProcess[str]:
         args = ["safekeeper", "stop"]
         if id is not None:
@@ -467,7 +466,7 @@ class NeonLocalCli(AbstractNeonCli):
         return self.raw_cli(args)
 
     def storage_broker_start(
-        self, timeout_in_seconds: Optional[int] = None
+        self, timeout_in_seconds: int | None = None
     ) -> subprocess.CompletedProcess[str]:
         cmd = ["storage_broker", "start"]
         if timeout_in_seconds is not None:
@@ -485,10 +484,10 @@ class NeonLocalCli(AbstractNeonCli):
         http_port: int,
         tenant_id: TenantId,
         pg_version: PgVersion,
-        endpoint_id: Optional[str] = None,
+        endpoint_id: str | None = None,
         hot_standby: bool = False,
-        lsn: Optional[Lsn] = None,
-        pageserver_id: Optional[int] = None,
+        lsn: Lsn | None = None,
+        pageserver_id: int | None = None,
         allow_multiple=False,
     ) -> subprocess.CompletedProcess[str]:
         args = [
@@ -523,11 +522,11 @@ class NeonLocalCli(AbstractNeonCli):
     def endpoint_start(
         self,
         endpoint_id: str,
-        safekeepers: Optional[list[int]] = None,
-        remote_ext_config: Optional[str] = None,
-        pageserver_id: Optional[int] = None,
+        safekeepers: list[int] | None = None,
+        remote_ext_config: str | None = None,
+        pageserver_id: int | None = None,
         allow_multiple=False,
-        basebackup_request_tries: Optional[int] = None,
+        basebackup_request_tries: int | None = None,
     ) -> subprocess.CompletedProcess[str]:
         args = [
             "endpoint",
@@ -555,9 +554,9 @@ class NeonLocalCli(AbstractNeonCli):
     def endpoint_reconfigure(
         self,
         endpoint_id: str,
-        tenant_id: Optional[TenantId] = None,
-        pageserver_id: Optional[int] = None,
-        safekeepers: Optional[list[int]] = None,
+        tenant_id: TenantId | None = None,
+        pageserver_id: int | None = None,
+        safekeepers: list[int] | None = None,
         check_return_code=True,
     ) -> subprocess.CompletedProcess[str]:
         args = ["endpoint", "reconfigure", endpoint_id]
@@ -574,7 +573,7 @@ class NeonLocalCli(AbstractNeonCli):
         endpoint_id: str,
         destroy=False,
         check_return_code=True,
-        mode: Optional[str] = None,
+        mode: str | None = None,
     ) -> subprocess.CompletedProcess[str]:
         args = [
             "endpoint",
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 205a47a9d5..195b788c7e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -17,7 +17,7 @@ from collections.abc import Iterable, Iterator
 from contextlib import closing, contextmanager
 from dataclasses import dataclass
 from datetime import datetime
-from enum import Enum
+from enum import StrEnum
 from functools import cached_property
 from pathlib import Path
 from types import TracebackType
@@ -101,12 +101,10 @@ from fixtures.utils import (
 from .neon_api import NeonAPI, NeonApiEndpoint
 
 if TYPE_CHECKING:
+    from collections.abc import Callable
     from typing import (
         Any,
-        Callable,
-        Optional,
         TypeVar,
-        Union,
     )
 
     from fixtures.paths import SnapshotDirLocked
@@ -338,10 +336,10 @@ class NeonEnvBuilder:
         top_output_dir: Path,
         test_output_dir: Path,
         combination,
-        test_overlay_dir: Optional[Path] = None,
-        pageserver_remote_storage: Optional[RemoteStorage] = None,
+        test_overlay_dir: Path | None = None,
+        pageserver_remote_storage: RemoteStorage | None = None,
         # toml that will be decomposed into `--config-override` flags during `pageserver --init`
-        pageserver_config_override: Optional[str | Callable[[dict[str, Any]], None]] = None,
+        pageserver_config_override: str | Callable[[dict[str, Any]], None] | None = None,
         num_safekeepers: int = 1,
         num_pageservers: int = 1,
         # Use non-standard SK ids to check for various parsing bugs
@@ -349,16 +347,16 @@ class NeonEnvBuilder:
         # fsync is disabled by default to make the tests go faster
         safekeepers_enable_fsync: bool = False,
         auth_enabled: bool = False,
-        rust_log_override: Optional[str] = None,
+        rust_log_override: str | None = None,
         default_branch_name: str = DEFAULT_BRANCH_NAME,
         preserve_database_files: bool = False,
-        initial_tenant: Optional[TenantId] = None,
-        initial_timeline: Optional[TimelineId] = None,
-        pageserver_virtual_file_io_engine: Optional[str] = None,
-        pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = None,
-        safekeeper_extra_opts: Optional[list[str]] = None,
-        storage_controller_port_override: Optional[int] = None,
-        pageserver_virtual_file_io_mode: Optional[str] = None,
+        initial_tenant: TenantId | None = None,
+        initial_timeline: TimelineId | None = None,
+        pageserver_virtual_file_io_engine: str | None = None,
+        pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = None,
+        safekeeper_extra_opts: list[str] | None = None,
+        storage_controller_port_override: int | None = None,
+        pageserver_virtual_file_io_mode: str | None = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -367,7 +365,7 @@ class NeonEnvBuilder:
         # Pageserver remote storage
         self.pageserver_remote_storage = pageserver_remote_storage
         # Safekeepers remote storage
-        self.safekeepers_remote_storage: Optional[RemoteStorage] = None
+        self.safekeepers_remote_storage: RemoteStorage | None = None
 
         self.run_id = run_id
         self.mock_s3_server: MockS3Server = mock_s3_server
@@ -378,7 +376,7 @@ class NeonEnvBuilder:
         self.safekeepers_enable_fsync = safekeepers_enable_fsync
         self.auth_enabled = auth_enabled
         self.default_branch_name = default_branch_name
-        self.env: Optional[NeonEnv] = None
+        self.env: NeonEnv | None = None
         self.keep_remote_storage_contents: bool = True
         self.neon_binpath = neon_binpath
         self.neon_local_binpath = neon_binpath
@@ -391,14 +389,14 @@ class NeonEnvBuilder:
         self.test_output_dir = test_output_dir
         self.test_overlay_dir = test_overlay_dir
         self.overlay_mounts_created_by_us: list[tuple[str, Path]] = []
-        self.config_init_force: Optional[str] = None
+        self.config_init_force: str | None = None
         self.top_output_dir = top_output_dir
-        self.control_plane_compute_hook_api: Optional[str] = None
-        self.storage_controller_config: Optional[dict[Any, Any]] = None
+        self.control_plane_compute_hook_api: str | None = None
+        self.storage_controller_config: dict[Any, Any] | None = None
 
-        self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
+        self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine
 
-        self.pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = (
+        self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = (
             pageserver_default_tenant_config_compaction_algorithm
         )
         if self.pageserver_default_tenant_config_compaction_algorithm is not None:
@@ -440,10 +438,10 @@ class NeonEnvBuilder:
 
     def init_start(
         self,
-        initial_tenant_conf: Optional[dict[str, Any]] = None,
+        initial_tenant_conf: dict[str, Any] | None = None,
         default_remote_storage_if_missing: bool = True,
-        initial_tenant_shard_count: Optional[int] = None,
-        initial_tenant_shard_stripe_size: Optional[int] = None,
+        initial_tenant_shard_count: int | None = None,
+        initial_tenant_shard_stripe_size: int | None = None,
     ) -> NeonEnv:
         """
         Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline.
@@ -781,8 +779,8 @@ class NeonEnvBuilder:
         self,
         kind: RemoteStorageKind,
         user: RemoteStorageUser,
-        bucket_name: Optional[str] = None,
-        bucket_region: Optional[str] = None,
+        bucket_name: str | None = None,
+        bucket_region: str | None = None,
     ) -> RemoteStorage:
         ret = kind.configure(
             self.repo_dir,
@@ -845,9 +843,9 @@ class NeonEnvBuilder:
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc_value: Optional[BaseException],
-        traceback: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        traceback: TracebackType | None,
     ):
         # Stop all the nodes.
         if self.env:
@@ -1136,7 +1134,7 @@ class NeonEnv:
             force=config.config_init_force,
         )
 
-    def start(self, timeout_in_seconds: Optional[int] = None):
+    def start(self, timeout_in_seconds: int | None = None):
         # Storage controller starts first, so that pageserver /re-attach calls don't
         # bounce through retries on startup
         self.storage_controller.start(timeout_in_seconds=timeout_in_seconds)
@@ -1237,7 +1235,7 @@ class NeonEnv:
         ), "env.pageserver must only be used with single pageserver NeonEnv"
         return self.pageservers[0]
 
-    def get_pageserver(self, id: Optional[int]) -> NeonPageserver:
+    def get_pageserver(self, id: int | None) -> NeonPageserver:
         """
         Look up a pageserver by its node ID.
 
@@ -1254,7 +1252,7 @@ class NeonEnv:
 
         raise RuntimeError(f"Pageserver with ID {id} not found")
 
-    def get_tenant_pageserver(self, tenant_id: Union[TenantId, TenantShardId]):
+    def get_tenant_pageserver(self, tenant_id: TenantId | TenantShardId):
         """
         Get the NeonPageserver where this tenant shard is currently attached, according
         to the storage controller.
@@ -1316,12 +1314,12 @@ class NeonEnv:
 
     def create_tenant(
         self,
-        tenant_id: Optional[TenantId] = None,
-        timeline_id: Optional[TimelineId] = None,
-        conf: Optional[dict[str, Any]] = None,
-        shard_count: Optional[int] = None,
-        shard_stripe_size: Optional[int] = None,
-        placement_policy: Optional[str] = None,
+        tenant_id: TenantId | None = None,
+        timeline_id: TimelineId | None = None,
+        conf: dict[str, Any] | None = None,
+        shard_count: int | None = None,
+        shard_stripe_size: int | None = None,
+        placement_policy: str | None = None,
         set_default: bool = False,
     ) -> tuple[TenantId, TimelineId]:
         """
@@ -1343,7 +1341,7 @@ class NeonEnv:
 
         return tenant_id, timeline_id
 
-    def config_tenant(self, tenant_id: Optional[TenantId], conf: dict[str, str]):
+    def config_tenant(self, tenant_id: TenantId | None, conf: dict[str, str]):
         """
         Update tenant config.
         """
@@ -1353,10 +1351,10 @@ class NeonEnv:
     def create_branch(
         self,
         new_branch_name: str = DEFAULT_BRANCH_NAME,
-        tenant_id: Optional[TenantId] = None,
-        ancestor_branch_name: Optional[str] = None,
-        ancestor_start_lsn: Optional[Lsn] = None,
-        new_timeline_id: Optional[TimelineId] = None,
+        tenant_id: TenantId | None = None,
+        ancestor_branch_name: str | None = None,
+        ancestor_start_lsn: Lsn | None = None,
+        new_timeline_id: TimelineId | None = None,
     ) -> TimelineId:
         new_timeline_id = new_timeline_id or TimelineId.generate()
         tenant_id = tenant_id or self.initial_tenant
@@ -1370,8 +1368,8 @@ class NeonEnv:
     def create_timeline(
         self,
         new_branch_name: str,
-        tenant_id: Optional[TenantId] = None,
-        timeline_id: Optional[TimelineId] = None,
+        tenant_id: TenantId | None = None,
+        timeline_id: TimelineId | None = None,
     ) -> TimelineId:
         timeline_id = timeline_id or TimelineId.generate()
         tenant_id = tenant_id or self.initial_tenant
@@ -1396,8 +1394,8 @@ def neon_simple_env(
     compatibility_pg_distrib_dir: Path,
     pg_version: PgVersion,
     pageserver_virtual_file_io_engine: str,
-    pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]],
-    pageserver_virtual_file_io_mode: Optional[str],
+    pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None,
+    pageserver_virtual_file_io_mode: str | None,
 ) -> Iterator[NeonEnv]:
     """
     Simple Neon environment, with 1 safekeeper and 1 pageserver. No authentication, no fsync.
@@ -1453,9 +1451,9 @@ def neon_env_builder(
     test_overlay_dir: Path,
     top_output_dir: Path,
     pageserver_virtual_file_io_engine: str,
-    pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]],
+    pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None,
     record_property: Callable[[str, object], None],
-    pageserver_virtual_file_io_mode: Optional[str],
+    pageserver_virtual_file_io_mode: str | None,
 ) -> Iterator[NeonEnvBuilder]:
     """
     Fixture to create a Neon environment for test.
@@ -1530,7 +1528,7 @@ class LogUtils:
 
     def log_contains(
         self, pattern: str, offset: None | LogCursor = None
-    ) -> Optional[tuple[str, LogCursor]]:
+    ) -> tuple[str, LogCursor] | None:
         """Check that the log contains a line that matches the given regex"""
         logfile = self.logfile
         if not logfile.exists():
@@ -1569,14 +1567,13 @@ class StorageControllerApiException(Exception):
 
 # See libs/pageserver_api/src/controller_api.rs
 # for the rust definitions of the enums below
-# TODO: Replace with `StrEnum` when we upgrade to python 3.11
-class PageserverAvailability(str, Enum):
+class PageserverAvailability(StrEnum):
     ACTIVE = "Active"
     UNAVAILABLE = "Unavailable"
     OFFLINE = "Offline"
 
 
-class PageserverSchedulingPolicy(str, Enum):
+class PageserverSchedulingPolicy(StrEnum):
     ACTIVE = "Active"
     DRAINING = "Draining"
     FILLING = "Filling"
@@ -1584,7 +1581,7 @@ class PageserverSchedulingPolicy(str, Enum):
     PAUSE_FOR_RESTART = "PauseForRestart"
 
 
-class StorageControllerLeadershipStatus(str, Enum):
+class StorageControllerLeadershipStatus(StrEnum):
     LEADER = "leader"
     STEPPED_DOWN = "stepped_down"
     CANDIDATE = "candidate"
@@ -1602,9 +1599,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
     def start(
         self,
-        timeout_in_seconds: Optional[int] = None,
-        instance_id: Optional[int] = None,
-        base_port: Optional[int] = None,
+        timeout_in_seconds: int | None = None,
+        instance_id: int | None = None,
+        base_port: int | None = None,
     ):
         assert not self.running
         self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
@@ -1673,7 +1670,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
         return resp
 
-    def headers(self, scope: Optional[TokenScope]) -> dict[str, str]:
+    def headers(self, scope: TokenScope | None) -> dict[str, str]:
         headers = {}
         if self.auth_enabled and scope is not None:
             jwt_token = self.env.auth_keys.generate_token(scope=scope)
@@ -1711,9 +1708,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
     def attach_hook_issue(
         self,
-        tenant_shard_id: Union[TenantId, TenantShardId],
+        tenant_shard_id: TenantId | TenantShardId,
         pageserver_id: int,
-        generation_override: Optional[int] = None,
+        generation_override: int | None = None,
     ) -> int:
         body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}
         if generation_override is not None:
@@ -1729,7 +1726,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         assert isinstance(gen, int)
         return gen
 
-    def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
+    def attach_hook_drop(self, tenant_shard_id: TenantId | TenantShardId):
         self.request(
             "POST",
             f"{self.api}/debug/v1/attach-hook",
@@ -1737,7 +1734,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
             headers=self.headers(TokenScope.ADMIN),
         )
 
-    def inspect(self, tenant_shard_id: Union[TenantId, TenantShardId]) -> Optional[tuple[int, int]]:
+    def inspect(self, tenant_shard_id: TenantId | TenantShardId) -> tuple[int, int] | None:
         """
         :return: 2-tuple of (generation, pageserver id), or None if unknown
         """
@@ -1857,10 +1854,10 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def tenant_create(
         self,
         tenant_id: TenantId,
-        shard_count: Optional[int] = None,
-        shard_stripe_size: Optional[int] = None,
-        tenant_config: Optional[dict[Any, Any]] = None,
-        placement_policy: Optional[Union[dict[Any, Any], str]] = None,
+        shard_count: int | None = None,
+        shard_stripe_size: int | None = None,
+        tenant_config: dict[Any, Any] | None = None,
+        placement_policy: dict[Any, Any] | str | None = None,
     ):
         """
         Use this rather than pageserver_api() when you need to include shard parameters
@@ -1941,7 +1938,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         return response.json()
 
     def tenant_shard_split(
-        self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None
+        self, tenant_id: TenantId, shard_count: int, shard_stripe_size: int | None = None
     ) -> list[TenantShardId]:
         response = self.request(
             "PUT",
@@ -2039,8 +2036,8 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def poll_node_status(
         self,
         node_id: int,
-        desired_availability: Optional[PageserverAvailability],
-        desired_scheduling_policy: Optional[PageserverSchedulingPolicy],
+        desired_availability: PageserverAvailability | None,
+        desired_scheduling_policy: PageserverSchedulingPolicy | None,
         max_attempts: int,
         backoff: float,
     ):
@@ -2259,7 +2256,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
             json=body,
         )
 
-    def get_safekeeper(self, id: int) -> Optional[dict[str, Any]]:
+    def get_safekeeper(self, id: int) -> dict[str, Any] | None:
         try:
             response = self.request(
                 "GET",
@@ -2290,9 +2287,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc: Optional[BaseException],
-        tb: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        tb: TracebackType | None,
     ):
         self.stop(immediate=True)
 
@@ -2304,9 +2301,9 @@ class NeonProxiedStorageController(NeonStorageController):
 
     def start(
         self,
-        timeout_in_seconds: Optional[int] = None,
-        instance_id: Optional[int] = None,
-        base_port: Optional[int] = None,
+        timeout_in_seconds: int | None = None,
+        instance_id: int | None = None,
+        base_port: int | None = None,
     ):
         assert instance_id is not None and base_port is not None
 
@@ -2317,7 +2314,7 @@ class NeonProxiedStorageController(NeonStorageController):
         return self
 
     def stop_instance(
-        self, immediate: bool = False, instance_id: Optional[int] = None
+        self, immediate: bool = False, instance_id: int | None = None
     ) -> NeonStorageController:
         assert instance_id in self.instances
         if self.instances[instance_id]["running"]:
@@ -2346,7 +2343,7 @@ class NeonProxiedStorageController(NeonStorageController):
 
     def log_contains(
         self, pattern: str, offset: None | LogCursor = None
-    ) -> Optional[tuple[str, LogCursor]]:
+    ) -> tuple[str, LogCursor] | None:
         raise NotImplementedError()
 
 
@@ -2393,8 +2390,8 @@ class NeonPageserver(PgProtocol, LogUtils):
 
     def timeline_dir(
         self,
-        tenant_shard_id: Union[TenantId, TenantShardId],
-        timeline_id: Optional[TimelineId] = None,
+        tenant_shard_id: TenantId | TenantShardId,
+        timeline_id: TimelineId | None = None,
     ) -> Path:
         """Get a timeline directory's path based on the repo directory of the test environment"""
         if timeline_id is None:
@@ -2403,7 +2400,7 @@ class NeonPageserver(PgProtocol, LogUtils):
 
     def tenant_dir(
         self,
-        tenant_shard_id: Optional[Union[TenantId, TenantShardId]] = None,
+        tenant_shard_id: TenantId | TenantShardId | None = None,
     ) -> Path:
         """Get a tenant directory's path based on the repo directory of the test environment"""
         if tenant_shard_id is None:
@@ -2447,8 +2444,8 @@ class NeonPageserver(PgProtocol, LogUtils):
 
     def start(
         self,
-        extra_env_vars: Optional[dict[str, str]] = None,
-        timeout_in_seconds: Optional[int] = None,
+        extra_env_vars: dict[str, str] | None = None,
+        timeout_in_seconds: int | None = None,
     ) -> NeonPageserver:
         """
         Start the page server.
@@ -2497,7 +2494,7 @@ class NeonPageserver(PgProtocol, LogUtils):
     def restart(
         self,
         immediate: bool = False,
-        timeout_in_seconds: Optional[int] = None,
+        timeout_in_seconds: int | None = None,
     ):
         """
         High level wrapper for restart: restarts the process, and waits for
@@ -2537,9 +2534,9 @@ class NeonPageserver(PgProtocol, LogUtils):
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc: Optional[BaseException],
-        tb: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        tb: TracebackType | None,
     ):
         self.stop(immediate=True)
 
@@ -2548,7 +2545,7 @@ class NeonPageserver(PgProtocol, LogUtils):
             pytest.skip("pageserver was built without 'testing' feature")
 
     def http_client(
-        self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
+        self, auth_token: str | None = None, retries: Retry | None = None
     ) -> PageserverHttpClient:
         return PageserverHttpClient(
             port=self.service_port.http,
@@ -2585,7 +2582,7 @@ class NeonPageserver(PgProtocol, LogUtils):
         self,
         tenant_id: TenantId,
         config: None | dict[str, Any] = None,
-        generation: Optional[int] = None,
+        generation: int | None = None,
         override_storage_controller_generation: bool = False,
     ):
         """
@@ -2619,7 +2616,7 @@ class NeonPageserver(PgProtocol, LogUtils):
         return client.tenant_location_conf(tenant_id, config, **kwargs)
 
     def read_tenant_location_conf(
-        self, tenant_shard_id: Union[TenantId, TenantShardId]
+        self, tenant_shard_id: TenantId | TenantShardId
     ) -> dict[str, Any]:
         path = self.tenant_dir(tenant_shard_id) / "config-v1"
         log.info(f"Reading location conf from {path}")
@@ -2634,9 +2631,9 @@ class NeonPageserver(PgProtocol, LogUtils):
     def tenant_create(
         self,
         tenant_id: TenantId,
-        conf: Optional[dict[str, Any]] = None,
-        auth_token: Optional[str] = None,
-        generation: Optional[int] = None,
+        conf: dict[str, Any] | None = None,
+        auth_token: str | None = None,
+        generation: int | None = None,
     ) -> TenantId:
         if generation is None:
             generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
@@ -2656,7 +2653,7 @@ class NeonPageserver(PgProtocol, LogUtils):
         return tenant_id
 
     def list_layers(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId
     ) -> list[Path]:
         """
         Inspect local storage on a pageserver to discover which layer files are present.
@@ -2749,7 +2746,7 @@ class PgBin:
         if "/" not in str(command[0]):
             command[0] = str(self.pg_bin_path / command[0])
 
-    def _build_env(self, env_add: Optional[Env]) -> Env:
+    def _build_env(self, env_add: Env | None) -> Env:
         if env_add is None:
             return self.env
         env = self.env.copy()
@@ -2766,8 +2763,8 @@ class PgBin:
     def run_nonblocking(
         self,
         command: list[str],
-        env: Optional[Env] = None,
-        cwd: Optional[Union[str, Path]] = None,
+        env: Env | None = None,
+        cwd: str | Path | None = None,
     ) -> subprocess.Popen[Any]:
         """
         Run one of the postgres binaries, not waiting for it to finish
@@ -2790,8 +2787,8 @@ class PgBin:
     def run(
         self,
         command: list[str],
-        env: Optional[Env] = None,
-        cwd: Optional[Union[str, Path]] = None,
+        env: Env | None = None,
+        cwd: str | Path | None = None,
     ) -> None:
         """
         Run one of the postgres binaries, waiting for it to finish
@@ -2813,8 +2810,8 @@ class PgBin:
     def run_capture(
         self,
         command: list[str],
-        env: Optional[Env] = None,
-        cwd: Optional[str] = None,
+        env: Env | None = None,
+        cwd: str | None = None,
         with_command_header=True,
         **popen_kwargs: Any,
     ) -> str:
@@ -2941,7 +2938,7 @@ class VanillaPostgres(PgProtocol):
             conf_file.write("\n".join(hba) + "\n")
             conf_file.write(data)
 
-    def start(self, log_path: Optional[str] = None):
+    def start(self, log_path: str | None = None):
         assert not self.running
         self.running = True
 
@@ -2965,9 +2962,9 @@ class VanillaPostgres(PgProtocol):
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc: Optional[BaseException],
-        tb: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        tb: TracebackType | None,
     ):
         if self.running:
             self.stop()
@@ -3014,9 +3011,9 @@ class RemotePostgres(PgProtocol):
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc: Optional[BaseException],
-        tb: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        tb: TracebackType | None,
     ):
         # do nothing
         pass
@@ -3092,7 +3089,7 @@ class PSQL:
         self.path = full_path
         self.database_url = f"postgres://{host}:{port}/main?options=project%3Dgeneric-project-name"
 
-    async def run(self, query: Optional[str] = None) -> asyncio.subprocess.Process:
+    async def run(self, query: str | None = None) -> asyncio.subprocess.Process:
         run_args = [self.path, "--no-psqlrc", "--quiet", "--tuples-only", self.database_url]
         if query is not None:
             run_args += ["--command", query]
@@ -3138,7 +3135,7 @@ class NeonProxy(PgProtocol):
         """All auth backends must inherit from this class"""
 
         @property
-        def default_conn_url(self) -> Optional[str]:
+        def default_conn_url(self) -> str | None:
             return None
 
         @abc.abstractmethod
@@ -3155,7 +3152,7 @@ class NeonProxy(PgProtocol):
             ]
 
     class Console(AuthBackend):
-        def __init__(self, endpoint: str, fixed_rate_limit: Optional[int] = None):
+        def __init__(self, endpoint: str, fixed_rate_limit: int | None = None):
             self.endpoint = endpoint
             self.fixed_rate_limit = fixed_rate_limit
 
@@ -3183,7 +3180,7 @@ class NeonProxy(PgProtocol):
         pg_conn_url: str
 
         @property
-        def default_conn_url(self) -> Optional[str]:
+        def default_conn_url(self) -> str | None:
             return self.pg_conn_url
 
         def extra_args(self) -> list[str]:
@@ -3202,8 +3199,8 @@ class NeonProxy(PgProtocol):
         mgmt_port: int,
         external_http_port: int,
         auth_backend: NeonProxy.AuthBackend,
-        metric_collection_endpoint: Optional[str] = None,
-        metric_collection_interval: Optional[str] = None,
+        metric_collection_endpoint: str | None = None,
+        metric_collection_interval: str | None = None,
     ):
         host = "127.0.0.1"
         domain = "proxy.localtest.me"  # resolves to 127.0.0.1
@@ -3221,7 +3218,7 @@ class NeonProxy(PgProtocol):
         self.metric_collection_endpoint = metric_collection_endpoint
         self.metric_collection_interval = metric_collection_interval
         self.http_timeout_seconds = 15
-        self._popen: Optional[subprocess.Popen[bytes]] = None
+        self._popen: subprocess.Popen[bytes] | None = None
 
     def start(self) -> NeonProxy:
         assert self._popen is None
@@ -3356,9 +3353,9 @@ class NeonProxy(PgProtocol):
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc: Optional[BaseException],
-        tb: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        tb: TracebackType | None,
     ):
         if self._popen is not None:
             self._popen.terminate()
@@ -3439,7 +3436,7 @@ class NeonAuthBroker:
         self.mgmt_port = mgmt_port
         self.auth_backend = auth_backend
         self.http_timeout_seconds = 15
-        self._popen: Optional[subprocess.Popen[bytes]] = None
+        self._popen: subprocess.Popen[bytes] | None = None
 
     def start(self) -> NeonAuthBroker:
         assert self._popen is None
@@ -3515,9 +3512,9 @@ class NeonAuthBroker:
 
     def __exit__(
         self,
-        _exc_type: Optional[type[BaseException]],
-        _exc_value: Optional[BaseException],
-        _traceback: Optional[TracebackType],
+        _exc_type: type[BaseException] | None,
+        _exc_value: BaseException | None,
+        _traceback: TracebackType | None,
     ):
         if self._popen is not None:
             self._popen.terminate()
@@ -3673,9 +3670,9 @@ class Endpoint(PgProtocol, LogUtils):
     ):
         super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres")
         self.env = env
-        self.branch_name: Optional[str] = None  # dubious
-        self.endpoint_id: Optional[str] = None  # dubious, see asserts below
-        self.pgdata_dir: Optional[Path] = None  # Path to computenode PGDATA
+        self.branch_name: str | None = None  # dubious
+        self.endpoint_id: str | None = None  # dubious, see asserts below
+        self.pgdata_dir: Path | None = None  # Path to computenode PGDATA
         self.tenant_id = tenant_id
         self.pg_port = pg_port
         self.http_port = http_port
@@ -3692,7 +3689,7 @@ class Endpoint(PgProtocol, LogUtils):
         self._running = threading.Semaphore(0)
 
     def http_client(
-        self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
+        self, auth_token: str | None = None, retries: Retry | None = None
     ) -> EndpointHttpClient:
         return EndpointHttpClient(
             port=self.http_port,
@@ -3701,11 +3698,11 @@ class Endpoint(PgProtocol, LogUtils):
     def create(
         self,
         branch_name: str,
-        endpoint_id: Optional[str] = None,
+        endpoint_id: str | None = None,
         hot_standby: bool = False,
-        lsn: Optional[Lsn] = None,
-        config_lines: Optional[list[str]] = None,
-        pageserver_id: Optional[int] = None,
+        lsn: Lsn | None = None,
+        config_lines: list[str] | None = None,
+        pageserver_id: int | None = None,
         allow_multiple: bool = False,
     ) -> Endpoint:
         """
@@ -3748,11 +3745,11 @@ class Endpoint(PgProtocol, LogUtils):
 
     def start(
         self,
-        remote_ext_config: Optional[str] = None,
-        pageserver_id: Optional[int] = None,
-        safekeepers: Optional[list[int]] = None,
+        remote_ext_config: str | None = None,
+        pageserver_id: int | None = None,
+        safekeepers: list[int] | None = None,
         allow_multiple: bool = False,
-        basebackup_request_tries: Optional[int] = None,
+        basebackup_request_tries: int | None = None,
     ) -> Endpoint:
         """
         Start the Postgres instance.
@@ -3828,9 +3825,7 @@ class Endpoint(PgProtocol, LogUtils):
     def is_running(self):
         return self._running._value > 0
 
-    def reconfigure(
-        self, pageserver_id: Optional[int] = None, safekeepers: Optional[list[int]] = None
-    ):
+    def reconfigure(self, pageserver_id: int | None = None, safekeepers: list[int] | None = None):
         assert self.endpoint_id is not None
         # If `safekeepers` is not None, they are remember them as active and use
         # in the following commands.
@@ -3877,7 +3872,7 @@ class Endpoint(PgProtocol, LogUtils):
     def stop(
         self,
         mode: str = "fast",
-        sks_wait_walreceiver_gone: Optional[tuple[list[Safekeeper], TimelineId]] = None,
+        sks_wait_walreceiver_gone: tuple[list[Safekeeper], TimelineId] | None = None,
     ) -> Endpoint:
         """
         Stop the Postgres instance if it's running.
@@ -3931,14 +3926,14 @@ class Endpoint(PgProtocol, LogUtils):
     def create_start(
         self,
         branch_name: str,
-        endpoint_id: Optional[str] = None,
+        endpoint_id: str | None = None,
         hot_standby: bool = False,
-        lsn: Optional[Lsn] = None,
-        config_lines: Optional[list[str]] = None,
-        remote_ext_config: Optional[str] = None,
-        pageserver_id: Optional[int] = None,
+        lsn: Lsn | None = None,
+        config_lines: list[str] | None = None,
+        remote_ext_config: str | None = None,
+        pageserver_id: int | None = None,
         allow_multiple: bool = False,
-        basebackup_request_tries: Optional[int] = None,
+        basebackup_request_tries: int | None = None,
     ) -> Endpoint:
         """
         Create an endpoint, apply config, and start Postgres.
@@ -3967,9 +3962,9 @@ class Endpoint(PgProtocol, LogUtils):
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc: Optional[BaseException],
-        tb: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        tb: TracebackType | None,
     ):
         self.stop()
 
@@ -3980,7 +3975,7 @@ class Endpoint(PgProtocol, LogUtils):
         assert self.pgdata_dir is not None  # please mypy
         return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024
 
-    def clear_shared_buffers(self, cursor: Optional[Any] = None):
+    def clear_shared_buffers(self, cursor: Any | None = None):
         """
         Best-effort way to clear postgres buffers. Pinned buffers will not be 'cleared.'
 
@@ -4003,14 +3998,14 @@ class EndpointFactory:
     def create_start(
         self,
         branch_name: str,
-        endpoint_id: Optional[str] = None,
-        tenant_id: Optional[TenantId] = None,
-        lsn: Optional[Lsn] = None,
+        endpoint_id: str | None = None,
+        tenant_id: TenantId | None = None,
+        lsn: Lsn | None = None,
         hot_standby: bool = False,
-        config_lines: Optional[list[str]] = None,
-        remote_ext_config: Optional[str] = None,
-        pageserver_id: Optional[int] = None,
-        basebackup_request_tries: Optional[int] = None,
+        config_lines: list[str] | None = None,
+        remote_ext_config: str | None = None,
+        pageserver_id: int | None = None,
+        basebackup_request_tries: int | None = None,
     ) -> Endpoint:
         ep = Endpoint(
             self.env,
@@ -4035,12 +4030,12 @@ class EndpointFactory:
     def create(
         self,
         branch_name: str,
-        endpoint_id: Optional[str] = None,
-        tenant_id: Optional[TenantId] = None,
-        lsn: Optional[Lsn] = None,
+        endpoint_id: str | None = None,
+        tenant_id: TenantId | None = None,
+        lsn: Lsn | None = None,
         hot_standby: bool = False,
-        config_lines: Optional[list[str]] = None,
-        pageserver_id: Optional[int] = None,
+        config_lines: list[str] | None = None,
+        pageserver_id: int | None = None,
     ) -> Endpoint:
         ep = Endpoint(
             self.env,
@@ -4078,7 +4073,7 @@ class EndpointFactory:
         return self
 
     def new_replica(
-        self, origin: Endpoint, endpoint_id: str, config_lines: Optional[list[str]] = None
+        self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None
     ):
         branch_name = origin.branch_name
         assert origin in self.endpoints
@@ -4094,7 +4089,7 @@ class EndpointFactory:
         )
 
     def new_replica_start(
-        self, origin: Endpoint, endpoint_id: str, config_lines: Optional[list[str]] = None
+        self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None
     ):
         branch_name = origin.branch_name
         assert origin in self.endpoints
@@ -4132,7 +4127,7 @@ class Safekeeper(LogUtils):
         port: SafekeeperPort,
         id: int,
         running: bool = False,
-        extra_opts: Optional[list[str]] = None,
+        extra_opts: list[str] | None = None,
     ):
         self.env = env
         self.port = port
@@ -4158,7 +4153,7 @@ class Safekeeper(LogUtils):
         self.extra_opts = extra_opts
 
     def start(
-        self, extra_opts: Optional[list[str]] = None, timeout_in_seconds: Optional[int] = None
+        self, extra_opts: list[str] | None = None, timeout_in_seconds: int | None = None
     ) -> Safekeeper:
         if extra_opts is None:
             # Apply either the extra_opts passed in, or the ones from our constructor: we do not merge the two.
@@ -4238,7 +4233,7 @@ class Safekeeper(LogUtils):
                 return res
 
     def http_client(
-        self, auth_token: Optional[str] = None, gen_sk_wide_token: bool = True
+        self, auth_token: str | None = None, gen_sk_wide_token: bool = True
     ) -> SafekeeperHttpClient:
         """
         When auth_token is None but gen_sk_wide is True creates safekeeper wide
@@ -4371,7 +4366,7 @@ class NeonBroker(LogUtils):
 
     def start(
         self,
-        timeout_in_seconds: Optional[int] = None,
+        timeout_in_seconds: int | None = None,
     ):
         assert not self.running
         self.env.neon_cli.storage_broker_start(timeout_in_seconds)
@@ -4394,8 +4389,7 @@ class NeonBroker(LogUtils):
         assert_no_errors(self.logfile, "storage_controller", [])
 
 
-# TODO: Replace with `StrEnum` when we upgrade to python 3.11
-class NodeKind(str, Enum):
+class NodeKind(StrEnum):
     PAGESERVER = "pageserver"
     SAFEKEEPER = "safekeeper"
 
@@ -4406,7 +4400,7 @@ class StorageScrubber:
         self.log_dir = log_dir
 
     def scrubber_cli(
-        self, args: list[str], timeout, extra_env: Optional[dict[str, str]] = None
+        self, args: list[str], timeout, extra_env: dict[str, str] | None = None
     ) -> str:
         assert isinstance(self.env.pageserver_remote_storage, S3Storage)
         s3_storage = self.env.pageserver_remote_storage
@@ -4469,8 +4463,8 @@ class StorageScrubber:
         self,
         post_to_storage_controller: bool = False,
         node_kind: NodeKind = NodeKind.PAGESERVER,
-        timeline_lsns: Optional[list[dict[str, Any]]] = None,
-        extra_env: Optional[dict[str, str]] = None,
+        timeline_lsns: list[dict[str, Any]] | None = None,
+        extra_env: dict[str, str] | None = None,
     ) -> tuple[bool, Any]:
         """
         Returns the health status and the metadata summary.
@@ -4504,8 +4498,8 @@ class StorageScrubber:
     def pageserver_physical_gc(
         self,
         min_age_secs: int,
-        tenant_ids: Optional[list[TenantId]] = None,
-        mode: Optional[str] = None,
+        tenant_ids: list[TenantId] | None = None,
+        mode: str | None = None,
     ):
         args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"]
 
@@ -4619,7 +4613,7 @@ def check_restored_datadir_content(
     test_output_dir: Path,
     env: NeonEnv,
     endpoint: Endpoint,
-    ignored_files: Optional[list[str]] = None,
+    ignored_files: list[str] | None = None,
 ):
     pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
 
@@ -4721,7 +4715,7 @@ def logical_replication_sync(subscriber: PgProtocol, publisher: PgProtocol) -> L
 
 
 def tenant_get_shards(
-    env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] = None
+    env: NeonEnv, tenant_id: TenantId, pageserver_id: int | None = None
 ) -> list[tuple[TenantShardId, NeonPageserver]]:
     """
     Helper for when you want to talk to one or more pageservers, and the
@@ -4784,8 +4778,8 @@ def wait_for_last_flush_lsn(
     endpoint: Endpoint,
     tenant: TenantId,
     timeline: TimelineId,
-    pageserver_id: Optional[int] = None,
-    auth_token: Optional[str] = None,
+    pageserver_id: int | None = None,
+    auth_token: str | None = None,
 ) -> Lsn:
     """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
 
@@ -4814,7 +4808,7 @@ def flush_ep_to_pageserver(
     ep: Endpoint,
     tenant: TenantId,
     timeline: TimelineId,
-    pageserver_id: Optional[int] = None,
+    pageserver_id: int | None = None,
 ) -> Lsn:
     """
     Stop endpoint and wait until all committed WAL reaches the pageserver
@@ -4857,7 +4851,7 @@ def wait_for_wal_insert_lsn(
     endpoint: Endpoint,
     tenant: TenantId,
     timeline: TimelineId,
-    pageserver_id: Optional[int] = None,
+    pageserver_id: int | None = None,
 ) -> Lsn:
     """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
     last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0])
@@ -4878,7 +4872,7 @@ def fork_at_current_lsn(
     endpoint: Endpoint,
     new_branch_name: str,
     ancestor_branch_name: str,
-    tenant_id: Optional[TenantId] = None,
+    tenant_id: TenantId | None = None,
 ) -> TimelineId:
     """
     Create new branch at the last LSN of an existing branch.
@@ -4951,8 +4945,8 @@ def last_flush_lsn_upload(
     endpoint: Endpoint,
     tenant_id: TenantId,
     timeline_id: TimelineId,
-    pageserver_id: Optional[int] = None,
-    auth_token: Optional[str] = None,
+    pageserver_id: int | None = None,
+    auth_token: str | None = None,
 ) -> Lsn:
     """
     Wait for pageserver to catch to the latest flush LSN of given endpoint,
@@ -4987,9 +4981,9 @@ def generate_uploads_and_deletions(
     env: NeonEnv,
     *,
     init: bool = True,
-    tenant_id: Optional[TenantId] = None,
-    timeline_id: Optional[TimelineId] = None,
-    data: Optional[str] = None,
+    tenant_id: TenantId | None = None,
+    timeline_id: TimelineId | None = None,
+    data: str | None = None,
     pageserver: NeonPageserver,
 ):
     """
diff --git a/test_runner/fixtures/pageserver/common_types.py b/test_runner/fixtures/pageserver/common_types.py
index 2319701e0b..0e068db593 100644
--- a/test_runner/fixtures/pageserver/common_types.py
+++ b/test_runner/fixtures/pageserver/common_types.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import re
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING
 
 from fixtures.common_types import KEY_MAX, KEY_MIN, Key, Lsn
 
@@ -46,7 +46,7 @@ class DeltaLayerName:
         return ret
 
 
-LayerName = Union[ImageLayerName, DeltaLayerName]
+LayerName = ImageLayerName | DeltaLayerName
 
 
 class InvalidFileName(Exception):
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 01583757fa..98330ba350 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -4,7 +4,7 @@ import time
 from collections import defaultdict
 from dataclasses import dataclass
 from datetime import datetime
-from typing import TYPE_CHECKING, Any
+from typing import Any
 
 import requests
 from requests.adapters import HTTPAdapter
@@ -16,9 +16,6 @@ from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pg_version import PgVersion
 from fixtures.utils import Fn
 
-if TYPE_CHECKING:
-    from typing import Optional, Union
-
 
 class PageserverApiException(Exception):
     def __init__(self, message, status_code: int):
@@ -43,7 +40,7 @@ class TimelineCreate409(PageserverApiException):
 class InMemoryLayerInfo:
     kind: str
     lsn_start: str
-    lsn_end: Optional[str]
+    lsn_end: str | None
 
     @classmethod
     def from_json(cls, d: dict[str, Any]) -> InMemoryLayerInfo:
@@ -60,10 +57,10 @@ class HistoricLayerInfo:
     layer_file_name: str
     layer_file_size: int
     lsn_start: str
-    lsn_end: Optional[str]
+    lsn_end: str | None
     remote: bool
     # None for image layers, true if pageserver thinks this is an L0 delta layer
-    l0: Optional[bool]
+    l0: bool | None
     visible: bool
 
     @classmethod
@@ -180,8 +177,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         self,
         port: int,
         is_testing_enabled_or_skip: Fn,
-        auth_token: Optional[str] = None,
-        retries: Optional[Retry] = None,
+        auth_token: str | None = None,
+        retries: Retry | None = None,
     ):
         super().__init__()
         self.port = port
@@ -278,7 +275,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def tenant_attach(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         generation: int,
         config: None | dict[str, Any] = None,
     ):
@@ -305,7 +302,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
             },
         )
 
-    def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool):
+    def tenant_reset(self, tenant_id: TenantId | TenantShardId, drop_cache: bool):
         params = {}
         if drop_cache:
             params["drop_cache"] = "true"
@@ -315,10 +312,10 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def tenant_location_conf(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         location_conf: dict[str, Any],
         flush_ms=None,
-        lazy: Optional[bool] = None,
+        lazy: bool | None = None,
     ):
         body = location_conf.copy()
 
@@ -353,13 +350,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         self.verbose_error(res)
         return res.json()
 
-    def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]):
+    def tenant_delete(self, tenant_id: TenantId | TenantShardId):
         res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
         self.verbose_error(res)
         return res
 
     def tenant_status(
-        self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False
+        self, tenant_id: TenantId | TenantShardId, activate: bool = False
     ) -> dict[Any, Any]:
         """
         :activate: hint the server not to accelerate activation of this tenant in response
@@ -378,17 +375,17 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         assert isinstance(res_json, dict)
         return res_json
 
-    def tenant_config(self, tenant_id: Union[TenantId, TenantShardId]) -> TenantConfig:
+    def tenant_config(self, tenant_id: TenantId | TenantShardId) -> TenantConfig:
         res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config")
         self.verbose_error(res)
         return TenantConfig.from_json(res.json())
 
-    def tenant_heatmap_upload(self, tenant_id: Union[TenantId, TenantShardId]):
+    def tenant_heatmap_upload(self, tenant_id: TenantId | TenantShardId):
         res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
         self.verbose_error(res)
 
     def tenant_secondary_download(
-        self, tenant_id: Union[TenantId, TenantShardId], wait_ms: Optional[int] = None
+        self, tenant_id: TenantId | TenantShardId, wait_ms: int | None = None
     ) -> tuple[int, dict[Any, Any]]:
         url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download"
         if wait_ms is not None:
@@ -397,13 +394,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         self.verbose_error(res)
         return (res.status_code, res.json())
 
-    def tenant_secondary_status(self, tenant_id: Union[TenantId, TenantShardId]):
+    def tenant_secondary_status(self, tenant_id: TenantId | TenantShardId):
         url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/status"
         res = self.get(url)
         self.verbose_error(res)
         return res.json()
 
-    def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]):
+    def set_tenant_config(self, tenant_id: TenantId | TenantShardId, config: dict[str, Any]):
         """
         Only use this via storage_controller.pageserver_api().
 
@@ -420,8 +417,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
     def patch_tenant_config_client_side(
         self,
         tenant_id: TenantId,
-        inserts: Optional[dict[str, Any]] = None,
-        removes: Optional[list[str]] = None,
+        inserts: dict[str, Any] | None = None,
+        removes: list[str] | None = None,
     ):
         """
         Only use this via storage_controller.pageserver_api().
@@ -436,11 +433,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
                 del current[key]
         self.set_tenant_config(tenant_id, current)
 
-    def tenant_size(self, tenant_id: Union[TenantId, TenantShardId]) -> int:
+    def tenant_size(self, tenant_id: TenantId | TenantShardId) -> int:
         return self.tenant_size_and_modelinputs(tenant_id)[0]
 
     def tenant_size_and_modelinputs(
-        self, tenant_id: Union[TenantId, TenantShardId]
+        self, tenant_id: TenantId | TenantShardId
     ) -> tuple[int, dict[str, Any]]:
         """
         Returns the tenant size, together with the model inputs as the second tuple item.
@@ -456,7 +453,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         assert isinstance(inputs, dict)
         return (size, inputs)
 
-    def tenant_size_debug(self, tenant_id: Union[TenantId, TenantShardId]) -> str:
+    def tenant_size_debug(self, tenant_id: TenantId | TenantShardId) -> str:
         """
         Returns the tenant size debug info, as an HTML string
         """
@@ -468,10 +465,10 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def tenant_time_travel_remote_storage(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timestamp: datetime,
         done_if_after: datetime,
-        shard_counts: Optional[list[int]] = None,
+        shard_counts: list[int] | None = None,
     ):
         """
         Issues a request to perform time travel operations on the remote storage
@@ -490,7 +487,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def timeline_list(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         include_non_incremental_logical_size: bool = False,
         include_timeline_dir_layer_file_size_sum: bool = False,
     ) -> list[dict[str, Any]]:
@@ -510,7 +507,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def timeline_and_offloaded_list(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
     ) -> TimelinesInfoAndOffloaded:
         res = self.get(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline_and_offloaded",
@@ -523,11 +520,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
     def timeline_create(
         self,
         pg_version: PgVersion,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         new_timeline_id: TimelineId,
-        ancestor_timeline_id: Optional[TimelineId] = None,
-        ancestor_start_lsn: Optional[Lsn] = None,
-        existing_initdb_timeline_id: Optional[TimelineId] = None,
+        ancestor_timeline_id: TimelineId | None = None,
+        ancestor_start_lsn: Lsn | None = None,
+        existing_initdb_timeline_id: TimelineId | None = None,
         **kwargs,
     ) -> dict[Any, Any]:
         body: dict[str, Any] = {
@@ -558,7 +555,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def timeline_detail(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
         include_non_incremental_logical_size: bool = False,
         include_timeline_dir_layer_file_size_sum: bool = False,
@@ -584,7 +581,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         return res_json
 
     def timeline_delete(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, **kwargs
+        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, **kwargs
     ):
         """
         Note that deletion is not instant, it is scheduled and performed mostly in the background.
@@ -600,9 +597,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def timeline_gc(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
-        gc_horizon: Optional[int],
+        gc_horizon: int | None,
     ) -> dict[str, Any]:
         """
         Unlike most handlers, this will wait for the layers to be actually
@@ -624,16 +621,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         assert isinstance(res_json, dict)
         return res_json
 
-    def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId):
+    def timeline_block_gc(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId):
         res = self.post(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc",
         )
         log.info(f"Got GC request response code: {res.status_code}")
         self.verbose_error(res)
 
-    def timeline_unblock_gc(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
-    ):
+    def timeline_unblock_gc(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId):
         res = self.post(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc",
         )
@@ -642,7 +637,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def timeline_offload(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
     ):
         self.is_testing_enabled_or_skip()
@@ -658,14 +653,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def timeline_compact(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
         force_repartition=False,
         force_image_layer_creation=False,
         force_l0_compaction=False,
         wait_until_uploaded=False,
         enhanced_gc_bottom_most_compaction=False,
-        body: Optional[dict[str, Any]] = None,
+        body: dict[str, Any] | None = None,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
@@ -692,7 +687,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         assert res_json is None
 
     def timeline_preserve_initdb_archive(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId
     ):
         log.info(
             f"Requesting initdb archive preservation for tenant {tenant_id} and timeline {timeline_id}"
@@ -704,7 +699,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def timeline_archival_config(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
         state: TimelineArchivalState,
     ):
@@ -720,7 +715,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def timeline_get_lsn_by_timestamp(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
         timestamp: datetime,
         with_lease: bool = False,
@@ -739,7 +734,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         return res_json
 
     def timeline_lsn_lease(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn
+        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
     ):
         data = {
             "lsn": str(lsn),
@@ -755,7 +750,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         return res_json
 
     def timeline_get_timestamp_of_lsn(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn
+        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
     ):
         log.info(f"Requesting time range of lsn {lsn}, tenant {tenant_id}, timeline {timeline_id}")
         res = self.get(
@@ -765,9 +760,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res_json = res.json()
         return res_json
 
-    def timeline_layer_map_info(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
-    ):
+    def timeline_layer_map_info(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId):
         log.info(f"Requesting layer map info of tenant {tenant_id}, timeline {timeline_id}")
         res = self.get(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer",
@@ -778,13 +771,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def timeline_checkpoint(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
         force_repartition=False,
         force_image_layer_creation=False,
         force_l0_compaction=False,
         wait_until_uploaded=False,
-        compact: Optional[bool] = None,
+        compact: bool | None = None,
         **kwargs,
     ):
         self.is_testing_enabled_or_skip()
@@ -814,7 +807,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def timeline_spawn_download_remote_layers(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
         max_concurrent_downloads: int,
     ) -> dict[str, Any]:
@@ -833,7 +826,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def timeline_poll_download_remote_layers_status(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
         spawn_response: dict[str, Any],
         poll_state=None,
@@ -855,7 +848,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def timeline_download_remote_layers(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
         max_concurrent_downloads: int,
         errors_ok=False,
@@ -905,7 +898,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         timeline_id: TimelineId,
         file_kind: str,
         op_kind: str,
-    ) -> Optional[int]:
+    ) -> int | None:
         metrics = [
             "pageserver_remote_timeline_client_calls_started_total",
             "pageserver_remote_timeline_client_calls_finished_total",
@@ -929,7 +922,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def layer_map_info(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
     ) -> LayerMapInfo:
         res = self.get(
@@ -939,7 +932,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         return LayerMapInfo.from_json(res.json())
 
     def timeline_layer_scan_disposable_keys(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
+        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str
     ) -> ScanDisposableKeysResponse:
         res = self.post(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}/scan_disposable_keys",
@@ -949,7 +942,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         return ScanDisposableKeysResponse.from_json(res.json())
 
     def download_layer(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
+        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str
     ):
         res = self.get(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
@@ -958,9 +951,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
         assert res.status_code == 200
 
-    def download_all_layers(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
-    ):
+    def download_all_layers(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId):
         info = self.layer_map_info(tenant_id, timeline_id)
         for layer in info.historic_layers:
             if not layer.remote:
@@ -969,9 +960,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def detach_ancestor(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
-        batch_size: Optional[int] = None,
+        batch_size: int | None = None,
         **kwargs,
     ) -> set[TimelineId]:
         params = {}
@@ -987,7 +978,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         return set(map(TimelineId, json["reparented_timelines"]))
 
     def evict_layer(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
+        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, layer_name: str
     ):
         res = self.delete(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
@@ -996,7 +987,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
         assert res.status_code in (200, 304)
 
-    def evict_all_layers(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId):
+    def evict_all_layers(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId):
         info = self.layer_map_info(tenant_id, timeline_id)
         for layer in info.historic_layers:
             self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
@@ -1009,7 +1000,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         self.verbose_error(res)
         return res.json()
 
-    def tenant_break(self, tenant_id: Union[TenantId, TenantShardId]):
+    def tenant_break(self, tenant_id: TenantId | TenantShardId):
         res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
         self.verbose_error(res)
 
@@ -1058,7 +1049,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def perf_info(
         self,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
     ):
         self.is_testing_enabled_or_skip()
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index 37b4246d40..b6d19af84c 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -13,7 +13,8 @@ from fixtures.neon_fixtures import (
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 
 if TYPE_CHECKING:
-    from typing import Any, Callable
+    from collections.abc import Callable
+    from typing import Any
 
 
 def single_timeline(
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index ac7497ee6c..46700e3fe3 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -17,14 +17,14 @@ from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage
 from fixtures.utils import wait_until
 
 if TYPE_CHECKING:
-    from typing import Any, Optional, Union
+    from typing import Any
 
 
 def assert_tenant_state(
     pageserver_http: PageserverHttpClient,
     tenant: TenantId,
     expected_state: str,
-    message: Optional[str] = None,
+    message: str | None = None,
 ) -> None:
     tenant_status = pageserver_http.tenant_status(tenant)
     log.info(f"tenant_status: {tenant_status}")
@@ -33,7 +33,7 @@ def assert_tenant_state(
 
 def remote_consistent_lsn(
     pageserver_http: PageserverHttpClient,
-    tenant: Union[TenantId, TenantShardId],
+    tenant: TenantId | TenantShardId,
     timeline: TimelineId,
 ) -> Lsn:
     detail = pageserver_http.timeline_detail(tenant, timeline)
@@ -51,7 +51,7 @@ def remote_consistent_lsn(
 
 def wait_for_upload(
     pageserver_http: PageserverHttpClient,
-    tenant: Union[TenantId, TenantShardId],
+    tenant: TenantId | TenantShardId,
     timeline: TimelineId,
     lsn: Lsn,
 ):
@@ -138,7 +138,7 @@ def wait_until_all_tenants_state(
 
 def wait_until_timeline_state(
     pageserver_http: PageserverHttpClient,
-    tenant_id: Union[TenantId, TenantShardId],
+    tenant_id: TenantId | TenantShardId,
     timeline_id: TimelineId,
     expected_state: str,
     iterations: int,
@@ -188,7 +188,7 @@ def wait_until_tenant_active(
 
 def last_record_lsn(
     pageserver_http_client: PageserverHttpClient,
-    tenant: Union[TenantId, TenantShardId],
+    tenant: TenantId | TenantShardId,
     timeline: TimelineId,
 ) -> Lsn:
     detail = pageserver_http_client.timeline_detail(tenant, timeline)
@@ -200,7 +200,7 @@ def last_record_lsn(
 
 def wait_for_last_record_lsn(
     pageserver_http: PageserverHttpClient,
-    tenant: Union[TenantId, TenantShardId],
+    tenant: TenantId | TenantShardId,
     timeline: TimelineId,
     lsn: Lsn,
 ) -> Lsn:
@@ -267,10 +267,10 @@ def wait_for_upload_queue_empty(
 
 def wait_timeline_detail_404(
     pageserver_http: PageserverHttpClient,
-    tenant_id: Union[TenantId, TenantShardId],
+    tenant_id: TenantId | TenantShardId,
     timeline_id: TimelineId,
     iterations: int,
-    interval: Optional[float] = None,
+    interval: float | None = None,
 ):
     if interval is None:
         interval = 0.25
@@ -292,10 +292,10 @@ def wait_timeline_detail_404(
 
 def timeline_delete_wait_completed(
     pageserver_http: PageserverHttpClient,
-    tenant_id: Union[TenantId, TenantShardId],
+    tenant_id: TenantId | TenantShardId,
     timeline_id: TimelineId,
     iterations: int = 20,
-    interval: Optional[float] = None,
+    interval: float | None = None,
     **delete_args,
 ) -> None:
     pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
@@ -304,9 +304,9 @@ def timeline_delete_wait_completed(
 
 # remote_storage must not be None, but that's easier for callers to make mypy happy
 def assert_prefix_empty(
-    remote_storage: Optional[RemoteStorage],
-    prefix: Optional[str] = None,
-    allowed_postfix: Optional[str] = None,
+    remote_storage: RemoteStorage | None,
+    prefix: str | None = None,
+    allowed_postfix: str | None = None,
     delimiter: str = "/",
 ) -> None:
     assert remote_storage is not None
@@ -348,8 +348,8 @@ def assert_prefix_empty(
 
 # remote_storage must not be None, but that's easier for callers to make mypy happy
 def assert_prefix_not_empty(
-    remote_storage: Optional[RemoteStorage],
-    prefix: Optional[str] = None,
+    remote_storage: RemoteStorage | None,
+    prefix: str | None = None,
     delimiter: str = "/",
 ):
     assert remote_storage is not None
@@ -358,7 +358,7 @@ def assert_prefix_not_empty(
 
 
 def list_prefix(
-    remote: RemoteStorage, prefix: Optional[str] = None, delimiter: str = "/"
+    remote: RemoteStorage, prefix: str | None = None, delimiter: str = "/"
 ) -> ListObjectsV2OutputTypeDef:
     """
     Note that this function takes into account prefix_in_bucket.
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index 1131bf090f..0286b4f036 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -11,7 +11,7 @@ from _pytest.python import Metafunc
 from fixtures.pg_version import PgVersion
 
 if TYPE_CHECKING:
-    from typing import Any, Optional
+    from typing import Any
 
 
 """
@@ -20,31 +20,31 @@ Dynamically parametrize tests by different parameters
 
 
 @pytest.fixture(scope="function", autouse=True)
-def pg_version() -> Optional[PgVersion]:
+def pg_version() -> PgVersion | None:
     return None
 
 
 @pytest.fixture(scope="function", autouse=True)
-def build_type() -> Optional[str]:
+def build_type() -> str | None:
     return None
 
 
 @pytest.fixture(scope="session", autouse=True)
-def platform() -> Optional[str]:
+def platform() -> str | None:
     return None
 
 
 @pytest.fixture(scope="function", autouse=True)
-def pageserver_virtual_file_io_engine() -> Optional[str]:
+def pageserver_virtual_file_io_engine() -> str | None:
     return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE")
 
 
 @pytest.fixture(scope="function", autouse=True)
-def pageserver_virtual_file_io_mode() -> Optional[str]:
+def pageserver_virtual_file_io_mode() -> str | None:
     return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE")
 
 
-def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]:
+def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None:
     toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM")
     if toml_table is None:
         return None
@@ -54,7 +54,7 @@ def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict
 
 
 @pytest.fixture(scope="function", autouse=True)
-def pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]:
+def pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None:
     return get_pageserver_default_tenant_config_compaction_algorithm()
 
 
diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py
index 60221573eb..1c71abea19 100644
--- a/test_runner/fixtures/paths.py
+++ b/test_runner/fixtures/paths.py
@@ -18,7 +18,6 @@ from fixtures.utils import allure_attach_from_dir
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from typing import Optional
 
 
 BASE_DIR = Path(__file__).parents[2]
@@ -26,9 +25,7 @@ COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc"
 DEFAULT_OUTPUT_DIR: str = "test_output"
 
 
-def get_test_dir(
-    request: FixtureRequest, top_output_dir: Path, prefix: Optional[str] = None
-) -> Path:
+def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | None = None) -> Path:
     """Compute the path to a working directory for an individual test."""
     test_name = request.node.name
     test_dir = top_output_dir / f"{prefix or ''}{test_name.replace('/', '-')}"
@@ -112,7 +109,7 @@ def compatibility_snapshot_dir() -> Iterator[Path]:
 
 
 @pytest.fixture(scope="session")
-def compatibility_neon_binpath() -> Iterator[Optional[Path]]:
+def compatibility_neon_binpath() -> Iterator[Path | None]:
     if os.getenv("REMOTE_ENV"):
         return
     comp_binpath = None
@@ -133,7 +130,7 @@ def pg_distrib_dir(base_dir: Path) -> Iterator[Path]:
 
 
 @pytest.fixture(scope="session")
-def compatibility_pg_distrib_dir() -> Iterator[Optional[Path]]:
+def compatibility_pg_distrib_dir() -> Iterator[Path | None]:
     compat_distrib_dir = None
     if env_compat_postgres_bin := os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR"):
         compat_distrib_dir = Path(env_compat_postgres_bin).resolve()
@@ -197,7 +194,7 @@ class FileAndThreadLock:
     def __init__(self, path: Path):
         self.path = path
         self.thread_lock = threading.Lock()
-        self.fd: Optional[int] = None
+        self.fd: int | None = None
 
     def __enter__(self):
         self.fd = os.open(self.path, os.O_CREAT | os.O_WRONLY)
@@ -208,9 +205,9 @@ class FileAndThreadLock:
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc_value: Optional[BaseException],
-        exc_traceback: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
     ):
         assert self.fd is not None
         assert self.thread_lock.locked()  # ... by us
@@ -263,9 +260,9 @@ class SnapshotDir:
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc_value: Optional[BaseException],
-        exc_traceback: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
     ):
         self._lock.__exit__(exc_type, exc_value, exc_traceback)
 
@@ -277,7 +274,7 @@ def shared_snapshot_dir(top_output_dir: Path, ident: str) -> SnapshotDir:
 
 
 @pytest.fixture(scope="function")
-def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]:
+def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path | None:
     """
     Idempotently create a test's overlayfs mount state directory.
     If the functionality isn't enabled via env var, returns None.
diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py
index 798db1e8d9..46423e8c76 100644
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -1,22 +1,16 @@
 from __future__ import annotations
 
-import enum
-from typing import TYPE_CHECKING
+from enum import StrEnum
 
 from typing_extensions import override
 
-if TYPE_CHECKING:
-    from typing import Optional
-
-
 """
 This fixture is used to determine which version of Postgres to use for tests.
 """
 
 
 # Inherit PgVersion from str rather than int to make it easier to pass as a command-line argument
-# TODO: use enum.StrEnum for Python >= 3.11
-class PgVersion(str, enum.Enum):
+class PgVersion(StrEnum):
     V14 = "14"
     V15 = "15"
     V16 = "16"
@@ -34,7 +28,6 @@ class PgVersion(str, enum.Enum):
     def __repr__(self) -> str:
         return f"'{self.value}'"
 
-    # Make this explicit for Python 3.11 compatibility, which changes the behavior of enums
     @override
     def __str__(self) -> str:
         return self.value
@@ -47,16 +40,18 @@ class PgVersion(str, enum.Enum):
 
     @classmethod
     @override
-    def _missing_(cls, value: object) -> Optional[PgVersion]:
-        known_values = {v.value for _, v in cls.__members__.items()}
+    def _missing_(cls, value: object) -> PgVersion | None:
+        if not isinstance(value, str):
+            return None
 
-        # Allow passing version as a string with "v" prefix (e.g. "v14")
-        if isinstance(value, str) and value.lower().startswith("v") and value[1:] in known_values:
-            return cls(value[1:])
-        # Allow passing version as an int (e.g. 15 or 150002, both will be converted to PgVersion.V15)
-        elif isinstance(value, int) and str(value)[:2] in known_values:
-            return cls(str(value)[:2])
+        known_values = set(cls.__members__.values())
+
+        # Allow passing version as v-prefixed string (e.g. "v14")
+        if value.lower().startswith("v") and (v := value[1:]) in known_values:
+            return cls(v)
+
+        # Allow passing version as an int (i.e. both "15" and "150002" matches PgVersion.V15)
+        if value.isdigit() and (v := value[:2]) in known_values:
+            return cls(v)
 
-        # Make mypy happy
-        # See https://github.com/python/mypy/issues/3974
         return None
diff --git a/test_runner/fixtures/port_distributor.py b/test_runner/fixtures/port_distributor.py
index df0eb2a809..6a829a9399 100644
--- a/test_runner/fixtures/port_distributor.py
+++ b/test_runner/fixtures/port_distributor.py
@@ -3,13 +3,9 @@ from __future__ import annotations
 import re
 import socket
 from contextlib import closing
-from typing import TYPE_CHECKING
 
 from fixtures.log_helper import log
 
-if TYPE_CHECKING:
-    from typing import Union
-
 
 def can_bind(host: str, port: int) -> bool:
     """
@@ -49,17 +45,19 @@ class PortDistributor:
             "port range configured for test is exhausted, consider enlarging the range"
         )
 
-    def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]:
+    def replace_with_new_port(self, value: int | str) -> int | str:
         """
         Returns a new port for a port number in a string (like "localhost:1234") or int.
         Replacements are memorised, so a substitution for the same port is always the same.
         """
 
-        # TODO: replace with structural pattern matching for Python >= 3.10
-        if isinstance(value, int):
-            return self._replace_port_int(value)
-
-        return self._replace_port_str(value)
+        match value:
+            case int():
+                return self._replace_port_int(value)
+            case str():
+                return self._replace_port_str(value)
+            case _:
+                raise TypeError(f"Unsupported type {type(value)}, should be int | str")
 
     def _replace_port_int(self, value: int) -> int:
         known_port = self.port_map.get(value)
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index c630ea98b4..4e1e8a884f 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -6,8 +6,9 @@ import json
 import os
 import re
 from dataclasses import dataclass
+from enum import StrEnum
 from pathlib import Path
-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING
 
 import boto3
 import toml
@@ -20,7 +21,7 @@ from fixtures.log_helper import log
 from fixtures.pageserver.common_types import IndexPartDump
 
 if TYPE_CHECKING:
-    from typing import Any, Optional
+    from typing import Any
 
 
 TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
@@ -28,7 +29,7 @@ TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json"
 
 
 @enum.unique
-class RemoteStorageUser(str, enum.Enum):
+class RemoteStorageUser(StrEnum):
     """
     Instead of using strings for the users, use a more strict enum.
     """
@@ -77,21 +78,19 @@ class MockS3Server:
 class LocalFsStorage:
     root: Path
 
-    def tenant_path(self, tenant_id: Union[TenantId, TenantShardId]) -> Path:
+    def tenant_path(self, tenant_id: TenantId | TenantShardId) -> Path:
         return self.root / "tenants" / str(tenant_id)
 
-    def timeline_path(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
-    ) -> Path:
+    def timeline_path(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId) -> Path:
         return self.tenant_path(tenant_id) / "timelines" / str(timeline_id)
 
     def timeline_latest_generation(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
-    ) -> Optional[int]:
+        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId
+    ) -> int | None:
         timeline_files = os.listdir(self.timeline_path(tenant_id, timeline_id))
         index_parts = [f for f in timeline_files if f.startswith("index_part")]
 
-        def parse_gen(filename: str) -> Optional[int]:
+        def parse_gen(filename: str) -> int | None:
             log.info(f"parsing index_part '{filename}'")
             parts = filename.split("-")
             if len(parts) == 2:
@@ -104,9 +103,7 @@ class LocalFsStorage:
             raise RuntimeError(f"No index_part found for {tenant_id}/{timeline_id}")
         return generations[-1]
 
-    def index_path(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
-    ) -> Path:
+    def index_path(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId) -> Path:
         latest_gen = self.timeline_latest_generation(tenant_id, timeline_id)
         if latest_gen is None:
             filename = TIMELINE_INDEX_PART_FILE_NAME
@@ -120,7 +117,7 @@ class LocalFsStorage:
         tenant_id: TenantId,
         timeline_id: TimelineId,
         local_name: str,
-        generation: Optional[int] = None,
+        generation: int | None = None,
     ):
         if generation is None:
             generation = self.timeline_latest_generation(tenant_id, timeline_id)
@@ -130,9 +127,7 @@ class LocalFsStorage:
         filename = f"{local_name}-{generation:08x}"
         return self.timeline_path(tenant_id, timeline_id) / filename
 
-    def index_content(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
-    ) -> Any:
+    def index_content(self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId) -> Any:
         with self.index_path(tenant_id, timeline_id).open("r") as f:
             return json.load(f)
 
@@ -164,17 +159,17 @@ class LocalFsStorage:
 class S3Storage:
     bucket_name: str
     bucket_region: str
-    access_key: Optional[str]
-    secret_key: Optional[str]
-    aws_profile: Optional[str]
+    access_key: str | None
+    secret_key: str | None
+    aws_profile: str | None
     prefix_in_bucket: str
     client: S3Client
     cleanup: bool
     """Is this MOCK_S3 (false) or REAL_S3 (true)"""
     real: bool
-    endpoint: Optional[str] = None
+    endpoint: str | None = None
     """formatting deserialized with humantime crate, for example "1s"."""
-    custom_timeout: Optional[str] = None
+    custom_timeout: str | None = None
 
     def access_env_vars(self) -> dict[str, str]:
         if self.aws_profile is not None:
@@ -272,12 +267,10 @@ class S3Storage:
     def tenants_path(self) -> str:
         return f"{self.prefix_in_bucket}/tenants"
 
-    def tenant_path(self, tenant_id: Union[TenantShardId, TenantId]) -> str:
+    def tenant_path(self, tenant_id: TenantShardId | TenantId) -> str:
         return f"{self.tenants_path()}/{tenant_id}"
 
-    def timeline_path(
-        self, tenant_id: Union[TenantShardId, TenantId], timeline_id: TimelineId
-    ) -> str:
+    def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str:
         return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}"
 
     def get_latest_index_key(self, index_keys: list[str]) -> str:
@@ -315,11 +308,11 @@ class S3Storage:
         assert self.real is False
 
 
-RemoteStorage = Union[LocalFsStorage, S3Storage]
+RemoteStorage = LocalFsStorage | S3Storage
 
 
 @enum.unique
-class RemoteStorageKind(str, enum.Enum):
+class RemoteStorageKind(StrEnum):
     LOCAL_FS = "local_fs"
     MOCK_S3 = "mock_s3"
     REAL_S3 = "real_s3"
@@ -331,8 +324,8 @@ class RemoteStorageKind(str, enum.Enum):
         run_id: str,
         test_name: str,
         user: RemoteStorageUser,
-        bucket_name: Optional[str] = None,
-        bucket_region: Optional[str] = None,
+        bucket_name: str | None = None,
+        bucket_region: str | None = None,
     ) -> RemoteStorage:
         if self == RemoteStorageKind.LOCAL_FS:
             return LocalFsStorage(LocalFsStorage.component_path(repo_dir, user))
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index 5d9a3bd149..094188c0b5 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -13,7 +13,7 @@ from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.utils import wait_until
 
 if TYPE_CHECKING:
-    from typing import Any, Optional, Union
+    from typing import Any
 
 
 # Walreceiver as returned by sk's timeline status endpoint.
@@ -72,7 +72,7 @@ class TermBumpResponse:
 class SafekeeperHttpClient(requests.Session, MetricsGetter):
     HTTPError = requests.HTTPError
 
-    def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
+    def __init__(self, port: int, auth_token: str | None = None, is_testing_enabled=False):
         super().__init__()
         self.port = port
         self.auth_token = auth_token
@@ -98,7 +98,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         if not self.is_testing_enabled:
             pytest.skip("safekeeper was built without 'testing' feature")
 
-    def configure_failpoints(self, config_strings: Union[tuple[str, str], list[tuple[str, str]]]):
+    def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]):
         self.is_testing_enabled_or_skip()
 
         if isinstance(config_strings, tuple):
@@ -195,7 +195,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         assert isinstance(res_json, dict)
         return res_json
 
-    def debug_dump(self, params: Optional[dict[str, str]] = None) -> dict[str, Any]:
+    def debug_dump(self, params: dict[str, str] | None = None) -> dict[str, Any]:
         params = params or {}
         res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
         res.raise_for_status()
@@ -204,7 +204,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         return res_json
 
     def debug_dump_timeline(
-        self, timeline_id: TimelineId, params: Optional[dict[str, str]] = None
+        self, timeline_id: TimelineId, params: dict[str, str] | None = None
     ) -> Any:
         params = params or {}
         params["timeline_id"] = str(timeline_id)
@@ -285,7 +285,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         self,
         tenant_id: TenantId,
         timeline_id: TimelineId,
-        term: Optional[int],
+        term: int | None,
     ) -> TermBumpResponse:
         body = {}
         if term is not None:
diff --git a/test_runner/fixtures/storage_controller_proxy.py b/test_runner/fixtures/storage_controller_proxy.py
index c174358ef5..be95a98ff9 100644
--- a/test_runner/fixtures/storage_controller_proxy.py
+++ b/test_runner/fixtures/storage_controller_proxy.py
@@ -13,14 +13,14 @@ from werkzeug.wrappers.response import Response
 from fixtures.log_helper import log
 
 if TYPE_CHECKING:
-    from typing import Any, Optional
+    from typing import Any
 
 
 class StorageControllerProxy:
     def __init__(self, server: HTTPServer):
         self.server: HTTPServer = server
         self.listen: str = f"http://{server.host}:{server.port}"
-        self.routing_to: Optional[str] = None
+        self.routing_to: str | None = None
 
     def route_to(self, storage_controller_api: str):
         self.routing_to = storage_controller_api
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index bb45385ea6..010801be6c 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -8,10 +8,10 @@ import subprocess
 import tarfile
 import threading
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from hashlib import sha256
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, TypeVar
+from typing import TYPE_CHECKING, Any, TypeVar
 from urllib.parse import urlencode
 
 import allure
@@ -29,7 +29,7 @@ from fixtures.pg_version import PgVersion
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
-    from typing import IO, Optional
+    from typing import IO
 
     from fixtures.common_types import TimelineId
     from fixtures.neon_fixtures import PgBin
@@ -66,10 +66,10 @@ def subprocess_capture(
     echo_stderr: bool = False,
     echo_stdout: bool = False,
     capture_stdout: bool = False,
-    timeout: Optional[float] = None,
+    timeout: float | None = None,
     with_command_header: bool = True,
     **popen_kwargs: Any,
-) -> tuple[str, Optional[str], int]:
+) -> tuple[str, str | None, int]:
     """Run a process and bifurcate its output to files and the `log` logger
 
     stderr and stdout are always captured in files.  They are also optionally
@@ -536,7 +536,7 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str
     """
     started_at = time.time()
 
-    def hash_extracted(reader: Optional[IO[bytes]]) -> bytes:
+    def hash_extracted(reader: IO[bytes] | None) -> bytes:
         assert reader is not None
         digest = sha256(usedforsecurity=False)
         while True:
@@ -563,7 +563,7 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str
 
     mismatching: set[str] = set()
 
-    for left_tuple, right_tuple in zip(left_list, right_list):
+    for left_tuple, right_tuple in zip(left_list, right_list, strict=False):
         left_path, left_hash = left_tuple
         right_path, right_hash = right_tuple
         assert (
@@ -595,7 +595,7 @@ class PropagatingThread(threading.Thread):
             self.exc = e
 
     @override
-    def join(self, timeout: Optional[float] = None) -> Any:
+    def join(self, timeout: float | None = None) -> Any:
         super().join(timeout)
         if self.exc:
             raise self.exc
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index e869c43185..639e60914a 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -15,7 +15,7 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.utils import wait_for_last_record_lsn
 
 if TYPE_CHECKING:
-    from typing import Any, Optional
+    from typing import Any
 
 # neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex
 # to ensure we don't do that: this enables running lots of Workloads in parallel safely.
@@ -36,8 +36,8 @@ class Workload:
         env: NeonEnv,
         tenant_id: TenantId,
         timeline_id: TimelineId,
-        branch_name: Optional[str] = None,
-        endpoint_opts: Optional[dict[str, Any]] = None,
+        branch_name: str | None = None,
+        endpoint_opts: dict[str, Any] | None = None,
     ):
         self.env = env
         self.tenant_id = tenant_id
@@ -50,7 +50,7 @@ class Workload:
         self.expect_rows = 0
         self.churn_cursor = 0
 
-        self._endpoint: Optional[Endpoint] = None
+        self._endpoint: Endpoint | None = None
         self._endpoint_opts = endpoint_opts or {}
 
     def reconfigure(self):
@@ -61,7 +61,7 @@ class Workload:
             with ENDPOINT_LOCK:
                 self._endpoint.reconfigure()
 
-    def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
+    def endpoint(self, pageserver_id: int | None = None) -> Endpoint:
         # We may be running alongside other Workloads for different tenants.  Full TTID is
         # obnoxiously long for use here, but a cut-down version is still unique enough for tests.
         endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}"
@@ -94,7 +94,7 @@ class Workload:
     def __del__(self):
         self.stop()
 
-    def init(self, pageserver_id: Optional[int] = None):
+    def init(self, pageserver_id: int | None = None):
         endpoint = self.endpoint(pageserver_id)
 
         endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);")
@@ -103,7 +103,7 @@ class Workload:
             self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
         )
 
-    def write_rows(self, n: int, pageserver_id: Optional[int] = None, upload: bool = True):
+    def write_rows(self, n: int, pageserver_id: int | None = None, upload: bool = True):
         endpoint = self.endpoint(pageserver_id)
         start = self.expect_rows
         end = start + n - 1
@@ -125,7 +125,7 @@ class Workload:
             return False
 
     def churn_rows(
-        self, n: int, pageserver_id: Optional[int] = None, upload: bool = True, ingest: bool = True
+        self, n: int, pageserver_id: int | None = None, upload: bool = True, ingest: bool = True
     ):
         assert self.expect_rows >= n
 
@@ -190,7 +190,7 @@ class Workload:
                 else:
                     log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
 
-    def validate(self, pageserver_id: Optional[int] = None):
+    def validate(self, pageserver_id: int | None = None):
         endpoint = self.endpoint(pageserver_id)
         endpoint.clear_shared_buffers()
         result = endpoint.safe_psql(f"SELECT COUNT(*) FROM {self.table}")
diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py
index 227319c425..bcc3db69f0 100644
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -16,7 +16,8 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.utils import wait_until_all_tenants_state
 
 if TYPE_CHECKING:
-    from typing import Any, Callable, Optional
+    from collections.abc import Callable
+    from typing import Any
 
 
 def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
@@ -46,7 +47,7 @@ def setup_pageserver_with_tenants(
     name: str,
     n_tenants: int,
     setup: Callable[[NeonEnv], tuple[TenantId, TimelineId, dict[str, Any]]],
-    timeout_in_seconds: Optional[int] = None,
+    timeout_in_seconds: int | None = None,
 ) -> NeonEnv:
     """
     Utility function to set up a pageserver with a given number of identical tenants.
diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py
index d571fab6b5..0e56fdc96f 100644
--- a/test_runner/performance/test_copy.py
+++ b/test_runner/performance/test_copy.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 from contextlib import closing
 from io import BufferedReader, RawIOBase
-from typing import Optional, final
+from typing import final
 
 from fixtures.compare_fixtures import PgCompare
 from typing_extensions import override
@@ -13,7 +13,7 @@ class CopyTestData(RawIOBase):
     def __init__(self, rows: int):
         self.rows = rows
         self.rownum = 0
-        self.linebuf: Optional[bytes] = None
+        self.linebuf: bytes | None = None
         self.ptr = 0
 
     @override
diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py
index d56f6dce09..38b04b9114 100644
--- a/test_runner/performance/test_physical_replication.py
+++ b/test_runner/performance/test_physical_replication.py
@@ -18,7 +18,7 @@ from fixtures.neon_api import connection_parameters_to_env
 from fixtures.pg_version import PgVersion
 
 if TYPE_CHECKING:
-    from typing import Any, Optional
+    from typing import Any
 
     from fixtures.benchmark_fixture import NeonBenchmarker
     from fixtures.neon_api import NeonAPI
@@ -247,7 +247,7 @@ def test_replication_start_stop(
             ],
             env=master_env,
         )
-        replica_pgbench: list[Optional[subprocess.Popen[Any]]] = [None for _ in range(num_replicas)]
+        replica_pgbench: list[subprocess.Popen[Any] | None] = [None] * num_replicas
 
         # Use the bits of iconfig to tell us which configuration we are on. For example
         # a iconfig of 2 is 10 in binary, indicating replica 0 is suspended and replica 1 is
diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py
index dc051483f8..142bd3d669 100644
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -4,7 +4,7 @@ import concurrent.futures
 import random
 import time
 from collections import defaultdict
-from enum import Enum
+from enum import StrEnum
 
 import pytest
 from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId
@@ -139,7 +139,7 @@ def test_storage_controller_many_tenants(
     tenant_timelines_count = 100
 
     # These lists are maintained for use with rng.choice
-    tenants_with_timelines = list(rng.sample(tenants.keys(), tenant_timelines_count))
+    tenants_with_timelines = list(rng.sample(list(tenants.keys()), tenant_timelines_count))
     tenants_without_timelines = list(
         tenant_id for tenant_id in tenants if tenant_id not in tenants_with_timelines
     )
@@ -171,7 +171,7 @@ def test_storage_controller_many_tenants(
     # start timing on test nodes if we aren't a bit careful.
     create_concurrency = 16
 
-    class Operation(str, Enum):
+    class Operation(StrEnum):
         TIMELINE_OPS = "timeline_ops"
         SHARD_MIGRATE = "shard_migrate"
         TENANT_PASSTHROUGH = "tenant_passthrough"
diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py
index 576a4f0467..c6d795ce4d 100644
--- a/test_runner/performance/test_wal_backpressure.py
+++ b/test_runner/performance/test_wal_backpressure.py
@@ -17,7 +17,8 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, flush_ep_to_pageserver
 from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix
 
 if TYPE_CHECKING:
-    from typing import Any, Callable
+    from collections.abc import Callable
+    from typing import Any
 
 
 @pytest.fixture(params=["vanilla", "neon_off", "neon_on"])
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 7d19ba3b5d..5744c445f6 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 from collections.abc import Generator
 from dataclasses import dataclass
-from typing import Optional
 
 import pytest
 from fixtures.common_types import TenantId
@@ -105,7 +104,7 @@ def test_null_config(negative_env: NegativeTests):
 
 
 @pytest.mark.parametrize("content_type", [None, "application/json"])
-def test_empty_config(positive_env: NeonEnv, content_type: Optional[str]):
+def test_empty_config(positive_env: NeonEnv, content_type: str | None):
     """
     When the 'config' body attribute is omitted, the request should be accepted
     and the tenant should use the default configuration
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 48950a5a50..f71e05924a 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -1,9 +1,8 @@
 from __future__ import annotations
 
-import enum
 import json
 import time
-from typing import TYPE_CHECKING
+from enum import StrEnum
 
 import pytest
 from fixtures.log_helper import log
@@ -15,10 +14,6 @@ from fixtures.pageserver.http import PageserverApiException
 from fixtures.utils import skip_in_debug_build, wait_until
 from fixtures.workload import Workload
 
-if TYPE_CHECKING:
-    from typing import Optional
-
-
 AGGRESIVE_COMPACTION_TENANT_CONF = {
     # Disable gc and compaction. The test runs compaction manually.
     "gc_period": "0s",
@@ -172,7 +167,7 @@ LARGE_STRIPES = 32768
 def test_sharding_compaction(
     neon_env_builder: NeonEnvBuilder,
     stripe_size: int,
-    shard_count: Optional[int],
+    shard_count: int | None,
     gc_compaction: bool,
 ):
     """
@@ -277,7 +272,7 @@ def test_sharding_compaction(
             )
 
 
-class CompactionAlgorithm(str, enum.Enum):
+class CompactionAlgorithm(StrEnum):
     LEGACY = "legacy"
     TIERED = "tiered"
 
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 96ba3dd5a4..ba7305148f 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -7,7 +7,6 @@ import subprocess
 import tempfile
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING
 
 import fixtures.utils
 import pytest
@@ -28,10 +27,6 @@ from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
 from fixtures.workload import Workload
 
-if TYPE_CHECKING:
-    from typing import Optional
-
-
 #
 # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases.
 # - `test_create_snapshot` a script wrapped in a test that creates a data snapshot.
@@ -385,7 +380,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
 
 
 def dump_differs(
-    first: Path, second: Path, output: Path, allowed_diffs: Optional[list[str]] = None
+    first: Path, second: Path, output: Path, allowed_diffs: list[str] | None = None
 ) -> bool:
     """
     Runs diff(1) command on two SQL dumps and write the output to the given output file.
diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py
index c5e3034591..c785036292 100644
--- a/test_runner/regress/test_compute_metrics.py
+++ b/test_runner/regress/test_compute_metrics.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import enum
 import os
 import shutil
+from enum import StrEnum
 from pathlib import Path
 from typing import TYPE_CHECKING, cast
 
@@ -16,7 +17,7 @@ from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR
 
 if TYPE_CHECKING:
     from types import TracebackType
-    from typing import Optional, TypedDict, Union
+    from typing import TypedDict
 
     from fixtures.neon_fixtures import NeonEnv
     from fixtures.pg_version import PgVersion
@@ -26,15 +27,15 @@ if TYPE_CHECKING:
         metric_name: str
         type: str
         help: str
-        key_labels: Optional[list[str]]
-        values: Optional[list[str]]
-        query: Optional[str]
-        query_ref: Optional[str]
+        key_labels: list[str] | None
+        values: list[str] | None
+        query: str | None
+        query_ref: str | None
 
     class Collector(TypedDict):
         collector_name: str
         metrics: list[Metric]
-        queries: Optional[list[Query]]
+        queries: list[Query] | None
 
     class Query(TypedDict):
         query_name: str
@@ -53,12 +54,12 @@ def __import_callback(dir: str, rel: str) -> tuple[str, bytes]:
     if not rel:
         raise RuntimeError("Empty filename")
 
-    full_path: Optional[str] = None
+    full_path: str | None = None
     if os.path.isabs(rel):
         full_path = rel
     else:
         for p in (dir, *JSONNET_PATH):
-            assert isinstance(p, (str, Path)), "for mypy"
+            assert isinstance(p, str | Path), "for mypy"
             full_path = os.path.join(p, rel)
 
             assert isinstance(full_path, str), "for mypy"
@@ -82,9 +83,9 @@ def __import_callback(dir: str, rel: str) -> tuple[str, bytes]:
 
 
 def jsonnet_evaluate_file(
-    jsonnet_file: Union[str, Path],
-    ext_vars: Optional[Union[str, dict[str, str]]] = None,
-    tla_vars: Optional[Union[str, dict[str, str]]] = None,
+    jsonnet_file: str | Path,
+    ext_vars: str | dict[str, str] | None = None,
+    tla_vars: str | dict[str, str] | None = None,
 ) -> str:
     return cast(
         "str",
@@ -102,7 +103,7 @@ def evaluate_collector(jsonnet_file: Path, pg_version: PgVersion) -> str:
 
 
 def evaluate_config(
-    jsonnet_file: Path, collector_name: str, collector_file: Union[str, Path], connstr: str
+    jsonnet_file: Path, collector_name: str, collector_file: str | Path, connstr: str
 ) -> str:
     return jsonnet_evaluate_file(
         jsonnet_file,
@@ -115,7 +116,7 @@ def evaluate_config(
 
 
 @enum.unique
-class SqlExporterProcess(str, enum.Enum):
+class SqlExporterProcess(StrEnum):
     COMPUTE = "compute"
     AUTOSCALING = "autoscaling"
 
@@ -191,9 +192,9 @@ class SqlExporterRunner:
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc: Optional[BaseException],
-        tb: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        tb: TracebackType | None,
     ):
         self.stop()
 
diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py
index e517e83e6f..8fb74f46e4 100644
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -13,7 +13,7 @@ from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
 if TYPE_CHECKING:
-    from typing import Any, Optional
+    from typing import Any
 
 
 def handle_db(dbs, roles, operation):
@@ -97,9 +97,9 @@ class DdlForwardingContext:
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc: Optional[BaseException],
-        tb: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        tb: TracebackType | None,
     ):
         self.pg.stop()
 
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index c8d3b2ff3e..1807511008 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -5,6 +5,7 @@ import time
 from collections import Counter
 from collections.abc import Iterable
 from dataclasses import dataclass
+from enum import StrEnum
 from typing import TYPE_CHECKING
 
 import pytest
@@ -80,7 +81,7 @@ def test_min_resident_size_override_handling(
 
 
 @enum.unique
-class EvictionOrder(str, enum.Enum):
+class EvictionOrder(StrEnum):
     RELATIVE_ORDER_EQUAL = "relative_equal"
     RELATIVE_ORDER_SPARE = "relative_spare"
 
diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py
index 2916748925..9c9bc5b519 100644
--- a/test_runner/regress/test_ingestion_layer_size.py
+++ b/test_runner/regress/test_ingestion_layer_size.py
@@ -2,16 +2,12 @@ from __future__ import annotations
 
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
 
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
 from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo
 from fixtures.utils import human_bytes, skip_in_debug_build
 
-if TYPE_CHECKING:
-    from typing import Union
-
 
 @skip_in_debug_build("debug run is unnecessarily slow")
 def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder):
@@ -109,14 +105,12 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder):
 
 @dataclass
 class Histogram:
-    buckets: list[Union[int, float]]
+    buckets: list[int | float]
     counts: list[int]
     sums: list[int]
 
 
-def histogram_historic_layers(
-    infos: LayerMapInfo, minimum_sizes: list[Union[int, float]]
-) -> Histogram:
+def histogram_historic_layers(infos: LayerMapInfo, minimum_sizes: list[int | float]) -> Histogram:
     def log_layer(layer: HistoricLayerInfo) -> HistoricLayerInfo:
         log.info(
             f"{layer.layer_file_name} {human_bytes(layer.layer_file_size)} ({layer.layer_file_size} bytes)"
@@ -128,7 +122,7 @@ def histogram_historic_layers(
     return histogram(sizes, minimum_sizes)
 
 
-def histogram(sizes: Iterable[int], minimum_sizes: list[Union[int, float]]) -> Histogram:
+def histogram(sizes: Iterable[int], minimum_sizes: list[int | float]) -> Histogram:
     assert all(minimum_sizes[i] < minimum_sizes[i + 1] for i in range(len(minimum_sizes) - 1))
     buckets = list(enumerate(minimum_sizes))
     counts = [0 for _ in buckets]
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
index 8b41d0cb1c..7f0b541128 100644
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 import re
 import time
 from concurrent.futures import ThreadPoolExecutor
-from datetime import datetime, timedelta, timezone
+from datetime import UTC, datetime, timedelta
 
 import pytest
 from fixtures.common_types import Lsn
@@ -207,7 +207,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
     for i in range(1000):
         cur.execute("INSERT INTO foo VALUES(%s)", (i,))
         # Get the timestamp at UTC
-        after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=timezone.utc)
+        after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=UTC)
         after_lsn = query_scalar(cur, "SELECT pg_current_wal_lsn()")
         tbl.append([i, after_timestamp, after_lsn])
         time.sleep(0.02)
@@ -273,11 +273,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
             )
             log.info("result: %s, after_ts: %s", result, after_timestamp)
 
-            # TODO use fromisoformat once we have Python 3.11+
-            # which has https://github.com/python/cpython/pull/92177
-            timestamp = datetime.strptime(result, "%Y-%m-%dT%H:%M:%S.%f000Z").replace(
-                tzinfo=timezone.utc
-            )
+            timestamp = datetime.fromisoformat(result).replace(tzinfo=UTC)
             assert timestamp < after_timestamp, "after_timestamp after timestamp"
             if i > 1:
                 before_timestamp = tbl[i - step_size][1]
diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py
index 5eaba78331..f0f12290cc 100644
--- a/test_runner/regress/test_ondemand_slru_download.py
+++ b/test_runner/regress/test_ondemand_slru_download.py
@@ -1,7 +1,5 @@
 from __future__ import annotations
 
-from typing import Optional
-
 import pytest
 from fixtures.common_types import Lsn
 from fixtures.log_helper import log
@@ -13,7 +11,7 @@ from fixtures.utils import query_scalar
 # Test on-demand download of the pg_xact SLRUs
 #
 @pytest.mark.parametrize("shard_count", [None, 4])
-def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
+def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: int | None):
     if shard_count is not None:
         neon_env_builder.num_pageservers = shard_count
 
@@ -79,7 +77,7 @@ def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count
 
 
 @pytest.mark.parametrize("shard_count", [None, 4])
-def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
+def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: int | None):
     if shard_count is not None:
         neon_env_builder.num_pageservers = shard_count
 
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index d1b70b9ee6..05e81b82e0 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -1,7 +1,5 @@
 from __future__ import annotations
 
-from typing import Optional
-
 from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.neon_fixtures import (
     DEFAULT_BRANCH_NAME,
@@ -82,7 +80,7 @@ def expect_updated_msg_lsn(
     client: PageserverHttpClient,
     tenant_id: TenantId,
     timeline_id: TimelineId,
-    prev_msg_lsn: Optional[Lsn],
+    prev_msg_lsn: Lsn | None,
 ) -> Lsn:
     timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id)
 
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 4f59efb8b3..d5bbfbc7fc 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -11,11 +11,10 @@ of the pageserver are:
 
 from __future__ import annotations
 
-import enum
 import os
 import re
 import time
-from typing import TYPE_CHECKING
+from enum import StrEnum
 
 import pytest
 from fixtures.common_types import TenantId, TimelineId
@@ -41,10 +40,6 @@ from fixtures.remote_storage import (
 from fixtures.utils import run_only_on_default_postgres, wait_until
 from fixtures.workload import Workload
 
-if TYPE_CHECKING:
-    from typing import Optional
-
-
 # A tenant configuration that is convenient for generating uploads and deletions
 # without a large amount of postgres traffic.
 TENANT_CONF = {
@@ -65,7 +60,7 @@ TENANT_CONF = {
 
 
 def read_all(
-    env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None
+    env: NeonEnv, tenant_id: TenantId | None = None, timeline_id: TimelineId | None = None
 ):
     if tenant_id is None:
         tenant_id = env.initial_tenant
@@ -286,12 +281,12 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
     assert get_deletion_queue_unexpected_errors(ps_http) == 0
 
 
-class KeepAttachment(str, enum.Enum):
+class KeepAttachment(StrEnum):
     KEEP = "keep"
     LOSE = "lose"
 
 
-class ValidateBefore(str, enum.Enum):
+class ValidateBefore(StrEnum):
     VALIDATE = "validate"
     NO_VALIDATE = "no-validate"
 
diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py
index 200a323a3a..590354e9da 100644
--- a/test_runner/regress/test_pageserver_layer_rolling.py
+++ b/test_runner/regress/test_pageserver_layer_rolling.py
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 import asyncio
 import time
-from typing import TYPE_CHECKING
 
 import psutil
 import pytest
@@ -17,17 +16,13 @@ from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
 from fixtures.utils import skip_in_debug_build, wait_until
 
-if TYPE_CHECKING:
-    from typing import Optional
-
-
 TIMELINE_COUNT = 10
 ENTRIES_PER_TIMELINE = 10_000
 CHECKPOINT_TIMEOUT_SECONDS = 60
 
 
 async def run_worker_for_tenant(
-    env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None
+    env: NeonEnv, entries: int, tenant: TenantId, offset: int | None = None
 ) -> Lsn:
     if offset is None:
         offset = 0
diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py
index fb6050689c..4bf5705517 100644
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 import random
 from contextlib import closing
-from typing import Optional
 
 import pytest
 from fixtures.log_helper import log
@@ -156,7 +155,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
 @pytest.mark.timeout(540)
 @pytest.mark.parametrize("shard_count", [None, 4])
 @skip_in_debug_build("times out in debug builds")
-def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
+def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: int | None):
     # same rationale as with the immediate stop; we might leave orphan layers behind.
     neon_env_builder.disable_scrub_on_exit()
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 12134048e6..de0344bc29 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -23,7 +23,7 @@ from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
 if TYPE_CHECKING:
-    from typing import Any, Optional, Union
+    from typing import Any
 
 
 # A tenant configuration that is convenient for generating uploads and deletions
@@ -199,7 +199,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
                 # state if it was running attached with a stale generation
                 last_state[pageserver.id] = ("Detached", None)
         else:
-            secondary_conf: Optional[dict[str, Any]] = None
+            secondary_conf: dict[str, Any] | None = None
             if mode == "Secondary":
                 secondary_conf = {"warm": rng.choice([True, False])}
 
@@ -469,7 +469,7 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
 
 
 def list_elegible_layers(
-    pageserver, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    pageserver, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId
 ) -> list[Path]:
     """
     The subset of layer filenames that are elegible for secondary download: at time of writing this
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index 6a5e388c53..2877f14e0e 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -21,8 +21,6 @@ from fixtures.remote_storage import s3_storage
 from fixtures.utils import skip_in_debug_build
 
 if TYPE_CHECKING:
-    from typing import Optional
-
     from fixtures.neon_fixtures import PgBin
     from pytest import CaptureFixture
 
@@ -48,7 +46,7 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End
     data properly.
     """
 
-    ignored_files: Optional[list[str]] = None
+    ignored_files: list[str] | None = None
 
     # Neon handles unlogged relations in a special manner. During a
     # basebackup, we ship the init fork as the main fork. This presents a
@@ -131,7 +129,7 @@ def test_pg_regress(
     capsys: CaptureFixture[str],
     base_dir: Path,
     pg_distrib_dir: Path,
-    shard_count: Optional[int],
+    shard_count: int | None,
 ):
     DBNAME = "regression"
 
@@ -205,7 +203,7 @@ def test_isolation(
     capsys: CaptureFixture[str],
     base_dir: Path,
     pg_distrib_dir: Path,
-    shard_count: Optional[int],
+    shard_count: int | None,
 ):
     DBNAME = "isolation_regression"
 
@@ -274,7 +272,7 @@ def test_sql_regress(
     capsys: CaptureFixture[str],
     base_dir: Path,
     pg_distrib_dir: Path,
-    shard_count: Optional[int],
+    shard_count: int | None,
 ):
     DBNAME = "regression"
 
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index e59d46e352..5a01d90d85 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -13,7 +13,7 @@ import requests
 from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres
 
 if TYPE_CHECKING:
-    from typing import Any, Optional
+    from typing import Any
 
 
 GET_CONNECTION_PID_QUERY = "SELECT pid FROM pg_stat_activity WHERE state = 'active'"
@@ -228,7 +228,7 @@ def test_sql_over_http_serverless_driver(static_proxy: NeonProxy):
 def test_sql_over_http(static_proxy: NeonProxy):
     static_proxy.safe_psql("create role http with login password 'http' superuser")
 
-    def q(sql: str, params: Optional[list[Any]] = None) -> Any:
+    def q(sql: str, params: list[Any] | None = None) -> Any:
         params = params or []
         connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
         response = requests.post(
@@ -291,7 +291,7 @@ def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy):
         )
     )
 
-    def q(sql: str, params: Optional[list[Any]] = None) -> Any:
+    def q(sql: str, params: list[Any] | None = None) -> Any:
         params = params or []
         connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/{urllib.parse.quote(db)}"
         response = requests.post(
@@ -310,7 +310,7 @@ def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy):
 def test_sql_over_http_output_options(static_proxy: NeonProxy):
     static_proxy.safe_psql("create role http2 with login password 'http2' superuser")
 
-    def q(sql: str, raw_text: bool, array_mode: bool, params: Optional[list[Any]] = None) -> Any:
+    def q(sql: str, raw_text: bool, array_mode: bool, params: list[Any] | None = None) -> Any:
         params = params or []
         connstr = (
             f"postgresql://http2:http2@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
@@ -346,7 +346,7 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
     static_proxy.safe_psql("create role http with login password 'http' superuser")
 
     def qq(
-        queries: list[tuple[str, Optional[list[Any]]]],
+        queries: list[tuple[str, list[Any] | None]],
         read_only: bool = False,
         deferrable: bool = False,
     ) -> Any:
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index 826136d5f9..2b274003a3 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import time
-from typing import Union
 
 import pytest
 from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
@@ -175,7 +174,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
 
     def get_layers_protected_by_lease(
         ps_http: PageserverHttpClient,
-        tenant_id: Union[TenantId, TenantShardId],
+        tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
         lease_lsn: Lsn,
     ) -> set[str]:
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 79b5ebe39a..137e75f784 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -5,7 +5,6 @@ import queue
 import shutil
 import threading
 import time
-from typing import TYPE_CHECKING
 
 import pytest
 from fixtures.common_types import Lsn, TenantId, TimelineId
@@ -37,9 +36,6 @@ from fixtures.utils import (
 )
 from requests import ReadTimeout
 
-if TYPE_CHECKING:
-    from typing import Optional
-
 
 #
 # Tests that a piece of data is backed up and restored correctly:
@@ -452,7 +448,7 @@ def test_remote_timeline_client_calls_started_metric(
         for (file_kind, op_kind), observations in calls_started.items():
             log.info(f"ensure_calls_started_grew: {file_kind} {op_kind}: {observations}")
             assert all(
-                x < y for x, y in zip(observations, observations[1:])
+                x < y for x, y in zip(observations, observations[1:], strict=False)
             ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}"
 
     def churn(data_pass1, data_pass2):
@@ -731,7 +727,7 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
     # sleep a bit to force the upload task go into exponential backoff
     time.sleep(1)
 
-    q: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
+    q: queue.Queue[PageserverApiException | None] = queue.Queue()
     barrier = threading.Barrier(2)
 
     def create_in_background():
diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py
index 7a9e6d62b2..8764da3c2f 100644
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import time
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 
 from fixtures.common_types import Lsn
 from fixtures.log_helper import log
@@ -77,7 +77,7 @@ def test_tenant_s3_restore(
 
     # These sleeps are important because they fend off differences in clocks between us and S3
     time.sleep(4)
-    ts_before_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    ts_before_deletion = datetime.now(tz=UTC).replace(tzinfo=None)
     time.sleep(4)
 
     assert (
@@ -104,7 +104,7 @@ def test_tenant_s3_restore(
     )
 
     time.sleep(4)
-    ts_after_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    ts_after_deletion = datetime.now(tz=UTC).replace(tzinfo=None)
     time.sleep(4)
 
     ps_http.tenant_time_travel_remote_storage(
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 3194fe6ec4..16bfa83b43 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 import os
 import time
 from collections import defaultdict
-from typing import TYPE_CHECKING, Any
+from typing import Any
 
 import pytest
 import requests
@@ -27,9 +27,6 @@ from typing_extensions import override
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
-if TYPE_CHECKING:
-    from typing import Optional, Union
-
 
 def test_sharding_smoke(
     neon_env_builder: NeonEnvBuilder,
@@ -189,7 +186,7 @@ def test_sharding_split_unsharded(
     ],
 )
 def test_sharding_split_compaction(
-    neon_env_builder: NeonEnvBuilder, failpoint: Optional[str], build_type: str
+    neon_env_builder: NeonEnvBuilder, failpoint: str | None, build_type: str
 ):
     """
     Test that after a split, we clean up parent layer data in the child shards via compaction.
@@ -782,7 +779,7 @@ def test_sharding_split_stripe_size(
     tenant_id = env.initial_tenant
 
     assert len(notifications) == 1
-    expect: dict[str, Union[list[dict[str, int]], str, None, int]] = {
+    expect: dict[str, list[dict[str, int]] | str | None | int] = {
         "tenant_id": str(env.initial_tenant),
         "stripe_size": None,
         "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
@@ -798,7 +795,7 @@ def test_sharding_split_stripe_size(
     # Check that we ended up with the stripe size that we expected, both on the pageserver
     # and in the notifications to compute
     assert len(notifications) == 2
-    expect_after: dict[str, Union[list[dict[str, int]], str, None, int]] = {
+    expect_after: dict[str, list[dict[str, int]] | str | None | int] = {
         "tenant_id": str(env.initial_tenant),
         "stripe_size": new_stripe_size,
         "shards": [
@@ -1046,7 +1043,7 @@ def test_sharding_ingest_gaps(
 
 
 class Failure:
-    pageserver_id: Optional[int]
+    pageserver_id: int | None
 
     def apply(self, env: NeonEnv):
         raise NotImplementedError()
@@ -1370,7 +1367,7 @@ def test_sharding_split_failures(
 
         assert attached_count == initial_shard_count
 
-    def assert_split_done(exclude_ps_id: Optional[int] = None) -> None:
+    def assert_split_done(exclude_ps_id: int | None = None) -> None:
         secondary_count = 0
         attached_count = 0
         for ps in env.pageservers:
diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py
index 402f27b384..ef9974a15d 100644
--- a/test_runner/regress/test_sni_router.py
+++ b/test_runner/regress/test_sni_router.py
@@ -4,16 +4,12 @@ import socket
 import subprocess
 from pathlib import Path
 from types import TracebackType
-from typing import TYPE_CHECKING
 
 import backoff
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import PgProtocol, VanillaPostgres
 from fixtures.port_distributor import PortDistributor
 
-if TYPE_CHECKING:
-    from typing import Optional
-
 
 def generate_tls_cert(cn, certout, keyout):
     subprocess.run(
@@ -55,7 +51,7 @@ class PgSniRouter(PgProtocol):
         self.destination = destination
         self.tls_cert = tls_cert
         self.tls_key = tls_key
-        self._popen: Optional[subprocess.Popen[bytes]] = None
+        self._popen: subprocess.Popen[bytes] | None = None
         self.test_output_dir = test_output_dir
 
     def start(self) -> PgSniRouter:
@@ -96,9 +92,9 @@ class PgSniRouter(PgProtocol):
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc: Optional[BaseException],
-        tb: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc: BaseException | None,
+        tb: TracebackType | None,
     ):
         if self._popen is not None:
             self._popen.terminate()
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 2c3d79b18a..dbddc55823 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -5,7 +5,7 @@ import json
 import threading
 import time
 from collections import defaultdict
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from enum import Enum
 from typing import TYPE_CHECKING
 
@@ -56,7 +56,7 @@ from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
 if TYPE_CHECKING:
-    from typing import Any, Optional, Union
+    from typing import Any
 
 
 def get_node_shard_counts(env: NeonEnv, tenant_ids):
@@ -593,7 +593,7 @@ def test_storage_controller_compute_hook(
 
     # Initial notification from tenant creation
     assert len(notifications) == 1
-    expect: dict[str, Union[list[dict[str, int]], str, None, int]] = {
+    expect: dict[str, list[dict[str, int]] | str | None | int] = {
         "tenant_id": str(env.initial_tenant),
         "stripe_size": None,
         "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
@@ -708,7 +708,7 @@ def test_storage_controller_stuck_compute_hook(
 
     # Initial notification from tenant creation
     assert len(notifications) == 1
-    expect: dict[str, Union[list[dict[str, int]], str, None, int]] = {
+    expect: dict[str, list[dict[str, int]] | str | None | int] = {
         "tenant_id": str(env.initial_tenant),
         "stripe_size": None,
         "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
@@ -1048,7 +1048,7 @@ def test_storage_controller_s3_time_travel_recovery(
     )
 
     time.sleep(4)
-    ts_before_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    ts_before_disaster = datetime.now(tz=UTC).replace(tzinfo=None)
     time.sleep(4)
 
     # Simulate a "disaster": delete some random files from remote storage for one of the shards
@@ -1072,7 +1072,7 @@ def test_storage_controller_s3_time_travel_recovery(
         pass
 
     time.sleep(4)
-    ts_after_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    ts_after_disaster = datetime.now(tz=UTC).replace(tzinfo=None)
     time.sleep(4)
 
     # Do time travel recovery
@@ -2274,7 +2274,7 @@ def test_storage_controller_node_deletion(
 @pytest.mark.parametrize("shard_count", [None, 2])
 def test_storage_controller_metadata_health(
     neon_env_builder: NeonEnvBuilder,
-    shard_count: Optional[int],
+    shard_count: int | None,
 ):
     """
     Create three tenants A, B, C.
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 11ad2173ae..3991bd7061 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -6,7 +6,6 @@ import shutil
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor
-from typing import TYPE_CHECKING
 
 import pytest
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
@@ -20,12 +19,9 @@ from fixtures.remote_storage import S3Storage, s3_storage
 from fixtures.utils import wait_until
 from fixtures.workload import Workload
 
-if TYPE_CHECKING:
-    from typing import Optional
-
 
 @pytest.mark.parametrize("shard_count", [None, 4])
-def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
+def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: int | None):
     """
     Test the `tenant-snapshot` subcommand, which grabs data from remote storage
 
@@ -131,7 +127,7 @@ def drop_local_state(env: NeonEnv, tenant_id: TenantId):
 
 
 @pytest.mark.parametrize("shard_count", [None, 4])
-def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
+def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: int | None):
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     neon_env_builder.num_pageservers = 2
 
@@ -179,9 +175,7 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt
 
 
 @pytest.mark.parametrize("shard_count", [None, 2])
-def test_scrubber_physical_gc_ancestors(
-    neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]
-):
+def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_count: int | None):
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     neon_env_builder.num_pageservers = 2
 
@@ -499,7 +493,7 @@ def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder):
 
 @pytest.mark.parametrize("shard_count", [None, 4])
 def test_scrubber_scan_pageserver_metadata(
-    neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]
+    neon_env_builder: NeonEnvBuilder, shard_count: int | None
 ):
     """
     Create some layers. Delete an object listed in index. Run scrubber and see if it detects the defect.
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 59c14b3263..8d7ca7bc4e 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -1,11 +1,10 @@
 from __future__ import annotations
 
 import asyncio
-import enum
 import random
 import time
+from enum import StrEnum
 from threading import Thread
-from typing import TYPE_CHECKING
 
 import asyncpg
 import pytest
@@ -28,10 +27,6 @@ from fixtures.remote_storage import (
 from fixtures.utils import query_scalar, wait_until
 from prometheus_client.samples import Sample
 
-if TYPE_CHECKING:
-    from typing import Optional
-
-
 # In tests that overlap endpoint activity with tenant attach/detach, there are
 # a variety of warnings that the page service may emit when it cannot acquire
 # an active tenant to serve a request
@@ -57,7 +52,7 @@ def do_gc_target(
         log.info("gc http thread returning")
 
 
-class ReattachMode(str, enum.Enum):
+class ReattachMode(StrEnum):
     REATTACH_EXPLICIT = "explicit"
     REATTACH_RESET = "reset"
     REATTACH_RESET_DROP = "reset_drop"
@@ -498,7 +493,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
         r".* Changing Active tenant to Broken state, reason: broken from test"
     )
 
-    def only_int(samples: list[Sample]) -> Optional[int]:
+    def only_int(samples: list[Sample]) -> int | None:
         if len(samples) == 1:
             return int(samples[0].value)
         assert len(samples) == 0
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index fc9adb14c9..bf6120aa0a 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -28,7 +28,7 @@ from fixtures.utils import (
 )
 
 if TYPE_CHECKING:
-    from typing import Any, Optional
+    from typing import Any
 
 
 def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
@@ -78,7 +78,7 @@ def populate_branch(
     tenant_id: TenantId,
     ps_http: PageserverHttpClient,
     create_table: bool,
-    expected_sum: Optional[int],
+    expected_sum: int | None,
 ) -> tuple[TimelineId, Lsn]:
     # insert some data
     with pg_cur(endpoint) as cur:
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
index 0650f12cd1..bc2e048f69 100644
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -4,7 +4,6 @@ import json
 import random
 import threading
 import time
-from typing import Optional
 
 import pytest
 import requests
@@ -661,7 +660,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):
     ],
 )
 def test_timeline_retain_lsn(
-    neon_env_builder: NeonEnvBuilder, with_intermediary: bool, offload_child: Optional[str]
+    neon_env_builder: NeonEnvBuilder, with_intermediary: bool, offload_child: str | None
 ):
     """
     Ensure that retain_lsn functionality for timelines works, both for offloaded and non-offloaded ones
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index ef0eb05612..1547ebc35d 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -5,6 +5,7 @@ import enum
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor
+from enum import StrEnum
 from queue import Empty, Queue
 from threading import Barrier
 
@@ -36,7 +37,7 @@ def layer_name(info: HistoricLayerInfo) -> str:
 
 
 @enum.unique
-class Branchpoint(str, enum.Enum):
+class Branchpoint(StrEnum):
     """
     Have branches at these Lsns possibly relative to L0 layer boundary.
     """
diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py
index c19c78e251..5a5ca3290a 100644
--- a/test_runner/regress/test_timeline_gc_blocking.py
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -3,7 +3,6 @@ from __future__ import annotations
 import time
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
 
 import pytest
 from fixtures.log_helper import log
@@ -14,9 +13,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.utils import wait_timeline_detail_404
 
-if TYPE_CHECKING:
-    from typing import Optional
-
 
 @pytest.mark.parametrize("sharded", [True, False])
 def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool):
@@ -89,7 +85,7 @@ def wait_for_another_gc_round():
 @dataclass
 class ScrollableLog:
     pageserver: NeonPageserver
-    offset: Optional[LogCursor]
+    offset: LogCursor | None
 
     def assert_log_contains(self, what: str):
         msg, offset = self.pageserver.assert_log_contains(what, offset=self.offset)
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 85c6d17142..4528bc6180 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -7,7 +7,6 @@ import time
 from collections import defaultdict
 from contextlib import closing
 from pathlib import Path
-from typing import Optional
 
 import psycopg2.errors
 import psycopg2.extras
@@ -668,7 +667,7 @@ def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder):
 class TimelinePhysicalSizeValues:
     api_current_physical: int
     prometheus_resident_physical: float
-    prometheus_remote_physical: Optional[float] = None
+    prometheus_remote_physical: float | None = None
     python_timelinedir_layerfiles_physical: int
     layer_map_file_size_sum: int
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 6eaaa3c37f..4c404cd881 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -61,7 +61,7 @@ from fixtures.utils import (
 )
 
 if TYPE_CHECKING:
-    from typing import Any, Optional
+    from typing import Any
 
 
 def wait_lsn_force_checkpoint(
@@ -189,7 +189,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
                 m.flush_lsns.append(Lsn(int(sk_m.flush_lsn_inexact(tenant_id, timeline_id))))
                 m.commit_lsns.append(Lsn(int(sk_m.commit_lsn_inexact(tenant_id, timeline_id))))
 
-            for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns):
+            for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns, strict=False):
                 # Invariant. May be < when transaction is in progress.
                 assert (
                     commit_lsn <= flush_lsn
@@ -224,7 +224,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
         def __init__(self) -> None:
             super().__init__(daemon=True)
             self.should_stop = threading.Event()
-            self.exception: Optional[BaseException] = None
+            self.exception: BaseException | None = None
 
         def run(self) -> None:
             try:
@@ -521,7 +521,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder):
     # Shut down subsequently each of safekeepers and fill a segment while sk is
     # down; ensure segment gets offloaded by others.
     offloaded_seg_end = [Lsn("0/2000000"), Lsn("0/3000000"), Lsn("0/4000000")]
-    for victim, seg_end in zip(env.safekeepers, offloaded_seg_end):
+    for victim, seg_end in zip(env.safekeepers, offloaded_seg_end, strict=False):
         victim.stop()
         # roughly fills one segment
         cur.execute("insert into t select generate_series(1,250000), 'payload'")
@@ -666,7 +666,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
 
     # recreate timeline on pageserver from scratch
     ps_http.timeline_create(
-        pg_version=PgVersion(pg_version),
+        pg_version=PgVersion(str(pg_version)),
         tenant_id=tenant_id,
         new_timeline_id=timeline_id,
     )
@@ -1177,14 +1177,14 @@ def cmp_sk_wal(sks: list[Safekeeper], tenant_id: TenantId, timeline_id: Timeline
     # report/understand if WALs are different due to that.
     statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis]
     term_flush_lsns = [(s.last_log_term, s.flush_lsn) for s in statuses]
-    for tfl, sk in zip(term_flush_lsns[1:], sks[1:]):
+    for tfl, sk in zip(term_flush_lsns[1:], sks[1:], strict=False):
         assert (
             term_flush_lsns[0] == tfl
         ), f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}"
 
     # check that WALs are identic.
     segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks]
-    for cmp_segs, sk in zip(segs[1:], sks[1:]):
+    for cmp_segs, sk in zip(segs[1:], sks[1:], strict=False):
         assert (
             segs[0] == cmp_segs
         ), f"lists of segments on sks {sks[0].id} and {sk.id} are not identic: {segs[0]} and {cmp_segs}"
@@ -1455,10 +1455,10 @@ class SafekeeperEnv:
         self.pg_bin = pg_bin
         self.num_safekeepers = num_safekeepers
         self.bin_safekeeper = str(neon_binpath / "safekeeper")
-        self.safekeepers: Optional[list[subprocess.CompletedProcess[Any]]] = None
-        self.postgres: Optional[ProposerPostgres] = None
-        self.tenant_id: Optional[TenantId] = None
-        self.timeline_id: Optional[TimelineId] = None
+        self.safekeepers: list[subprocess.CompletedProcess[Any]] | None = None
+        self.postgres: ProposerPostgres | None = None
+        self.tenant_id: TenantId | None = None
+        self.timeline_id: TimelineId | None = None
 
     def init(self) -> SafekeeperEnv:
         assert self.postgres is None, "postgres is already initialized"
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index d3e989afa8..18408b0619 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -5,7 +5,6 @@ import random
 import time
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING
 
 import asyncpg
 import pytest
@@ -16,10 +15,6 @@ from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper
 from fixtures.remote_storage import RemoteStorageKind
 from fixtures.utils import skip_in_debug_build
 
-if TYPE_CHECKING:
-    from typing import Optional
-
-
 log = getLogger("root.safekeeper_async")
 
 
@@ -261,7 +256,7 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder):
 
 
 def endpoint_create_start(
-    env: NeonEnv, branch: str, pgdir_name: Optional[str], allow_multiple: bool = False
+    env: NeonEnv, branch: str, pgdir_name: str | None, allow_multiple: bool = False
 ):
     endpoint = Endpoint(
         env,
@@ -287,7 +282,7 @@ async def exec_compute_query(
     env: NeonEnv,
     branch: str,
     query: str,
-    pgdir_name: Optional[str] = None,
+    pgdir_name: str | None = None,
     allow_multiple: bool = False,
 ):
     with endpoint_create_start(
@@ -705,7 +700,7 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat
         # invalid, to make them unavailable to the endpoint.  We use
         # ports 10, 11 and 12 to simulate unavailable safekeepers.
         config = toml.load(test_output_dir / "repo" / "config")
-        for i, (_sk, active) in enumerate(zip(env.safekeepers, active_sk)):
+        for i, (_sk, active) in enumerate(zip(env.safekeepers, active_sk, strict=False)):
             if active:
                 config["safekeepers"][i]["pg_port"] = env.safekeepers[i].port.pg
             else:

From 725a5ff00350bfae3d76c459a93b1c1c55c69f80 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 21 Nov 2024 16:46:30 +0000
Subject: [PATCH 23/51] fix(proxy): CancelKeyData display log masking (#9838)

Fixes the masking for the CancelKeyData display format. Due to negative
i32 cast to u64, the top-bits all had `0xffffffff` prefix. On the
bitwise-or that followed, these took priority.

This PR also compresses 3 logs during sql-over-http into 1 log with
durations as label fields, as prior discussed.
---
 libs/pq_proto/src/lib.rs              | 12 ++++++++++--
 proxy/src/serverless/sql_over_http.rs | 12 ++++++++----
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index b9e5387d86..6c40968496 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -184,9 +184,8 @@ pub struct CancelKeyData {
 
 impl fmt::Display for CancelKeyData {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        // TODO: this is producing strange results, with 0xffffffff........ always in the logs.
         let hi = (self.backend_pid as u64) << 32;
-        let lo = self.cancel_key as u64;
+        let lo = (self.cancel_key as u64) & 0xffffffff;
         let id = hi | lo;
 
         // This format is more compact and might work better for logs.
@@ -1047,4 +1046,13 @@ mod tests {
         let data = [0, 0, 0, 7, 0, 0, 0, 0];
         FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
     }
+
+    #[test]
+    fn cancel_key_data() {
+        let key = CancelKeyData {
+            backend_pid: -1817212860,
+            cancel_key: -1183897012,
+        };
+        assert_eq!(format!("{key}"), "CancelKeyData(93af8844b96f2a4c)");
+    }
 }
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 03b37bccd5..afd93d02f0 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -14,7 +14,7 @@ use hyper::{header, HeaderMap, Request, Response, StatusCode};
 use pq_proto::StartupMessageParamsBuilder;
 use serde::Serialize;
 use serde_json::Value;
-use tokio::time;
+use tokio::time::{self, Instant};
 use tokio_postgres::error::{DbError, ErrorPosition, SqlState};
 use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction};
 use tokio_util::sync::CancellationToken;
@@ -980,10 +980,11 @@ async fn query_to_json<T: GenericClient>(
     current_size: &mut usize,
     parsed_headers: HttpHeaders,
 ) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> {
-    info!("executing query");
+    let query_start = Instant::now();
+
     let query_params = data.params;
     let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?);
-    info!("finished executing query");
+    let query_acknowledged = Instant::now();
 
     // Manually drain the stream into a vector to leave row_stream hanging
     // around to get a command tag. Also check that the response is not too
@@ -1002,6 +1003,7 @@ async fn query_to_json<T: GenericClient>(
         }
     }
 
+    let query_resp_end = Instant::now();
     let ready = row_stream.ready_status();
 
     // grab the command tag and number of rows affected
@@ -1021,7 +1023,9 @@ async fn query_to_json<T: GenericClient>(
         rows = rows.len(),
         ?ready,
         command_tag,
-        "finished reading rows"
+        acknowledgement = ?(query_acknowledged - query_start),
+        response = ?(query_resp_end - query_start),
+        "finished executing query"
     );
 
     let columns_len = row_stream.columns().len();

From 190e8cebac246e54c081b14382c92499e9e6e52d Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Thu, 21 Nov 2024 19:59:46 +0100
Subject: [PATCH 24/51] safekeeper,pageserver: add CPU profiling (#9764)

## Problem

We don't have a convenient way to gather CPU profiles from a running
binary, e.g. during production incidents or end-to-end benchmarks, nor
during microbenchmarks (particularly on macOS).

We would also like to have continuous profiling in production, likely
using [Grafana Cloud
Profiles](https://grafana.com/products/cloud/profiles-for-continuous-profiling/).
We may choose to use either eBPF profiles or pprof profiles for this
(pending testing and discussion with SREs), but pprof profiles appear
useful regardless for the reasons listed above. See
https://github.com/neondatabase/cloud/issues/14888.

This PR is intended as a proof of concept, to try it out in staging and
drive further discussions about profiling more broadly.

Touches #9534.
Touches https://github.com/neondatabase/cloud/issues/14888.

## Summary of changes

Adds a HTTP route `/profile/cpu` that takes a CPU profile and returns
it. Defaults to a 5-second pprof Protobuf profile for use with e.g.
`pprof` or Grafana Alloy, but can also emit an SVG flamegraph. Query
parameters:

* `format`: output format (`pprof` or `svg`)
* `frequency`: sampling frequency in microseconds (default 100)
* `seconds`: number of seconds to profile (default 5)

Also integrates pprof profiles into Criterion benchmarks, such that
flamegraph reports can be taken with `cargo bench ... --profile-duration
<seconds>`. Output under `target/criterion/*/profile/flamegraph.svg`.

Example profiles:

* pprof profile (use [`pprof`](https://github.com/google/pprof)):
[profile.pb.gz](https://github.com/user-attachments/files/17756788/profile.pb.gz)
  * Web interface: `pprof -http :6060 profile.pb.gz`
* Interactive flamegraph:
[profile.svg.gz](https://github.com/user-attachments/files/17756782/profile.svg.gz)
---
 Cargo.lock                        | 201 +++++++++++++++++++++++++++++-
 Cargo.toml                        |   1 +
 libs/utils/Cargo.toml             |   1 +
 libs/utils/src/http/endpoint.rs   |  87 ++++++++++++-
 libs/utils/src/http/request.rs    |   2 +-
 pageserver/src/http/routes.rs     |  16 ++-
 safekeeper/Cargo.toml             |   1 +
 safekeeper/benches/README.md      |   4 +
 safekeeper/benches/receive_wal.rs |   6 +-
 safekeeper/src/http/routes.rs     |   7 +-
 workspace_hack/Cargo.toml         |   1 +
 11 files changed, 313 insertions(+), 14 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c7af140f7d..b1232a6b6a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -46,6 +46,15 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "aligned-vec"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e0966165eaf052580bd70eb1b32cb3d6245774c0104d1b2793e9650bf83b52a"
+dependencies = [
+ "equator",
+]
+
 [[package]]
 name = "allocator-api2"
 version = "0.2.16"
@@ -146,6 +155,12 @@ dependencies = [
  "static_assertions",
 ]
 
+[[package]]
+name = "arrayvec"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
+
 [[package]]
 name = "asn1-rs"
 version = "0.6.2"
@@ -742,7 +757,7 @@ dependencies = [
  "once_cell",
  "paste",
  "pin-project",
- "quick-xml",
+ "quick-xml 0.31.0",
  "rand 0.8.5",
  "reqwest 0.11.19",
  "rustc_version",
@@ -1381,6 +1396,15 @@ version = "0.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
 
+[[package]]
+name = "cpp_demangle"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96e58d342ad113c2b878f16d5d034c03be492ae460cdbc02b7f0f2284d310c7d"
+dependencies = [
+ "cfg-if",
+]
+
 [[package]]
 name = "cpufeatures"
 version = "0.2.9"
@@ -1904,6 +1928,26 @@ dependencies = [
  "termcolor",
 ]
 
+[[package]]
+name = "equator"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c35da53b5a021d2484a7cc49b2ac7f2d840f8236a286f84202369bd338d761ea"
+dependencies = [
+ "equator-macro",
+]
+
+[[package]]
+name = "equator-macro"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.52",
+]
+
 [[package]]
 name = "equivalent"
 version = "1.0.1"
@@ -2011,6 +2055,18 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "findshlibs"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64"
+dependencies = [
+ "cc",
+ "lazy_static",
+ "libc",
+ "winapi",
+]
+
 [[package]]
 name = "fixedbitset"
 version = "0.4.2"
@@ -2714,6 +2770,24 @@ version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac"
 
+[[package]]
+name = "inferno"
+version = "0.11.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
+dependencies = [
+ "ahash",
+ "indexmap 2.0.1",
+ "is-terminal",
+ "itoa",
+ "log",
+ "num-format",
+ "once_cell",
+ "quick-xml 0.26.0",
+ "rgb",
+ "str_stack",
+]
+
 [[package]]
 name = "inotify"
 version = "0.9.6"
@@ -3053,6 +3127,15 @@ version = "2.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
 
+[[package]]
+name = "memmap2"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "memoffset"
 version = "0.7.1"
@@ -3278,6 +3361,16 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
 
+[[package]]
+name = "num-format"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3"
+dependencies = [
+ "arrayvec",
+ "itoa",
+]
+
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -4108,6 +4201,31 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
 
+[[package]]
+name = "pprof"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebbe2f8898beba44815fdc9e5a4ae9c929e21c5dc29b0c774a15555f7f58d6d0"
+dependencies = [
+ "aligned-vec",
+ "backtrace",
+ "cfg-if",
+ "criterion",
+ "findshlibs",
+ "inferno",
+ "libc",
+ "log",
+ "nix 0.26.4",
+ "once_cell",
+ "parking_lot 0.12.1",
+ "protobuf",
+ "protobuf-codegen-pure",
+ "smallvec",
+ "symbolic-demangle",
+ "tempfile",
+ "thiserror",
+]
+
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"
@@ -4260,6 +4378,31 @@ dependencies = [
  "prost",
 ]
 
+[[package]]
+name = "protobuf"
+version = "2.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
+
+[[package]]
+name = "protobuf-codegen"
+version = "2.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "033460afb75cf755fcfc16dfaed20b86468082a2ea24e05ac35ab4a099a017d6"
+dependencies = [
+ "protobuf",
+]
+
+[[package]]
+name = "protobuf-codegen-pure"
+version = "2.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95a29399fc94bcd3eeaa951c715f7bea69409b2445356b00519740bcd6ddd865"
+dependencies = [
+ "protobuf",
+ "protobuf-codegen",
+]
+
 [[package]]
 name = "proxy"
 version = "0.1.0"
@@ -4371,6 +4514,15 @@ dependencies = [
  "zerocopy",
 ]
 
+[[package]]
+name = "quick-xml"
+version = "0.26.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f50b1c63b38611e7d4d7f68b82d3ad0cc71a2ad2e7f61fc10f1328d917c93cd"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "quick-xml"
 version = "0.31.0"
@@ -4853,6 +5005,15 @@ dependencies = [
  "subtle",
 ]
 
+[[package]]
+name = "rgb"
+version = "0.8.50"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a"
+dependencies = [
+ "bytemuck",
+]
+
 [[package]]
 name = "ring"
 version = "0.17.6"
@@ -5166,6 +5327,7 @@ dependencies = [
  "postgres-protocol",
  "postgres_backend",
  "postgres_ffi",
+ "pprof",
  "pq_proto",
  "rand 0.8.5",
  "regex",
@@ -5712,6 +5874,12 @@ dependencies = [
  "der 0.7.8",
 ]
 
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
+
 [[package]]
 name = "static_assertions"
 version = "1.1.0"
@@ -5858,6 +6026,12 @@ dependencies = [
  "workspace_hack",
 ]
 
+[[package]]
+name = "str_stack"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb"
+
 [[package]]
 name = "stringprep"
 version = "0.1.2"
@@ -5905,6 +6079,29 @@ version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca"
 
+[[package]]
+name = "symbolic-common"
+version = "12.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "366f1b4c6baf6cfefc234bbd4899535fca0b06c74443039a73f6dfb2fad88d77"
+dependencies = [
+ "debugid",
+ "memmap2",
+ "stable_deref_trait",
+ "uuid",
+]
+
+[[package]]
+name = "symbolic-demangle"
+version = "12.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aba05ba5b9962ea5617baf556293720a8b2d0a282aa14ee4bf10e22efc7da8c8"
+dependencies = [
+ "cpp_demangle",
+ "rustc-demangle",
+ "symbolic-common",
+]
+
 [[package]]
 name = "syn"
 version = "1.0.109"
@@ -6772,6 +6969,7 @@ dependencies = [
  "once_cell",
  "pin-project-lite",
  "postgres_connection",
+ "pprof",
  "pq_proto",
  "rand 0.8.5",
  "regex",
@@ -7340,6 +7538,7 @@ dependencies = [
  "libc",
  "log",
  "memchr",
+ "nix 0.26.4",
  "nom",
  "num-bigint",
  "num-integer",
diff --git a/Cargo.toml b/Cargo.toml
index dbda930535..c6b4b62042 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -130,6 +130,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
+pprof = { version = "0.14", features = ["criterion", "flamegraph", "protobuf", "protobuf-codec"] }
 procfs = "0.16"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.13"
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 545317f958..4aad0aee2c 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -29,6 +29,7 @@ jsonwebtoken.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
+pprof.workspace = true
 regex.workspace = true
 routerify.workspace = true
 serde.workspace = true
diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs
index 8ee5abd434..6a85f0ddeb 100644
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,7 +1,8 @@
 use crate::auth::{AuthError, Claims, SwappableJwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
-use anyhow::Context;
-use hyper::header::{HeaderName, AUTHORIZATION};
+use crate::http::request::{get_query_param, parse_query_param};
+use anyhow::{anyhow, Context};
+use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION};
 use hyper::http::HeaderValue;
 use hyper::Method;
 use hyper::{header::CONTENT_TYPE, Body, Request, Response};
@@ -12,11 +13,13 @@ use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
 use tracing::{debug, info, info_span, warn, Instrument};
 
 use std::future::Future;
+use std::io::Write as _;
 use std::str::FromStr;
+use std::time::Duration;
 
 use bytes::{Bytes, BytesMut};
-use std::io::Write as _;
-use tokio::sync::mpsc;
+use pprof::protos::Message as _;
+use tokio::sync::{mpsc, Mutex};
 use tokio_stream::wrappers::ReceiverStream;
 
 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -328,6 +331,82 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
     Ok(response)
 }
 
+/// Generates CPU profiles.
+pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    enum Format {
+        Pprof,
+        Svg,
+    }
+
+    // Parameters.
+    let format = match get_query_param(&req, "format")?.as_deref() {
+        None => Format::Pprof,
+        Some("pprof") => Format::Pprof,
+        Some("svg") => Format::Svg,
+        Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
+    };
+    let seconds = match parse_query_param(&req, "seconds")? {
+        None => 5,
+        Some(seconds @ 1..=30) => seconds,
+        Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))),
+    };
+    let frequency_hz = match parse_query_param(&req, "frequency")? {
+        None => 99,
+        Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))),
+        Some(frequency) => frequency,
+    };
+
+    // Only allow one profiler at a time.
+    static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
+    let _lock = PROFILE_LOCK
+        .try_lock()
+        .map_err(|_| ApiError::Conflict("profiler already running".into()))?;
+
+    // Take the profile.
+    let report = tokio::task::spawn_blocking(move || {
+        let guard = pprof::ProfilerGuardBuilder::default()
+            .frequency(frequency_hz)
+            .blocklist(&["libc", "libgcc", "pthread", "vdso"])
+            .build()?;
+        std::thread::sleep(Duration::from_secs(seconds));
+        guard.report().build()
+    })
+    .await
+    .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
+    .map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?;
+
+    // Return the report in the requested format.
+    match format {
+        Format::Pprof => {
+            let mut body = Vec::new();
+            report
+                .pprof()
+                .map_err(|err| ApiError::InternalServerError(err.into()))?
+                .write_to_vec(&mut body)
+                .map_err(|err| ApiError::InternalServerError(err.into()))?;
+
+            Response::builder()
+                .status(200)
+                .header(CONTENT_TYPE, "application/octet-stream")
+                .header(CONTENT_DISPOSITION, "attachment; filename=\"profile.pb\"")
+                .body(Body::from(body))
+                .map_err(|err| ApiError::InternalServerError(err.into()))
+        }
+
+        Format::Svg => {
+            let mut body = Vec::new();
+            report
+                .flamegraph(&mut body)
+                .map_err(|err| ApiError::InternalServerError(err.into()))?;
+            Response::builder()
+                .status(200)
+                .header(CONTENT_TYPE, "image/svg+xml")
+                .body(Body::from(body))
+                .map_err(|err| ApiError::InternalServerError(err.into()))
+        }
+    }
+}
+
 pub fn add_request_id_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
 ) -> Middleware<B, ApiError> {
     Middleware::pre(move |req| async move {
diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs
index 8b8ed5a67f..7ea71685ec 100644
--- a/libs/utils/src/http/request.rs
+++ b/libs/utils/src/http/request.rs
@@ -30,7 +30,7 @@ pub fn parse_request_param<T: FromStr>(
     }
 }
 
-fn get_query_param<'a>(
+pub fn get_query_param<'a>(
     request: &'a Request<Body>,
     param_name: &str,
 ) -> Result<Option<Cow<'a, str>>, ApiError> {
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 306b0f35ab..9bd1929b0b 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -55,6 +55,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
 use utils::failpoint_support::failpoints_handler;
+use utils::http::endpoint::profile_cpu_handler;
 use utils::http::endpoint::prometheus_metrics_handler;
 use utils::http::endpoint::request_span;
 use utils::http::request::must_parse_query_param;
@@ -146,10 +147,16 @@ impl State {
         deletion_queue_client: DeletionQueueClient,
         secondary_controller: SecondaryController,
     ) -> anyhow::Result<Self> {
-        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
-            .iter()
-            .map(|v| v.parse().unwrap())
-            .collect::<Vec<_>>();
+        let allowlist_routes = [
+            "/v1/status",
+            "/v1/doc",
+            "/swagger.yml",
+            "/metrics",
+            "/profile/cpu",
+        ]
+        .iter()
+        .map(|v| v.parse().unwrap())
+        .collect::<Vec<_>>();
         Ok(Self {
             conf,
             tenant_manager,
@@ -3167,6 +3174,7 @@ pub fn make_router(
     Ok(router
         .data(state)
         .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
         .get("/v1/status", |r| api_handler(r, status_handler))
         .put("/v1/failpoints", |r| {
             testing_api_handler("manage failpoints", r, failpoints_handler)
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 85561e4aff..ab77b63d54 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -30,6 +30,7 @@ once_cell.workspace = true
 parking_lot.workspace = true
 postgres.workspace = true
 postgres-protocol.workspace = true
+pprof.workspace = true
 rand.workspace = true
 regex.workspace = true
 scopeguard.workspace = true
diff --git a/safekeeper/benches/README.md b/safekeeper/benches/README.md
index 4119cc8d6e..d73fbccf05 100644
--- a/safekeeper/benches/README.md
+++ b/safekeeper/benches/README.md
@@ -14,6 +14,10 @@ cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false
 
 # List available benchmarks.
 cargo bench --package safekeeper --benches -- --list
+
+# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds.
+# Output in target/criterion/*/profile/flamegraph.svg.
+cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false --profile-time 10
 ```
 
 Additional charts and statistics are available in `target/criterion/report/index.html`.
diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs
index e32d7526ca..c637b4fb24 100644
--- a/safekeeper/benches/receive_wal.rs
+++ b/safekeeper/benches/receive_wal.rs
@@ -10,6 +10,7 @@ use camino_tempfile::tempfile;
 use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion};
 use itertools::Itertools as _;
 use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator};
+use pprof::criterion::{Output, PProfProfiler};
 use safekeeper::receive_wal::{self, WalAcceptor};
 use safekeeper::safekeeper::{
     AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage,
@@ -24,8 +25,9 @@ const GB: usize = 1024 * MB;
 
 // Register benchmarks with Criterion.
 criterion_group!(
-    benches,
-    bench_process_msg,
+    name = benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = bench_process_msg,
     bench_wal_acceptor,
     bench_wal_acceptor_throughput,
     bench_file_write
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index df68f8a68e..9a5a1c58b6 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -14,7 +14,9 @@ use tokio_stream::wrappers::ReceiverStream;
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, Instrument};
 use utils::failpoint_support::failpoints_handler;
-use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter};
+use utils::http::endpoint::{
+    profile_cpu_handler, prometheus_metrics_handler, request_span, ChannelWriter,
+};
 use utils::http::request::parse_query_param;
 
 use postgres_ffi::WAL_SEGMENT_SIZE;
@@ -574,7 +576,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
         router = router.middleware(auth_middleware(|request| {
             #[allow(clippy::mutable_key_type)]
             static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> = Lazy::new(|| {
-                ["/v1/status", "/metrics"]
+                ["/v1/status", "/metrics", "/pprof/profile"]
                     .iter()
                     .map(|v| v.parse().unwrap())
                     .collect()
@@ -598,6 +600,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
         .data(Arc::new(conf))
         .data(auth)
         .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
         .get("/v1/status", |r| request_span(r, status_handler))
         .put("/v1/failpoints", |r| {
             request_span(r, move |r| async {
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 53d3a7364b..667d54df02 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -52,6 +52,7 @@ lazy_static = { version = "1", default-features = false, features = ["spin_no_st
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
+nix = { version = "0.26" }
 nom = { version = "7" }
 num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }

From 37962e729e596bf6cb48bf8743f0a3b1372d600f Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Thu, 21 Nov 2024 14:19:02 -0600
Subject: [PATCH 25/51] Fix panic in compute_ctl metrics collection (#9831)

Calling unwrap on the encoder is a little overzealous. One of the errors
that can be returned by the encode function in particular is the
non-existence of metrics for a metric family, so we should prematurely
filter instances like that out. I believe that the cause of this panic
was caused by a race condition between the prometheus collector and the
compute collecting the installed extensions metric for the first time.
The HTTP server is spawned on a separate thread before we even start
bringing up Postgres.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute_tools/src/http/api.rs | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index 3677582c11..8a047634df 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -20,6 +20,7 @@ use anyhow::Result;
 use hyper::header::CONTENT_TYPE;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
+use metrics::proto::MetricFamily;
 use metrics::Encoder;
 use metrics::TextEncoder;
 use tokio::task;
@@ -72,10 +73,22 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
         (&Method::GET, "/metrics") => {
             debug!("serving /metrics GET request");
 
-            let mut buffer = vec![];
-            let metrics = installed_extensions::collect();
+            // When we call TextEncoder::encode() below, it will immediately
+            // return an error if a metric family has no metrics, so we need to
+            // preemptively filter out metric families with no metrics.
+            let metrics = installed_extensions::collect()
+                .into_iter()
+                .filter(|m| !m.get_metric().is_empty())
+                .collect::<Vec<MetricFamily>>();
+
             let encoder = TextEncoder::new();
-            encoder.encode(&metrics, &mut buffer).unwrap();
+            let mut buffer = vec![];
+
+            if let Err(err) = encoder.encode(&metrics, &mut buffer) {
+                let msg = format!("error handling /metrics request: {err}");
+                error!(msg);
+                return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR);
+            }
 
             match Response::builder()
                 .status(StatusCode::OK)

From 1e05e3a6e2975a9cdc7099b45d946e70216cf2b8 Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Fri, 22 Nov 2024 09:31:54 +0100
Subject: [PATCH 26/51] minor PostgreSQL update in benchmarking (#9845)

## Problem

in benchmarking.yml job pgvector we install postgres from deb packages.
After the minor postgres update the referenced packages no longer exist

[Failing job:
](https://github.com/neondatabase/neon/actions/runs/11965785323/job/33360391115#step:4:41)

## Summary of changes

Reference and install the updated packages.

[Successful job after this
fix](https://github.com/neondatabase/neon/actions/runs/11967959920/job/33366011934#step:4:45)
---
 .github/workflows/benchmarking.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index acea859b4d..2ad1ee0a42 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -558,12 +558,12 @@ jobs:
         arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')
 
         cd /home/nonroot
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.1-1.pgdg110+1_${arch}.deb"
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.5-1.pgdg110+1_${arch}.deb"
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.5-1.pgdg110+1_${arch}.deb"
-        dpkg -x libpq5_17.1-1.pgdg110+1_${arch}.deb pg
-        dpkg -x postgresql-16_16.5-1.pgdg110+1_${arch}.deb pg
-        dpkg -x postgresql-client-16_16.5-1.pgdg110+1_${arch}.deb pg
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg110+1_${arch}.deb"
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg110+1_${arch}.deb"
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg110+1_${arch}.deb"
+        dpkg -x libpq5_17.2-1.pgdg110+1_${arch}.deb pg
+        dpkg -x postgresql-16_16.6-1.pgdg110+1_${arch}.deb pg
+        dpkg -x postgresql-client-16_16.6-1.pgdg110+1_${arch}.deb pg
 
         mkdir -p /tmp/neon/pg_install/v16/bin
         ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench

From 83b73fc24e9d1113479477f5e96a29ff1276f656 Mon Sep 17 00:00:00 2001
From: Fedor Dikarev <fedor@neon.tech>
Date: Fri, 22 Nov 2024 10:06:00 +0100
Subject: [PATCH 27/51] Batch scrape workflows up to last 30 days and stop
 ad-hoc (#9846)

Comparing Batch and Ad-hoc collectors there is no big difference, just
we need scrape for longer duration to catch retries.
Dashboard with comparison:

https://neonprod.grafana.net/d/be3pjm7c9ne2oe/compare-ad-hoc-and-batch?orgId=1&from=1731345095814&to=1731946295814

I should anyway raise support case with Github relating to that,
meanwhile that should be working solution and should save us some cost,
so it worths to switch to Batch now.

Ref: https://github.com/neondatabase/cloud/issues/17503
---
 .../workflows/report-workflow-stats-batch.yml | 38 +++++++++++++----
 .github/workflows/report-workflow-stats.yml   | 41 -------------------
 2 files changed, 31 insertions(+), 48 deletions(-)
 delete mode 100644 .github/workflows/report-workflow-stats.yml

diff --git a/.github/workflows/report-workflow-stats-batch.yml b/.github/workflows/report-workflow-stats-batch.yml
index 98e394a3c2..2ed044b780 100644
--- a/.github/workflows/report-workflow-stats-batch.yml
+++ b/.github/workflows/report-workflow-stats-batch.yml
@@ -4,10 +4,12 @@ on:
   schedule:
     - cron: '*/15 * * * *'
     - cron: '25 0 * * *'
+    - cron: '25 1 * * 6'
 
 jobs:
-  gh-workflow-stats-batch:
-    name: GitHub Workflow Stats Batch
+  gh-workflow-stats-batch-2h:
+    name: GitHub Workflow Stats Batch 2 hours
+    if: github.event.schedule == '*/15 * * * *'
     runs-on: ubuntu-22.04
     permissions:
       actions: read
@@ -16,14 +18,36 @@ jobs:
       uses: neondatabase/gh-workflow-stats-action@v0.2.1
       with:
         db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
-        db_table: "gh_workflow_stats_batch_neon"
+        db_table: "gh_workflow_stats_neon"
         gh_token: ${{ secrets.GITHUB_TOKEN }}
         duration: '2h'
-    - name: Export Workflow Run for the past 24 hours
-      if: github.event.schedule == '25 0 * * *'
+
+  gh-workflow-stats-batch-48h:
+    name: GitHub Workflow Stats Batch 48 hours
+    if: github.event.schedule == '25 0 * * *'
+    runs-on: ubuntu-22.04
+    permissions:
+      actions: read
+    steps:
+    - name: Export Workflow Run for the past 48 hours
       uses: neondatabase/gh-workflow-stats-action@v0.2.1
       with:
         db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
-        db_table: "gh_workflow_stats_batch_neon"
+        db_table: "gh_workflow_stats_neon"
         gh_token: ${{ secrets.GITHUB_TOKEN }}
-        duration: '24h'
+        duration: '48h'
+
+  gh-workflow-stats-batch-30d:
+    name: GitHub Workflow Stats Batch 30 days
+    if: github.event.schedule == '25 1 * * 6'
+    runs-on: ubuntu-22.04
+    permissions:
+      actions: read
+    steps:
+    - name: Export Workflow Run for the past 30 days
+      uses: neondatabase/gh-workflow-stats-action@v0.2.1
+      with:
+        db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
+        db_table: "gh_workflow_stats_neon"
+        gh_token: ${{ secrets.GITHUB_TOKEN }}
+        duration: '720h'
diff --git a/.github/workflows/report-workflow-stats.yml b/.github/workflows/report-workflow-stats.yml
deleted file mode 100644
index 15e446bcd7..0000000000
--- a/.github/workflows/report-workflow-stats.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-name: Report Workflow Stats
-
-on:
-  workflow_run:
-    workflows:
-    - Add `external` label to issues and PRs created by external users
-    - Benchmarking
-    - Build and Test
-    - Build and Test Locally
-    - Build build-tools image
-    - Check Permissions
-    - Check neon with extra platform builds
-    - Cloud Regression Test
-    - Create Release Branch
-    - Handle `approved-for-ci-run` label
-    - Lint GitHub Workflows
-    - Notify Slack channel about upcoming release
-    - Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
-    - Pin build-tools image
-    - Prepare benchmarking databases by restoring dumps
-    - Push images to ACR
-    - Test Postgres client libraries
-    - Trigger E2E Tests
-    - cleanup caches by a branch
-    - Pre-merge checks
-    types: [completed]
-
-jobs:
-  gh-workflow-stats:
-    name: Github Workflow Stats
-    runs-on: ubuntu-22.04
-    permissions:
-      actions: read
-    steps:
-    - name: Export GH Workflow Stats
-      uses: neondatabase/gh-workflow-stats-action@v0.1.4
-      with:
-        DB_URI: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
-        DB_TABLE: "gh_workflow_stats_neon"
-        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        GH_RUN_ID: ${{ github.event.workflow_run.id }}

From d9de65ee8f5cdaf50bcaa0070dc47e95670aa223 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 22 Nov 2024 09:24:23 +0000
Subject: [PATCH 28/51] pageserver: permit reads behind GC cutoff during LSN
 grace period (#9833)

## Problem

In https://github.com/neondatabase/neon/issues/9754 and the flakiness of
`test_readonly_node_gc`, we saw that although our logic for controlling
GC was sound, the validation of getpage requests was not, because it
could not consider LSN leases when requests arrived shortly after
restart.

Closes https://github.com/neondatabase/neon/issues/9754

## Summary of changes

This is the "Option 3" discussed verbally -- rather than holding back gc
cutoff, we waive the usual validation of request LSN if we are still
waiting for leases to be sent after startup

- When validating LSN in `wait_or_get_last_lsn`, skip the validation
relative to GC cutoff if the timeline is still in its LSN lease grace
period
- Re-enable test_readonly_node_gc
---
 pageserver/src/page_service.rs            | 23 ++++++++++++++---------
 pageserver/src/tenant/timeline.rs         |  5 +++++
 test_runner/regress/test_readonly_node.py |  1 -
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index a429dff1fd..5fd02d8749 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1068,21 +1068,26 @@ impl PageServerHandler {
             ));
         }
 
-        if request_lsn < **latest_gc_cutoff_lsn {
+        // Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus
+        if request_lsn == Lsn::INVALID {
+            return Err(PageStreamError::BadRequest(
+                "invalid LSN(0) in request".into(),
+            ));
+        }
+
+        // Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease.
+        //
+        // We may have older data available, but we make a best effort to detect this case and return an error,
+        // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN).
+        if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() {
             let gc_info = &timeline.gc_info.read().unwrap();
             if !gc_info.leases.contains_key(&request_lsn) {
-                // The requested LSN is below gc cutoff and is not guarded by a lease.
-
-                // Check explicitly for INVALID just to get a less scary error message if the
-                // request is obviously bogus
-                return Err(if request_lsn == Lsn::INVALID {
-                    PageStreamError::BadRequest("invalid LSN(0) in request".into())
-                } else {
+                return Err(
                     PageStreamError::BadRequest(format!(
                         "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
                         request_lsn, **latest_gc_cutoff_lsn
                     ).into())
-                });
+                );
             }
         }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 95864af4d0..0c7f3204f6 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2085,6 +2085,11 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts)
     }
 
+    pub(crate) fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf.is_gc_blocked_by_lsn_lease_deadline()
+    }
+
     pub(crate) fn get_lazy_slru_download(&self) -> bool {
         let tenant_conf = self.tenant_conf.load();
         tenant_conf
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index 2b274003a3..fcebf8d23a 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -121,7 +121,6 @@ def test_readonly_node(neon_simple_env: NeonEnv):
         )
 
 
-@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/9754")
 def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
     """
     Test static endpoint is protected from GC by acquiring and renewing lsn leases.

From 7372312a73067b1cd8c4945a22ea838a49d8d5d4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 22 Nov 2024 15:29:49 +0200
Subject: [PATCH 29/51] Avoid unnecessary send_replace calls in seqwait (#9852)

The notifications need to be sent whenever the waiters heap changes, per
the comment in `update_status`. But if 'advance' is called when there
are no waiters, or the new LSN is lower than the waiters so that no one
needs to be woken up, there's no need to send notifications. This saves
some CPU cycles in the common case that there are no waiters.
---
 libs/utils/src/seqwait.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs
index 375b227b99..d99dc25769 100644
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -83,7 +83,9 @@ where
             }
             wake_these.push(self.heap.pop().unwrap().wake_channel);
         }
-        self.update_status();
+        if !wake_these.is_empty() {
+            self.update_status();
+        }
         wake_these
     }
 

From c10b7f7de9d2ccf41a8be2886ca1c00b828a3fd7 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Fri, 22 Nov 2024 07:37:06 -0600
Subject: [PATCH 30/51] Write a newline after adding dynamic_shared_memory_type
 to PG conf (#9843)

Without adding a newline, we can end up with a conf line that looks like
the following:

dynamic_shared_memory_type = mmap# Managed by compute_ctl: begin

This leads to Postgres logging:

LOG: configuration file
"/var/db/postgres/compute/pgdata/postgresql.conf" contains errors;
unaffected changes were applied

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute_tools/src/config.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index d4e413034e..d65fe73194 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -116,7 +116,7 @@ pub fn write_postgres_conf(
                 vartype: "enum".to_owned(),
             };
 
-            write!(file, "{}", opt.to_pg_setting())?;
+            writeln!(file, "{}", opt.to_pg_setting())?;
         }
     }
 

From 51d26a261bea9318a6eba43d53afff965c6f12a6 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 22 Nov 2024 14:31:36 +0000
Subject: [PATCH 31/51] build(deps): bump mypy from 1.3.0 to 1.13.0 (#9670)

## Problem
We use a pretty old version of `mypy` 1.3 (released 1.5 years ago), it
produces false positives for `typing.Self`.

## Summary of changes
- Bump `mypy` from 1.3 to 1.13
- Fix new warnings and errors
- Use `typing.Self` whenever we `return self`
---
 poetry.lock                                   | 69 ++++++++++---------
 pyproject.toml                                |  2 +-
 scripts/force_layer_download.py               |  6 +-
 test_runner/fixtures/common_types.py          |  6 +-
 test_runner/fixtures/compute_reconfigure.py   |  2 +-
 test_runner/fixtures/neon_cli.py              |  3 -
 test_runner/fixtures/neon_fixtures.py         | 67 +++++++++---------
 test_runner/fixtures/parametrize.py           |  1 +
 test_runner/fixtures/workload.py              |  2 +-
 test_runner/regress/test_compute_metrics.py   |  7 +-
 test_runner/regress/test_ddl_forwarding.py    |  4 +-
 .../regress/test_pageserver_layer_rolling.py  | 12 ++--
 .../regress/test_pageserver_secondary.py      |  2 +-
 test_runner/regress/test_sharding.py          |  2 +-
 test_runner/regress/test_sni_router.py        | 10 ++-
 .../regress/test_storage_controller.py        |  4 +-
 test_runner/regress/test_wal_acceptor.py      |  6 +-
 17 files changed, 103 insertions(+), 102 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index e38fc15eb7..e2fca7be47 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1858,47 +1858,54 @@ files = [
 
 [[package]]
 name = "mypy"
-version = "1.3.0"
+version = "1.13.0"
 description = "Optional static typing for Python"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "mypy-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c1eb485cea53f4f5284e5baf92902cd0088b24984f4209e25981cc359d64448d"},
-    {file = "mypy-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4c99c3ecf223cf2952638da9cd82793d8f3c0c5fa8b6ae2b2d9ed1e1ff51ba85"},
-    {file = "mypy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:550a8b3a19bb6589679a7c3c31f64312e7ff482a816c96e0cecec9ad3a7564dd"},
-    {file = "mypy-1.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cbc07246253b9e3d7d74c9ff948cd0fd7a71afcc2b77c7f0a59c26e9395cb152"},
-    {file = "mypy-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:a22435632710a4fcf8acf86cbd0d69f68ac389a3892cb23fbad176d1cddaf228"},
-    {file = "mypy-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6e33bb8b2613614a33dff70565f4c803f889ebd2f859466e42b46e1df76018dd"},
-    {file = "mypy-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7d23370d2a6b7a71dc65d1266f9a34e4cde9e8e21511322415db4b26f46f6b8c"},
-    {file = "mypy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:658fe7b674769a0770d4b26cb4d6f005e88a442fe82446f020be8e5f5efb2fae"},
-    {file = "mypy-1.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6e42d29e324cdda61daaec2336c42512e59c7c375340bd202efa1fe0f7b8f8ca"},
-    {file = "mypy-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:d0b6c62206e04061e27009481cb0ec966f7d6172b5b936f3ead3d74f29fe3dcf"},
-    {file = "mypy-1.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:76ec771e2342f1b558c36d49900dfe81d140361dd0d2df6cd71b3db1be155409"},
-    {file = "mypy-1.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc95f8386314272bbc817026f8ce8f4f0d2ef7ae44f947c4664efac9adec929"},
-    {file = "mypy-1.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:faff86aa10c1aa4a10e1a301de160f3d8fc8703b88c7e98de46b531ff1276a9a"},
-    {file = "mypy-1.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:8c5979d0deb27e0f4479bee18ea0f83732a893e81b78e62e2dda3e7e518c92ee"},
-    {file = "mypy-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c5d2cc54175bab47011b09688b418db71403aefad07cbcd62d44010543fc143f"},
-    {file = "mypy-1.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:87df44954c31d86df96c8bd6e80dfcd773473e877ac6176a8e29898bfb3501cb"},
-    {file = "mypy-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:473117e310febe632ddf10e745a355714e771ffe534f06db40702775056614c4"},
-    {file = "mypy-1.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:74bc9b6e0e79808bf8678d7678b2ae3736ea72d56eede3820bd3849823e7f305"},
-    {file = "mypy-1.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:44797d031a41516fcf5cbfa652265bb994e53e51994c1bd649ffcd0c3a7eccbf"},
-    {file = "mypy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ddae0f39ca146972ff6bb4399f3b2943884a774b8771ea0a8f50e971f5ea5ba8"},
-    {file = "mypy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1c4c42c60a8103ead4c1c060ac3cdd3ff01e18fddce6f1016e08939647a0e703"},
-    {file = "mypy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e86c2c6852f62f8f2b24cb7a613ebe8e0c7dc1402c61d36a609174f63e0ff017"},
-    {file = "mypy-1.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f9dca1e257d4cc129517779226753dbefb4f2266c4eaad610fc15c6a7e14283e"},
-    {file = "mypy-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:95d8d31a7713510685b05fbb18d6ac287a56c8f6554d88c19e73f724a445448a"},
-    {file = "mypy-1.3.0-py3-none-any.whl", hash = "sha256:a8763e72d5d9574d45ce5881962bc8e9046bf7b375b0abf031f3e6811732a897"},
-    {file = "mypy-1.3.0.tar.gz", hash = "sha256:e1f4d16e296f5135624b34e8fb741eb0eadedca90862405b1f1fde2040b9bd11"},
+    {file = "mypy-1.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a"},
+    {file = "mypy-1.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80"},
+    {file = "mypy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b2353a44d2179846a096e25691d54d59904559f4232519d420d64da6828a3a7"},
+    {file = "mypy-1.13.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0730d1c6a2739d4511dc4253f8274cdd140c55c32dfb0a4cf8b7a43f40abfa6f"},
+    {file = "mypy-1.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:c5fc54dbb712ff5e5a0fca797e6e0aa25726c7e72c6a5850cfd2adbc1eb0a372"},
+    {file = "mypy-1.13.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:581665e6f3a8a9078f28d5502f4c334c0c8d802ef55ea0e7276a6e409bc0d82d"},
+    {file = "mypy-1.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3ddb5b9bf82e05cc9a627e84707b528e5c7caaa1c55c69e175abb15a761cec2d"},
+    {file = "mypy-1.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20c7ee0bc0d5a9595c46f38beb04201f2620065a93755704e141fcac9f59db2b"},
+    {file = "mypy-1.13.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3790ded76f0b34bc9c8ba4def8f919dd6a46db0f5a6610fb994fe8efdd447f73"},
+    {file = "mypy-1.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:51f869f4b6b538229c1d1bcc1dd7d119817206e2bc54e8e374b3dfa202defcca"},
+    {file = "mypy-1.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5c7051a3461ae84dfb5dd15eff5094640c61c5f22257c8b766794e6dd85e72d5"},
+    {file = "mypy-1.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:39bb21c69a5d6342f4ce526e4584bc5c197fd20a60d14a8624d8743fffb9472e"},
+    {file = "mypy-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:164f28cb9d6367439031f4c81e84d3ccaa1e19232d9d05d37cb0bd880d3f93c2"},
+    {file = "mypy-1.13.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a4c1bfcdbce96ff5d96fc9b08e3831acb30dc44ab02671eca5953eadad07d6d0"},
+    {file = "mypy-1.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0affb3a79a256b4183ba09811e3577c5163ed06685e4d4b46429a271ba174d2"},
+    {file = "mypy-1.13.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a7b44178c9760ce1a43f544e595d35ed61ac2c3de306599fa59b38a6048e1aa7"},
+    {file = "mypy-1.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5d5092efb8516d08440e36626f0153b5006d4088c1d663d88bf79625af3d1d62"},
+    {file = "mypy-1.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2904956dac40ced10931ac967ae63c5089bd498542194b436eb097a9f77bc8"},
+    {file = "mypy-1.13.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7bfd8836970d33c2105562650656b6846149374dc8ed77d98424b40b09340ba7"},
+    {file = "mypy-1.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:9f73dba9ec77acb86457a8fc04b5239822df0c14a082564737833d2963677dbc"},
+    {file = "mypy-1.13.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:100fac22ce82925f676a734af0db922ecfea991e1d7ec0ceb1e115ebe501301a"},
+    {file = "mypy-1.13.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7bcb0bb7f42a978bb323a7c88f1081d1b5dee77ca86f4100735a6f541299d8fb"},
+    {file = "mypy-1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bde31fc887c213e223bbfc34328070996061b0833b0a4cfec53745ed61f3519b"},
+    {file = "mypy-1.13.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:07de989f89786f62b937851295ed62e51774722e5444a27cecca993fc3f9cd74"},
+    {file = "mypy-1.13.0-cp38-cp38-win_amd64.whl", hash = "sha256:4bde84334fbe19bad704b3f5b78c4abd35ff1026f8ba72b29de70dda0916beb6"},
+    {file = "mypy-1.13.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0246bcb1b5de7f08f2826451abd947bf656945209b140d16ed317f65a17dc7dc"},
+    {file = "mypy-1.13.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7f5b7deae912cf8b77e990b9280f170381fdfbddf61b4ef80927edd813163732"},
+    {file = "mypy-1.13.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7029881ec6ffb8bc233a4fa364736789582c738217b133f1b55967115288a2bc"},
+    {file = "mypy-1.13.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3e38b980e5681f28f033f3be86b099a247b13c491f14bb8b1e1e134d23bb599d"},
+    {file = "mypy-1.13.0-cp39-cp39-win_amd64.whl", hash = "sha256:a6789be98a2017c912ae6ccb77ea553bbaf13d27605d2ca20a76dfbced631b24"},
+    {file = "mypy-1.13.0-py3-none-any.whl", hash = "sha256:9c250883f9fd81d212e0952c92dbfcc96fc237f4b7c92f56ac81fd48460b3e5a"},
+    {file = "mypy-1.13.0.tar.gz", hash = "sha256:0291a61b6fbf3e6673e3405cfcc0e7650bebc7939659fdca2702958038bd835e"},
 ]
 
 [package.dependencies]
 mypy-extensions = ">=1.0.0"
-typing-extensions = ">=3.10"
+typing-extensions = ">=4.6.0"
 
 [package.extras]
 dmypy = ["psutil (>=4.0)"]
+faster-cache = ["orjson"]
 install-types = ["pip"]
-python2 = ["typed-ast (>=1.4.0,<2)"]
+mypyc = ["setuptools (>=50)"]
 reports = ["lxml"]
 
 [[package]]
@@ -3517,4 +3524,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "5a9b8c8d409acb840c0a94dcdec6aac9777ccec443d74c78dbd511fa223cd6f6"
+content-hash = "21debe1116843e5d14bdf37d6e265c68c63a98a64ba04ec8b8a02af2e8d9f486"
diff --git a/pyproject.toml b/pyproject.toml
index 60c6839bc7..ccd3ab1864 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,7 +51,7 @@ testcontainers = "^4.8.1"
 jsonnet = "^0.20.0"
 
 [tool.poetry.group.dev.dependencies]
-mypy = "==1.3.0"
+mypy = "==1.13.0"
 ruff = "^0.7.0"
 
 [build-system]
diff --git a/scripts/force_layer_download.py b/scripts/force_layer_download.py
index 6dbac08f3c..835e28c5d6 100644
--- a/scripts/force_layer_download.py
+++ b/scripts/force_layer_download.py
@@ -194,9 +194,11 @@ async def main_impl(args, report_out, client: Client):
             tenant_ids = await client.get_tenant_ids()
             get_timeline_id_coros = [client.get_timeline_ids(tenant_id) for tenant_id in tenant_ids]
             gathered = await asyncio.gather(*get_timeline_id_coros, return_exceptions=True)
-            assert len(tenant_ids) == len(gathered)
             tenant_and_timline_ids = []
-            for tid, tlids in zip(tenant_ids, gathered, strict=False):
+            for tid, tlids in zip(tenant_ids, gathered, strict=True):
+                # TODO: add error handling if tlids isinstance(Exception)
+                assert isinstance(tlids, list)
+
                 for tlid in tlids:
                     tenant_and_timline_ids.append((tid, tlid))
         elif len(comps) == 1:
diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py
index 212ed9207f..c73d5411fa 100644
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -190,10 +190,6 @@ class TenantTimelineId:
         )
 
 
-# Workaround for compat with python 3.9, which does not have `typing.Self`
-TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId")
-
-
 class TenantShardId:
     def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int):
         self.tenant_id = tenant_id
@@ -202,7 +198,7 @@ class TenantShardId:
         assert self.shard_number < self.shard_count or self.shard_count == 0
 
     @classmethod
-    def parse(cls: type[TTenantShardId], input: str) -> TTenantShardId:
+    def parse(cls: type[TenantShardId], input: str) -> TenantShardId:
         if len(input) == 32:
             return cls(
                 tenant_id=TenantId(input),
diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py
index 4175f67ecb..33f01f80fb 100644
--- a/test_runner/fixtures/compute_reconfigure.py
+++ b/test_runner/fixtures/compute_reconfigure.py
@@ -69,7 +69,7 @@ def compute_reconfigure_listener(make_httpserver: HTTPServer):
             # This causes the endpoint to query storage controller for its location, which
             # is redundant since we already have it here, but this avoids extending the
             # neon_local CLI to take full lists of locations
-            reconfigure_threads.submit(lambda workload=workload: workload.reconfigure())  # type: ignore[no-any-return]
+            reconfigure_threads.submit(lambda workload=workload: workload.reconfigure())  # type: ignore[misc]
 
         return Response(status=200)
 
diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index 03a02f51fd..a85a191455 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -20,12 +20,9 @@ from fixtures.pg_version import PgVersion
 if TYPE_CHECKING:
     from typing import (
         Any,
-        TypeVar,
         cast,
     )
 
-    T = TypeVar("T")
-
 
 # Used to be an ABC. abc.ABC removed due to linter without name change.
 class AbstractNeonCli:
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 195b788c7e..e04cadf46f 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -102,10 +102,7 @@ from .neon_api import NeonAPI, NeonApiEndpoint
 
 if TYPE_CHECKING:
     from collections.abc import Callable
-    from typing import (
-        Any,
-        TypeVar,
-    )
+    from typing import Any, Self, TypeVar
 
     from fixtures.paths import SnapshotDirLocked
 
@@ -838,7 +835,7 @@ class NeonEnvBuilder:
             if isinstance(x, S3Storage):
                 x.do_cleanup()
 
-    def __enter__(self) -> NeonEnvBuilder:
+    def __enter__(self) -> Self:
         return self
 
     def __exit__(
@@ -1148,21 +1145,19 @@ class NeonEnv:
         with concurrent.futures.ThreadPoolExecutor(
             max_workers=2 + len(self.pageservers) + len(self.safekeepers)
         ) as executor:
-            futs.append(
-                executor.submit(lambda: self.broker.start() or None)
-            )  # The `or None` is for the linter
+            futs.append(executor.submit(lambda: self.broker.start()))
 
             for pageserver in self.pageservers:
                 futs.append(
                     executor.submit(
-                        lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
+                        lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)  # type: ignore[misc]
                     )
                 )
 
             for safekeeper in self.safekeepers:
                 futs.append(
                     executor.submit(
-                        lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
+                        lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)  # type: ignore[misc]
                     )
                 )
 
@@ -1602,13 +1597,13 @@ class NeonStorageController(MetricsGetter, LogUtils):
         timeout_in_seconds: int | None = None,
         instance_id: int | None = None,
         base_port: int | None = None,
-    ):
+    ) -> Self:
         assert not self.running
         self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
         self.running = True
         return self
 
-    def stop(self, immediate: bool = False) -> NeonStorageController:
+    def stop(self, immediate: bool = False) -> Self:
         if self.running:
             self.env.neon_cli.storage_controller_stop(immediate)
             self.running = False
@@ -2282,7 +2277,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         response.raise_for_status()
         return [TenantShardId.parse(tid) for tid in response.json()["updated"]]
 
-    def __enter__(self) -> NeonStorageController:
+    def __enter__(self) -> Self:
         return self
 
     def __exit__(
@@ -2304,7 +2299,7 @@ class NeonProxiedStorageController(NeonStorageController):
         timeout_in_seconds: int | None = None,
         instance_id: int | None = None,
         base_port: int | None = None,
-    ):
+    ) -> Self:
         assert instance_id is not None and base_port is not None
 
         self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
@@ -2324,7 +2319,7 @@ class NeonProxiedStorageController(NeonStorageController):
         self.running = any(meta["running"] for meta in self.instances.values())
         return self
 
-    def stop(self, immediate: bool = False) -> NeonStorageController:
+    def stop(self, immediate: bool = False) -> Self:
         for iid, details in self.instances.items():
             if details["running"]:
                 self.env.neon_cli.storage_controller_stop(immediate, iid)
@@ -2446,7 +2441,7 @@ class NeonPageserver(PgProtocol, LogUtils):
         self,
         extra_env_vars: dict[str, str] | None = None,
         timeout_in_seconds: int | None = None,
-    ) -> NeonPageserver:
+    ) -> Self:
         """
         Start the page server.
         `overrides` allows to add some config to this pageserver start.
@@ -2481,7 +2476,7 @@ class NeonPageserver(PgProtocol, LogUtils):
 
         return self
 
-    def stop(self, immediate: bool = False) -> NeonPageserver:
+    def stop(self, immediate: bool = False) -> Self:
         """
         Stop the page server.
         Returns self.
@@ -2529,7 +2524,7 @@ class NeonPageserver(PgProtocol, LogUtils):
 
         wait_until(20, 0.5, complete)
 
-    def __enter__(self) -> NeonPageserver:
+    def __enter__(self) -> Self:
         return self
 
     def __exit__(
@@ -2957,7 +2952,7 @@ class VanillaPostgres(PgProtocol):
         """Return size of pgdatadir subdirectory in bytes."""
         return get_dir_size(self.pgdatadir / subdir)
 
-    def __enter__(self) -> VanillaPostgres:
+    def __enter__(self) -> Self:
         return self
 
     def __exit__(
@@ -3006,7 +3001,7 @@ class RemotePostgres(PgProtocol):
         # See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE
         raise Exception("cannot get size of a Postgres instance")
 
-    def __enter__(self) -> RemotePostgres:
+    def __enter__(self) -> Self:
         return self
 
     def __exit__(
@@ -3220,7 +3215,7 @@ class NeonProxy(PgProtocol):
         self.http_timeout_seconds = 15
         self._popen: subprocess.Popen[bytes] | None = None
 
-    def start(self) -> NeonProxy:
+    def start(self) -> Self:
         assert self._popen is None
 
         # generate key of it doesn't exist
@@ -3348,7 +3343,7 @@ class NeonProxy(PgProtocol):
                 log.info(f"SUCCESS, found auth url: {line}")
                 return line
 
-    def __enter__(self) -> NeonProxy:
+    def __enter__(self) -> Self:
         return self
 
     def __exit__(
@@ -3438,7 +3433,7 @@ class NeonAuthBroker:
         self.http_timeout_seconds = 15
         self._popen: subprocess.Popen[bytes] | None = None
 
-    def start(self) -> NeonAuthBroker:
+    def start(self) -> Self:
         assert self._popen is None
 
         # generate key of it doesn't exist
@@ -3507,7 +3502,7 @@ class NeonAuthBroker:
         request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics")
         return request_result.text
 
-    def __enter__(self) -> NeonAuthBroker:
+    def __enter__(self) -> Self:
         return self
 
     def __exit__(
@@ -3704,7 +3699,7 @@ class Endpoint(PgProtocol, LogUtils):
         config_lines: list[str] | None = None,
         pageserver_id: int | None = None,
         allow_multiple: bool = False,
-    ) -> Endpoint:
+    ) -> Self:
         """
         Create a new Postgres endpoint.
         Returns self.
@@ -3750,7 +3745,7 @@ class Endpoint(PgProtocol, LogUtils):
         safekeepers: list[int] | None = None,
         allow_multiple: bool = False,
         basebackup_request_tries: int | None = None,
-    ) -> Endpoint:
+    ) -> Self:
         """
         Start the Postgres instance.
         Returns self.
@@ -3797,7 +3792,7 @@ class Endpoint(PgProtocol, LogUtils):
         """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)"""
         return self.endpoint_path() / "postgresql.conf"
 
-    def config(self, lines: list[str]) -> Endpoint:
+    def config(self, lines: list[str]) -> Self:
         """
         Add lines to postgresql.conf.
         Lines should be an array of valid postgresql.conf rows.
@@ -3873,7 +3868,7 @@ class Endpoint(PgProtocol, LogUtils):
         self,
         mode: str = "fast",
         sks_wait_walreceiver_gone: tuple[list[Safekeeper], TimelineId] | None = None,
-    ) -> Endpoint:
+    ) -> Self:
         """
         Stop the Postgres instance if it's running.
 
@@ -3907,7 +3902,7 @@ class Endpoint(PgProtocol, LogUtils):
 
         return self
 
-    def stop_and_destroy(self, mode: str = "immediate") -> Endpoint:
+    def stop_and_destroy(self, mode: str = "immediate") -> Self:
         """
         Stop the Postgres instance, then destroy the endpoint.
         Returns self.
@@ -3934,7 +3929,7 @@ class Endpoint(PgProtocol, LogUtils):
         pageserver_id: int | None = None,
         allow_multiple: bool = False,
         basebackup_request_tries: int | None = None,
-    ) -> Endpoint:
+    ) -> Self:
         """
         Create an endpoint, apply config, and start Postgres.
         Returns self.
@@ -3957,7 +3952,7 @@ class Endpoint(PgProtocol, LogUtils):
 
         return self
 
-    def __enter__(self) -> Endpoint:
+    def __enter__(self) -> Self:
         return self
 
     def __exit__(
@@ -4058,7 +4053,7 @@ class EndpointFactory:
             pageserver_id=pageserver_id,
         )
 
-    def stop_all(self, fail_on_error=True) -> EndpointFactory:
+    def stop_all(self, fail_on_error=True) -> Self:
         exception = None
         for ep in self.endpoints:
             try:
@@ -4154,7 +4149,7 @@ class Safekeeper(LogUtils):
 
     def start(
         self, extra_opts: list[str] | None = None, timeout_in_seconds: int | None = None
-    ) -> Safekeeper:
+    ) -> Self:
         if extra_opts is None:
             # Apply either the extra_opts passed in, or the ones from our constructor: we do not merge the two.
             extra_opts = self.extra_opts
@@ -4189,7 +4184,7 @@ class Safekeeper(LogUtils):
                 break  # success
         return self
 
-    def stop(self, immediate: bool = False) -> Safekeeper:
+    def stop(self, immediate: bool = False) -> Self:
         self.env.neon_cli.safekeeper_stop(self.id, immediate)
         self.running = False
         return self
@@ -4367,13 +4362,13 @@ class NeonBroker(LogUtils):
     def start(
         self,
         timeout_in_seconds: int | None = None,
-    ):
+    ) -> Self:
         assert not self.running
         self.env.neon_cli.storage_broker_start(timeout_in_seconds)
         self.running = True
         return self
 
-    def stop(self):
+    def stop(self) -> Self:
         if self.running:
             self.env.neon_cli.storage_broker_stop()
             self.running = False
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index 0286b4f036..2c6adb8a33 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -66,6 +66,7 @@ def pytest_generate_tests(metafunc: Metafunc):
 
     metafunc.parametrize("build_type", build_types)
 
+    pg_versions: list[PgVersion]
     if (v := os.getenv("DEFAULT_PG_VERSION")) is None:
         pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET]
     else:
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index 639e60914a..72dc102538 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -53,7 +53,7 @@ class Workload:
         self._endpoint: Endpoint | None = None
         self._endpoint_opts = endpoint_opts or {}
 
-    def reconfigure(self):
+    def reconfigure(self) -> None:
         """
         Request the endpoint to reconfigure based on location reported by storage controller
         """
diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py
index c785036292..1b15c5f15e 100644
--- a/test_runner/regress/test_compute_metrics.py
+++ b/test_runner/regress/test_compute_metrics.py
@@ -17,7 +17,7 @@ from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR
 
 if TYPE_CHECKING:
     from types import TracebackType
-    from typing import TypedDict
+    from typing import Self, TypedDict
 
     from fixtures.neon_fixtures import NeonEnv
     from fixtures.pg_version import PgVersion
@@ -185,7 +185,7 @@ class SqlExporterRunner:
     def stop(self) -> None:
         raise NotImplementedError()
 
-    def __enter__(self) -> SqlExporterRunner:
+    def __enter__(self) -> Self:
         self.start()
 
         return self
@@ -242,8 +242,7 @@ if SQL_EXPORTER is None:
             self.with_volume_mapping(str(config_file), container_config_file, "z")
             self.with_volume_mapping(str(collector_file), container_collector_file, "z")
 
-        @override
-        def start(self) -> SqlExporterContainer:
+        def start(self) -> Self:
             super().start()
 
             log.info("Waiting for sql_exporter to be ready")
diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py
index 8fb74f46e4..1c5554c379 100644
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -13,7 +13,7 @@ from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
 if TYPE_CHECKING:
-    from typing import Any
+    from typing import Any, Self
 
 
 def handle_db(dbs, roles, operation):
@@ -91,7 +91,7 @@ class DdlForwardingContext:
             lambda request: ddl_forward_handler(request, self.dbs, self.roles, self)
         )
 
-    def __enter__(self):
+    def __enter__(self) -> Self:
         self.pg.start()
         return self
 
diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py
index 590354e9da..f6a7bfa1ad 100644
--- a/test_runner/regress/test_pageserver_layer_rolling.py
+++ b/test_runner/regress/test_pageserver_layer_rolling.py
@@ -131,7 +131,7 @@ def test_pageserver_small_inmemory_layers(
     wait_until_pageserver_is_caught_up(env, last_flush_lsns)
 
     # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data.
-    wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env))  # type: ignore
+    wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env))
 
     ps_http_client = env.pageserver.http_client()
     total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client)
@@ -139,7 +139,7 @@ def test_pageserver_small_inmemory_layers(
     # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed,
     # such that there are zero bytes of ephemeral layer left on the pageserver
     log.info("Waiting for background checkpoints...")
-    wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0))  # type: ignore
+    wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0))
 
     # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they
     # must be uploaded to remain visible to the pageserver after restart.
@@ -180,7 +180,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder):
     wait_until_pageserver_is_caught_up(env, last_flush_lsns)
 
     # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data.
-    wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env))  # type: ignore
+    wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env))
 
     # Stop the safekeepers, so that we cannot have any more WAL receiver connections
     for sk in env.safekeepers:
@@ -193,7 +193,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder):
     # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed,
     # such that there are zero bytes of ephemeral layer left on the pageserver
     log.info("Waiting for background checkpoints...")
-    wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0))  # type: ignore
+    wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0))
 
     # The code below verifies that we do not flush on the first write
     # after an idle period longer than the checkpoint timeout.
@@ -210,7 +210,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder):
         run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE)
     )
 
-    dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env))  # type: ignore
+    dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env))
 
     # We shouldn't flush since we've just opened a new layer
     waited_for = 0
@@ -312,4 +312,4 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
         dirty_bytes = get_dirty_bytes(env)
         assert dirty_bytes < max_dirty_data
 
-    wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited())  # type: ignore
+    wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited())
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index de0344bc29..a264f4d3c9 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -702,7 +702,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
         else:
             timeout = int(deadline - now) + 1
             try:
-                wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression))  # type: ignore
+                wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression))
             except:
                 log.error(f"Timed out waiting for '{expression}'")
                 raise
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 16bfa83b43..411574bd86 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1405,7 +1405,7 @@ def test_sharding_split_failures(
         #   e.g. while waiting for a storage controller to re-attach a parent shard if we failed
         #   inside the pageserver and the storage controller responds by detaching children and attaching
         #   parents concurrently (https://github.com/neondatabase/neon/issues/7148)
-        wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False))  # type: ignore
+        wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False))
 
         workload.validate()
 
diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py
index ef9974a15d..2a26fef59a 100644
--- a/test_runner/regress/test_sni_router.py
+++ b/test_runner/regress/test_sni_router.py
@@ -3,13 +3,17 @@ from __future__ import annotations
 import socket
 import subprocess
 from pathlib import Path
-from types import TracebackType
+from typing import TYPE_CHECKING
 
 import backoff
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import PgProtocol, VanillaPostgres
 from fixtures.port_distributor import PortDistributor
 
+if TYPE_CHECKING:
+    from types import TracebackType
+    from typing import Self
+
 
 def generate_tls_cert(cn, certout, keyout):
     subprocess.run(
@@ -54,7 +58,7 @@ class PgSniRouter(PgProtocol):
         self._popen: subprocess.Popen[bytes] | None = None
         self.test_output_dir = test_output_dir
 
-    def start(self) -> PgSniRouter:
+    def start(self) -> Self:
         assert self._popen is None
         args = [
             str(self.neon_binpath / "pg_sni_router"),
@@ -87,7 +91,7 @@ class PgSniRouter(PgProtocol):
         if self._popen:
             self._popen.wait(timeout=2)
 
-    def __enter__(self) -> PgSniRouter:
+    def __enter__(self) -> Self:
         return self
 
     def __exit__(
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index dbddc55823..13bc54a114 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2494,14 +2494,14 @@ def start_env(env: NeonEnv, storage_controller_port: int):
         for pageserver in env.pageservers:
             futs.append(
                 executor.submit(
-                    lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
+                    lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)  # type: ignore[misc]
                 )
             )
 
         for safekeeper in env.safekeepers:
             futs.append(
                 executor.submit(
-                    lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
+                    lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)  # type: ignore[misc]
                 )
             )
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 4c404cd881..405f15e488 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -61,7 +61,7 @@ from fixtures.utils import (
 )
 
 if TYPE_CHECKING:
-    from typing import Any
+    from typing import Any, Self
 
 
 def wait_lsn_force_checkpoint(
@@ -1460,7 +1460,7 @@ class SafekeeperEnv:
         self.tenant_id: TenantId | None = None
         self.timeline_id: TimelineId | None = None
 
-    def init(self) -> SafekeeperEnv:
+    def init(self) -> Self:
         assert self.postgres is None, "postgres is already initialized"
         assert self.safekeepers is None, "safekeepers are already initialized"
 
@@ -1541,7 +1541,7 @@ class SafekeeperEnv:
             log.info(f"Killing safekeeper with pid {pid}")
             os.kill(pid, signal.SIGKILL)
 
-    def __enter__(self):
+    def __enter__(self) -> Self:
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):

From 8ab96cc71fc5a5a862afd7a27d429469c58e5027 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 22 Nov 2024 14:51:32 +0000
Subject: [PATCH 32/51] chore(proxy/jwks): reduce the rightward drift of jwks
 renewal (#9853)

I found the rightward drift of the `renew_jwks` function hard to review.

This PR splits out some major logic and uses early returns to make the
happy path more linear.
---
 proxy/src/auth/backend/jwt.rs | 177 ++++++++++++++++++----------------
 1 file changed, 96 insertions(+), 81 deletions(-)

diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index f721d81aa2..517d4fd34b 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -132,6 +132,93 @@ struct JwkSet<'a> {
     keys: Vec<&'a RawValue>,
 }
 
+/// Given a jwks_url, fetch the JWKS and parse out all the signing JWKs.
+/// Returns `None` and log a warning if there are any errors.
+async fn fetch_jwks(
+    client: &reqwest_middleware::ClientWithMiddleware,
+    jwks_url: url::Url,
+) -> Option<jose_jwk::JwkSet> {
+    let req = client.get(jwks_url.clone());
+    // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
+    let resp = req.send().await.and_then(|r| {
+        r.error_for_status()
+            .map_err(reqwest_middleware::Error::Reqwest)
+    });
+
+    let resp = match resp {
+        Ok(r) => r,
+        // TODO: should we re-insert JWKs if we want to keep this JWKs URL?
+        // I expect these failures would be quite sparse.
+        Err(e) => {
+            tracing::warn!(url=?jwks_url, error=?e, "could not fetch JWKs");
+            return None;
+        }
+    };
+
+    let resp: http::Response<reqwest::Body> = resp.into();
+
+    let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE).await {
+        Ok(bytes) => bytes,
+        Err(e) => {
+            tracing::warn!(url=?jwks_url, error=?e, "could not decode JWKs");
+            return None;
+        }
+    };
+
+    let jwks = match serde_json::from_slice::<JwkSet>(&bytes) {
+        Ok(jwks) => jwks,
+        Err(e) => {
+            tracing::warn!(url=?jwks_url, error=?e, "could not decode JWKs");
+            return None;
+        }
+    };
+
+    // `jose_jwk::Jwk` is quite large (288 bytes). Let's not pre-allocate for what we don't need.
+    //
+    // Even though we limit our responses to 64KiB, we could still receive a payload like
+    // `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}`. Parsing this as `RawValue` uses 468KiB.
+    // Pre-allocating the corresponding `Vec::<jose_jwk::Jwk>::with_capacity(30000)` uses 8.2MiB.
+    let mut keys = vec![];
+
+    let mut failed = 0;
+    for key in jwks.keys {
+        let key = match serde_json::from_str::<jose_jwk::Jwk>(key.get()) {
+            Ok(key) => key,
+            Err(e) => {
+                tracing::debug!(url=?jwks_url, failed=?e, "could not decode JWK");
+                failed += 1;
+                continue;
+            }
+        };
+
+        // if `use` (called `cls` in rust) is specified to be something other than signing,
+        // we can skip storing it.
+        if key
+            .prm
+            .cls
+            .as_ref()
+            .is_some_and(|c| *c != jose_jwk::Class::Signing)
+        {
+            continue;
+        }
+
+        keys.push(key);
+    }
+
+    keys.shrink_to_fit();
+
+    if failed > 0 {
+        tracing::warn!(url=?jwks_url, failed, "could not decode JWKs");
+    }
+
+    if keys.is_empty() {
+        tracing::warn!(url=?jwks_url, "no valid JWKs found inside the response body");
+        return None;
+    }
+
+    Some(jose_jwk::JwkSet { keys })
+}
+
 impl JwkCacheEntryLock {
     async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
         JwkRenewalPermit::acquire_permit(self).await
@@ -166,87 +253,15 @@ impl JwkCacheEntryLock {
         // TODO(conrad): run concurrently
         // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
         for rule in rules {
-            let req = client.get(rule.jwks_url.clone());
-            // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
-            // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
-            match req.send().await.and_then(|r| {
-                r.error_for_status()
-                    .map_err(reqwest_middleware::Error::Reqwest)
-            }) {
-                // todo: should we re-insert JWKs if we want to keep this JWKs URL?
-                // I expect these failures would be quite sparse.
-                Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"),
-                Ok(r) => {
-                    let resp: http::Response<reqwest::Body> = r.into();
-
-                    let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE)
-                        .await
-                    {
-                        Ok(bytes) => bytes,
-                        Err(e) => {
-                            tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
-                            continue;
-                        }
-                    };
-
-                    match serde_json::from_slice::<JwkSet>(&bytes) {
-                        Err(e) => {
-                            tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
-                        }
-                        Ok(jwks) => {
-                            // size_of::<&RawValue>() == 16
-                            // size_of::<jose_jwk::Jwk>() == 288
-                            // better to not pre-allocate this as it might be pretty large - especially if it has many
-                            // keys we don't want or need.
-                            // trivial 'attack': `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}`
-                            // this would consume 8MiB just like that!
-                            let mut keys = vec![];
-                            let mut failed = 0;
-                            for key in jwks.keys {
-                                match serde_json::from_str::<jose_jwk::Jwk>(key.get()) {
-                                    Ok(key) => {
-                                        // if `use` (called `cls` in rust) is specified to be something other than signing,
-                                        // we can skip storing it.
-                                        if key
-                                            .prm
-                                            .cls
-                                            .as_ref()
-                                            .is_some_and(|c| *c != jose_jwk::Class::Signing)
-                                        {
-                                            continue;
-                                        }
-
-                                        keys.push(key);
-                                    }
-                                    Err(e) => {
-                                        tracing::debug!(url=?rule.jwks_url, failed=?e, "could not decode JWK");
-                                        failed += 1;
-                                    }
-                                }
-                            }
-                            keys.shrink_to_fit();
-
-                            if failed > 0 {
-                                tracing::warn!(url=?rule.jwks_url, failed, "could not decode JWKs");
-                            }
-
-                            if keys.is_empty() {
-                                tracing::warn!(url=?rule.jwks_url, "no valid JWKs found inside the response body");
-                                continue;
-                            }
-
-                            let jwks = jose_jwk::JwkSet { keys };
-                            key_sets.insert(
-                                rule.id,
-                                KeySet {
-                                    jwks,
-                                    audience: rule.audience,
-                                    role_names: rule.role_names,
-                                },
-                            );
-                        }
-                    };
-                }
+            if let Some(jwks) = fetch_jwks(client, rule.jwks_url).await {
+                key_sets.insert(
+                    rule.id,
+                    KeySet {
+                        jwks,
+                        audience: rule.audience,
+                        role_names: rule.role_names,
+                    },
+                );
             }
         }
 

From b3b579b45ee013657fce065c9df9a47ff0085608 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 22 Nov 2024 16:13:53 +0000
Subject: [PATCH 33/51] test_bulk_insert: fix typing for PgVersion (#9854)

## Problem

Along with the migration to Python 3.11, I switched `C(str, Enum)` with
`C(StrEnum)`; one such example is the `PgVersion` enum.
It required more changes in `PgVersion` itself (before, it accepted both
`str` and `int`, and after it, it supports only `str`), which caused the
`test_bulk_insert` test to fail.

## Summary of changes
- `test_bulk_insert`: explicitly cast pg_version from `timeline_detail`
to str
---
 test_runner/performance/test_bulk_insert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py
index 36090dcad7..680eb62b39 100644
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -56,7 +56,7 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
 
 def measure_recovery_time(env: NeonCompare):
     client = env.env.pageserver.http_client()
-    pg_version = PgVersion(client.timeline_detail(env.tenant, env.timeline)["pg_version"])
+    pg_version = PgVersion(str(client.timeline_detail(env.tenant, env.timeline)["pg_version"]))
 
     # Delete the Tenant in the pageserver: this will drop local and remote layers, such that
     # when we "create" the Tenant again, we will replay the WAL from the beginning.

From 3b1ac8b14a0f48c71780f3cb3d607ee7287093f7 Mon Sep 17 00:00:00 2001
From: Ivan Efremov <ivan@neon.tech>
Date: Fri, 22 Nov 2024 18:46:38 +0200
Subject: [PATCH 34/51] proxy: Implement cancellation rate limiting (#9739)

Implement cancellation rate limiting and ip allowlist checks. Add
ip_allowlist to the cancel closure

Fixes [#16456](https://github.com/neondatabase/cloud/issues/16456)
---
 Cargo.lock                                 |  4 +-
 Cargo.toml                                 |  2 +-
 proxy/src/auth/backend/console_redirect.rs | 20 +++--
 proxy/src/auth/backend/mod.rs              | 18 +----
 proxy/src/bin/proxy.rs                     |  3 +-
 proxy/src/cancellation.rs                  | 87 +++++++++++++++++++++-
 proxy/src/compute.rs                       |  2 +-
 proxy/src/console_redirect_proxy.rs        | 13 +++-
 proxy/src/metrics.rs                       |  1 +
 proxy/src/proxy/mod.rs                     |  8 +-
 proxy/src/rate_limiter/limiter.rs          | 19 ++++-
 proxy/src/rate_limiter/mod.rs              |  3 +-
 proxy/src/redis/cancellation_publisher.rs  | 27 +++++--
 proxy/src/redis/notifications.rs           | 15 +++-
 14 files changed, 173 insertions(+), 49 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b1232a6b6a..a25fa89c77 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2838,9 +2838,9 @@ dependencies = [
 
 [[package]]
 name = "ipnet"
-version = "2.9.0"
+version = "2.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
+checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708"
 
 [[package]]
 name = "is-terminal"
diff --git a/Cargo.toml b/Cargo.toml
index c6b4b62042..aac19a4122 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -106,7 +106,7 @@ hyper-util = "0.1"
 tokio-tungstenite = "0.21.0"
 indexmap = "2"
 indoc = "2"
-ipnet = "2.9.0"
+ipnet = "2.10.0"
 itertools = "0.10"
 itoa = "1.0.11"
 jsonwebtoken = "9"
diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs
index 5772471486..bf7a1cb070 100644
--- a/proxy/src/auth/backend/console_redirect.rs
+++ b/proxy/src/auth/backend/console_redirect.rs
@@ -6,6 +6,7 @@ use tokio_postgres::config::SslMode;
 use tracing::{info, info_span};
 
 use super::ComputeCredentialKeys;
+use crate::auth::IpPattern;
 use crate::cache::Cached;
 use crate::config::AuthenticationConfig;
 use crate::context::RequestContext;
@@ -74,10 +75,10 @@ impl ConsoleRedirectBackend {
         ctx: &RequestContext,
         auth_config: &'static AuthenticationConfig,
         client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-    ) -> auth::Result<ConsoleRedirectNodeInfo> {
+    ) -> auth::Result<(ConsoleRedirectNodeInfo, Option<Vec<IpPattern>>)> {
         authenticate(ctx, auth_config, &self.console_uri, client)
             .await
-            .map(ConsoleRedirectNodeInfo)
+            .map(|(node_info, ip_allowlist)| (ConsoleRedirectNodeInfo(node_info), ip_allowlist))
     }
 }
 
@@ -102,7 +103,7 @@ async fn authenticate(
     auth_config: &'static AuthenticationConfig,
     link_uri: &reqwest::Url,
     client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<NodeInfo> {
+) -> auth::Result<(NodeInfo, Option<Vec<IpPattern>>)> {
     ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect);
 
     // registering waiter can fail if we get unlucky with rng.
@@ -176,9 +177,12 @@ async fn authenticate(
         config.password(password.as_ref());
     }
 
-    Ok(NodeInfo {
-        config,
-        aux: db_info.aux,
-        allow_self_signed_compute: false, // caller may override
-    })
+    Ok((
+        NodeInfo {
+            config,
+            aux: db_info.aux,
+            allow_self_signed_compute: false, // caller may override
+        },
+        db_info.allowed_ips,
+    ))
 }
diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs
index 57ecd5e499..7e1b26a11a 100644
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -6,7 +6,6 @@ pub mod local;
 
 use std::net::IpAddr;
 use std::sync::Arc;
-use std::time::Duration;
 
 pub use console_redirect::ConsoleRedirectBackend;
 pub(crate) use console_redirect::ConsoleRedirectError;
@@ -30,7 +29,7 @@ use crate::intern::EndpointIdInt;
 use crate::metrics::Metrics;
 use crate::proxy::connect_compute::ComputeConnectBackend;
 use crate::proxy::NeonOptions;
-use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo};
+use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter};
 use crate::stream::Stream;
 use crate::types::{EndpointCacheKey, EndpointId, RoleName};
 use crate::{scram, stream};
@@ -192,21 +191,6 @@ impl MaskedIp {
 // This can't be just per IP because that would limit some PaaS that share IP addresses
 pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>;
 
-impl RateBucketInfo {
-    /// All of these are per endpoint-maskedip pair.
-    /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
-    ///
-    /// First bucket: 1000mcpus total per endpoint-ip pair
-    /// * 4096000 requests per second with 1 hash rounds.
-    /// * 1000 requests per second with 4096 hash rounds.
-    /// * 6.8 requests per second with 600000 hash rounds.
-    pub const DEFAULT_AUTH_SET: [Self; 3] = [
-        Self::new(1000 * 4096, Duration::from_secs(1)),
-        Self::new(600 * 4096, Duration::from_secs(60)),
-        Self::new(300 * 4096, Duration::from_secs(600)),
-    ];
-}
-
 impl AuthenticationConfig {
     pub(crate) fn check_rate_limit(
         &self,
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 45fbe4a398..a935378162 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -428,8 +428,9 @@ async fn main() -> anyhow::Result<()> {
         )?))),
         None => None,
     };
+
     let cancellation_handler = Arc::new(CancellationHandler::<
-        Option<Arc<tokio::sync::Mutex<RedisPublisherClient>>>,
+        Option<Arc<Mutex<RedisPublisherClient>>>,
     >::new(
         cancel_map.clone(),
         redis_publisher,
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 3ad2d55b53..4b72a66e63 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -10,16 +10,23 @@ use tokio_postgres::{CancelToken, NoTls};
 use tracing::{debug, info};
 use uuid::Uuid;
 
+use crate::auth::{check_peer_addr_is_in_list, IpPattern};
 use crate::error::ReportableError;
 use crate::metrics::{CancellationRequest, CancellationSource, Metrics};
+use crate::rate_limiter::LeakyBucketRateLimiter;
 use crate::redis::cancellation_publisher::{
     CancellationPublisher, CancellationPublisherMut, RedisPublisherClient,
 };
+use std::net::IpAddr;
+
+use ipnet::{IpNet, Ipv4Net, Ipv6Net};
 
 pub type CancelMap = Arc<DashMap<CancelKeyData, Option<CancelClosure>>>;
 pub type CancellationHandlerMain = CancellationHandler<Option<Arc<Mutex<RedisPublisherClient>>>>;
 pub(crate) type CancellationHandlerMainInternal = Option<Arc<Mutex<RedisPublisherClient>>>;
 
+type IpSubnetKey = IpNet;
+
 /// Enables serving `CancelRequest`s.
 ///
 /// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances.
@@ -29,14 +36,23 @@ pub struct CancellationHandler<P> {
     /// This field used for the monitoring purposes.
     /// Represents the source of the cancellation request.
     from: CancellationSource,
+    // rate limiter of cancellation requests
+    limiter: Arc<std::sync::Mutex<LeakyBucketRateLimiter<IpSubnetKey>>>,
 }
 
 #[derive(Debug, Error)]
 pub(crate) enum CancelError {
     #[error("{0}")]
     IO(#[from] std::io::Error),
+
     #[error("{0}")]
     Postgres(#[from] tokio_postgres::Error),
+
+    #[error("rate limit exceeded")]
+    RateLimit,
+
+    #[error("IP is not allowed")]
+    IpNotAllowed,
 }
 
 impl ReportableError for CancelError {
@@ -47,6 +63,8 @@ impl ReportableError for CancelError {
                 crate::error::ErrorKind::Postgres
             }
             CancelError::Postgres(_) => crate::error::ErrorKind::Compute,
+            CancelError::RateLimit => crate::error::ErrorKind::RateLimit,
+            CancelError::IpNotAllowed => crate::error::ErrorKind::User,
         }
     }
 }
@@ -79,13 +97,36 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
             cancellation_handler: self,
         }
     }
+
     /// Try to cancel a running query for the corresponding connection.
     /// If the cancellation key is not found, it will be published to Redis.
+    /// check_allowed - if true, check if the IP is allowed to cancel the query
     pub(crate) async fn cancel_session(
         &self,
         key: CancelKeyData,
         session_id: Uuid,
+        peer_addr: &IpAddr,
+        check_allowed: bool,
     ) -> Result<(), CancelError> {
+        // TODO: check for unspecified address is only for backward compatibility, should be removed
+        if !peer_addr.is_unspecified() {
+            let subnet_key = match *peer_addr {
+                IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here
+                IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()),
+            };
+            if !self.limiter.lock().unwrap().check(subnet_key, 1) {
+                tracing::debug!("Rate limit exceeded. Skipping cancellation message");
+                Metrics::get()
+                    .proxy
+                    .cancellation_requests_total
+                    .inc(CancellationRequest {
+                        source: self.from,
+                        kind: crate::metrics::CancellationOutcome::RateLimitExceeded,
+                    });
+                return Err(CancelError::RateLimit);
+            }
+        }
+
         // NB: we should immediately release the lock after cloning the token.
         let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
             tracing::warn!("query cancellation key not found: {key}");
@@ -96,7 +137,13 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
                     source: self.from,
                     kind: crate::metrics::CancellationOutcome::NotFound,
                 });
-            match self.client.try_publish(key, session_id).await {
+
+            if session_id == Uuid::nil() {
+                // was already published, do not publish it again
+                return Ok(());
+            }
+
+            match self.client.try_publish(key, session_id, *peer_addr).await {
                 Ok(()) => {} // do nothing
                 Err(e) => {
                     return Err(CancelError::IO(std::io::Error::new(
@@ -107,6 +154,13 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
             }
             return Ok(());
         };
+
+        if check_allowed
+            && !check_peer_addr_is_in_list(peer_addr, cancel_closure.ip_allowlist.as_slice())
+        {
+            return Err(CancelError::IpNotAllowed);
+        }
+
         Metrics::get()
             .proxy
             .cancellation_requests_total
@@ -135,13 +189,29 @@ impl CancellationHandler<()> {
             map,
             client: (),
             from,
+            limiter: Arc::new(std::sync::Mutex::new(
+                LeakyBucketRateLimiter::<IpSubnetKey>::new_with_shards(
+                    LeakyBucketRateLimiter::<IpSubnetKey>::DEFAULT,
+                    64,
+                ),
+            )),
         }
     }
 }
 
 impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
     pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: CancellationSource) -> Self {
-        Self { map, client, from }
+        Self {
+            map,
+            client,
+            from,
+            limiter: Arc::new(std::sync::Mutex::new(
+                LeakyBucketRateLimiter::<IpSubnetKey>::new_with_shards(
+                    LeakyBucketRateLimiter::<IpSubnetKey>::DEFAULT,
+                    64,
+                ),
+            )),
+        }
     }
 }
 
@@ -152,13 +222,19 @@ impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
 pub struct CancelClosure {
     socket_addr: SocketAddr,
     cancel_token: CancelToken,
+    ip_allowlist: Vec<IpPattern>,
 }
 
 impl CancelClosure {
-    pub(crate) fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self {
+    pub(crate) fn new(
+        socket_addr: SocketAddr,
+        cancel_token: CancelToken,
+        ip_allowlist: Vec<IpPattern>,
+    ) -> Self {
         Self {
             socket_addr,
             cancel_token,
+            ip_allowlist,
         }
     }
     /// Cancels the query running on user's compute node.
@@ -168,6 +244,9 @@ impl CancelClosure {
         debug!("query was cancelled");
         Ok(())
     }
+    pub(crate) fn set_ip_allowlist(&mut self, ip_allowlist: Vec<IpPattern>) {
+        self.ip_allowlist = ip_allowlist;
+    }
 }
 
 /// Helper for registering query cancellation tokens.
@@ -229,6 +308,8 @@ mod tests {
                     cancel_key: 0,
                 },
                 Uuid::new_v4(),
+                &("127.0.0.1".parse().unwrap()),
+                true,
             )
             .await
             .unwrap();
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index e7fbe9ab47..8408d4720b 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -342,7 +342,7 @@ impl ConnCfg {
 
         // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.
         // Yet another reason to rework the connection establishing code.
-        let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token());
+        let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token(), vec![]);
 
         let connection = PostgresConnection {
             stream,
diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs
index c88b2936db..fbd0c8e5c5 100644
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -156,16 +156,21 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     let request_gauge = metrics.connection_requests.guard(proto);
 
     let tls = config.tls_config.as_ref();
-
     let record_handshake_error = !ctx.has_private_peer_addr();
     let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
     let do_handshake = handshake(ctx, stream, tls, record_handshake_error);
+
     let (mut stream, params) =
         match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
             HandshakeData::Startup(stream, params) => (stream, params),
             HandshakeData::Cancel(cancel_key_data) => {
                 return Ok(cancellation_handler
-                    .cancel_session(cancel_key_data, ctx.session_id())
+                    .cancel_session(
+                        cancel_key_data,
+                        ctx.session_id(),
+                        &ctx.peer_addr(),
+                        config.authentication_config.ip_allowlist_check_enabled,
+                    )
                     .await
                     .map(|()| None)?)
             }
@@ -174,7 +179,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
 
     ctx.set_db_options(params.clone());
 
-    let user_info = match backend
+    let (user_info, ip_allowlist) = match backend
         .authenticate(ctx, &config.authentication_config, &mut stream)
         .await
     {
@@ -198,6 +203,8 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     .or_else(|e| stream.throw_error(e))
     .await?;
 
+    node.cancel_closure
+        .set_ip_allowlist(ip_allowlist.unwrap_or_default());
     let session = cancellation_handler.get_session();
     prepare_client_connection(&node, &session, &mut stream).await?;
 
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index f91fcd4120..659c57c865 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -351,6 +351,7 @@ pub enum CancellationSource {
 pub enum CancellationOutcome {
     NotFound,
     Found,
+    RateLimitExceeded,
 }
 
 #[derive(LabelGroup)]
diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs
index 9415b54a4a..5d9468d89a 100644
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -268,12 +268,18 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     let record_handshake_error = !ctx.has_private_peer_addr();
     let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
     let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error);
+
     let (mut stream, params) =
         match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
             HandshakeData::Startup(stream, params) => (stream, params),
             HandshakeData::Cancel(cancel_key_data) => {
                 return Ok(cancellation_handler
-                    .cancel_session(cancel_key_data, ctx.session_id())
+                    .cancel_session(
+                        cancel_key_data,
+                        ctx.session_id(),
+                        &ctx.peer_addr(),
+                        config.authentication_config.ip_allowlist_check_enabled,
+                    )
                     .await
                     .map(|()| None)?)
             }
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index 4259fd04f4..a048721e77 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -14,13 +14,13 @@ use tracing::info;
 
 use crate::intern::EndpointIdInt;
 
-pub(crate) struct GlobalRateLimiter {
+pub struct GlobalRateLimiter {
     data: Vec<RateBucket>,
     info: Vec<RateBucketInfo>,
 }
 
 impl GlobalRateLimiter {
-    pub(crate) fn new(info: Vec<RateBucketInfo>) -> Self {
+    pub fn new(info: Vec<RateBucketInfo>) -> Self {
         Self {
             data: vec![
                 RateBucket {
@@ -34,7 +34,7 @@ impl GlobalRateLimiter {
     }
 
     /// Check that number of connections is below `max_rps` rps.
-    pub(crate) fn check(&mut self) -> bool {
+    pub fn check(&mut self) -> bool {
         let now = Instant::now();
 
         let should_allow_request = self
@@ -137,6 +137,19 @@ impl RateBucketInfo {
         Self::new(200, Duration::from_secs(600)),
     ];
 
+    /// All of these are per endpoint-maskedip pair.
+    /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
+    ///
+    /// First bucket: 1000mcpus total per endpoint-ip pair
+    /// * 4096000 requests per second with 1 hash rounds.
+    /// * 1000 requests per second with 4096 hash rounds.
+    /// * 6.8 requests per second with 600000 hash rounds.
+    pub const DEFAULT_AUTH_SET: [Self; 3] = [
+        Self::new(1000 * 4096, Duration::from_secs(1)),
+        Self::new(600 * 4096, Duration::from_secs(60)),
+        Self::new(300 * 4096, Duration::from_secs(600)),
+    ];
+
     pub fn rps(&self) -> f64 {
         (self.max_rpi as f64) / self.interval.as_secs_f64()
     }
diff --git a/proxy/src/rate_limiter/mod.rs b/proxy/src/rate_limiter/mod.rs
index 3ae2ecaf8f..5f90102da3 100644
--- a/proxy/src/rate_limiter/mod.rs
+++ b/proxy/src/rate_limiter/mod.rs
@@ -8,5 +8,4 @@ pub(crate) use limit_algorithm::aimd::Aimd;
 pub(crate) use limit_algorithm::{
     DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token,
 };
-pub(crate) use limiter::GlobalRateLimiter;
-pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
+pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs
index 7392b0d316..633a2f1b81 100644
--- a/proxy/src/redis/cancellation_publisher.rs
+++ b/proxy/src/redis/cancellation_publisher.rs
@@ -1,5 +1,6 @@
 use std::sync::Arc;
 
+use core::net::IpAddr;
 use pq_proto::CancelKeyData;
 use redis::AsyncCommands;
 use tokio::sync::Mutex;
@@ -15,6 +16,7 @@ pub trait CancellationPublisherMut: Send + Sync + 'static {
         &mut self,
         cancel_key_data: CancelKeyData,
         session_id: Uuid,
+        peer_addr: IpAddr,
     ) -> anyhow::Result<()>;
 }
 
@@ -24,6 +26,7 @@ pub trait CancellationPublisher: Send + Sync + 'static {
         &self,
         cancel_key_data: CancelKeyData,
         session_id: Uuid,
+        peer_addr: IpAddr,
     ) -> anyhow::Result<()>;
 }
 
@@ -32,6 +35,7 @@ impl CancellationPublisher for () {
         &self,
         _cancel_key_data: CancelKeyData,
         _session_id: Uuid,
+        _peer_addr: IpAddr,
     ) -> anyhow::Result<()> {
         Ok(())
     }
@@ -42,8 +46,10 @@ impl<P: CancellationPublisher> CancellationPublisherMut for P {
         &mut self,
         cancel_key_data: CancelKeyData,
         session_id: Uuid,
+        peer_addr: IpAddr,
     ) -> anyhow::Result<()> {
-        <P as CancellationPublisher>::try_publish(self, cancel_key_data, session_id).await
+        <P as CancellationPublisher>::try_publish(self, cancel_key_data, session_id, peer_addr)
+            .await
     }
 }
 
@@ -52,9 +58,10 @@ impl<P: CancellationPublisher> CancellationPublisher for Option<P> {
         &self,
         cancel_key_data: CancelKeyData,
         session_id: Uuid,
+        peer_addr: IpAddr,
     ) -> anyhow::Result<()> {
         if let Some(p) = self {
-            p.try_publish(cancel_key_data, session_id).await
+            p.try_publish(cancel_key_data, session_id, peer_addr).await
         } else {
             Ok(())
         }
@@ -66,10 +73,11 @@ impl<P: CancellationPublisherMut> CancellationPublisher for Arc<Mutex<P>> {
         &self,
         cancel_key_data: CancelKeyData,
         session_id: Uuid,
+        peer_addr: IpAddr,
     ) -> anyhow::Result<()> {
         self.lock()
             .await
-            .try_publish(cancel_key_data, session_id)
+            .try_publish(cancel_key_data, session_id, peer_addr)
             .await
     }
 }
@@ -97,11 +105,13 @@ impl RedisPublisherClient {
         &mut self,
         cancel_key_data: CancelKeyData,
         session_id: Uuid,
+        peer_addr: IpAddr,
     ) -> anyhow::Result<()> {
         let payload = serde_json::to_string(&Notification::Cancel(CancelSession {
             region_id: Some(self.region_id.clone()),
             cancel_key_data,
             session_id,
+            peer_addr: Some(peer_addr),
         }))?;
         let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?;
         Ok(())
@@ -120,13 +130,14 @@ impl RedisPublisherClient {
         &mut self,
         cancel_key_data: CancelKeyData,
         session_id: Uuid,
+        peer_addr: IpAddr,
     ) -> anyhow::Result<()> {
         // TODO: review redundant error duplication logs.
         if !self.limiter.check() {
             tracing::info!("Rate limit exceeded. Skipping cancellation message");
             return Err(anyhow::anyhow!("Rate limit exceeded"));
         }
-        match self.publish(cancel_key_data, session_id).await {
+        match self.publish(cancel_key_data, session_id, peer_addr).await {
             Ok(()) => return Ok(()),
             Err(e) => {
                 tracing::error!("failed to publish a message: {e}");
@@ -134,7 +145,7 @@ impl RedisPublisherClient {
         }
         tracing::info!("Publisher is disconnected. Reconnectiong...");
         self.try_connect().await?;
-        self.publish(cancel_key_data, session_id).await
+        self.publish(cancel_key_data, session_id, peer_addr).await
     }
 }
 
@@ -143,9 +154,13 @@ impl CancellationPublisherMut for RedisPublisherClient {
         &mut self,
         cancel_key_data: CancelKeyData,
         session_id: Uuid,
+        peer_addr: IpAddr,
     ) -> anyhow::Result<()> {
         tracing::info!("publishing cancellation key to Redis");
-        match self.try_publish_internal(cancel_key_data, session_id).await {
+        match self
+            .try_publish_internal(cancel_key_data, session_id, peer_addr)
+            .await
+        {
             Ok(()) => {
                 tracing::debug!("cancellation key successfuly published to Redis");
                 Ok(())
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index 62e7b1b565..65008ae943 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -60,6 +60,7 @@ pub(crate) struct CancelSession {
     pub(crate) region_id: Option<String>,
     pub(crate) cancel_key_data: CancelKeyData,
     pub(crate) session_id: Uuid,
+    pub(crate) peer_addr: Option<std::net::IpAddr>,
 }
 
 fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
@@ -137,10 +138,20 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
                         return Ok(());
                     }
                 }
+
+                // TODO: Remove unspecified peer_addr after the complete migration to the new format
+                let peer_addr = cancel_session
+                    .peer_addr
+                    .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED));
                 // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message.
                 match self
                     .cancellation_handler
-                    .cancel_session(cancel_session.cancel_key_data, uuid::Uuid::nil())
+                    .cancel_session(
+                        cancel_session.cancel_key_data,
+                        uuid::Uuid::nil(),
+                        &peer_addr,
+                        cancel_session.peer_addr.is_some(),
+                    )
                     .await
                 {
                     Ok(()) => {}
@@ -335,6 +346,7 @@ mod tests {
             cancel_key_data,
             region_id: None,
             session_id: uuid,
+            peer_addr: None,
         });
         let text = serde_json::to_string(&msg)?;
         let result: Notification = serde_json::from_str(&text)?;
@@ -344,6 +356,7 @@ mod tests {
             cancel_key_data,
             region_id: Some("region".to_string()),
             session_id: uuid,
+            peer_addr: None,
         });
         let text = serde_json::to_string(&msg)?;
         let result: Notification = serde_json::from_str(&text)?;

From 211e4174d2571c15b15904827484219aaa569bc6 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 22 Nov 2024 12:50:00 -0500
Subject: [PATCH 35/51] fix(pageserver): preempt and retry azure list operation
 (#9840)

## Problem

close https://github.com/neondatabase/neon/issues/9836

Looking at Azure SDK, the only related issue I can find is
https://github.com/azure/azure-sdk-for-rust/issues/1549. Azure uses
reqwest as the backend, so I assume there's some underlying magic
unknown to us that might have caused the stuck in #9836.

The observation is:
* We didn't get an explicit out of resource HTTP error from Azure.
* The connection simply gets stuck and times out.
* But when we retry after we reach the timeout, it succeeds.

This issue is hard to identify -- maybe something went wrong at the ABS
side, or something wrong with our side. But we know that a retry will
usually succeed if we give up the stuck connection.

Therefore, I propose the fix that we preempt stuck HTTP operation and
actively retry. This would mitigate the problem, while in the long run,
we need to keep an eye on ABS usage and see if we can fully resolve this
problem.

The reasoning of such timeout mechanism: we use a much smaller timeout
than before to preempt, while it is possible that a normal listing
operation would take a longer time than the initial timeout if it
contains a lot of keys. Therefore, after we terminate the connection, we
should double the timeout, so that such requests would eventually
succeed.

## Summary of changes

* Use exponential growth for ABS list timeout.
* Rather than using a fixed timeout, use a timeout that starts small and
grows
* Rather than exposing timeouts to the list_streaming caller as soon as
we see them, only do so after we have retried a few times

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/remote_storage/src/azure_blob.rs | 61 ++++++++++++++++++---------
 1 file changed, 41 insertions(+), 20 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 1c0d43d479..ae0a94295c 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -24,6 +24,7 @@ use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerCl
 use bytes::Bytes;
 use futures::future::Either;
 use futures::stream::Stream;
+use futures::FutureExt;
 use futures_util::StreamExt;
 use futures_util::TryStreamExt;
 use http_types::{StatusCode, Url};
@@ -31,6 +32,7 @@ use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 use utils::backoff;
+use utils::backoff::exponential_backoff_duration_seconds;
 
 use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
 use crate::{
@@ -302,40 +304,59 @@ impl RemoteStorage for AzureBlobStorage {
 
             let mut next_marker = None;
 
+            let mut timeout_try_cnt = 1;
+
             'outer: loop {
                 let mut builder = builder.clone();
                 if let Some(marker) = next_marker.clone() {
                     builder = builder.marker(marker);
                 }
-                let response = builder.into_stream();
-                let response = response.into_stream().map_err(to_download_error);
-                let response = tokio_stream::StreamExt::timeout(response, self.timeout);
-                let response = response.map(|res| match res {
-                    Ok(res) => res,
-                    Err(_elapsed) => Err(DownloadError::Timeout),
+                // Azure Blob Rust SDK does not expose the list blob API directly. Users have to use
+                // their pageable iterator wrapper that returns all keys as a stream. We want to have
+                // full control of paging, and therefore we only take the first item from the stream.
+                let mut response_stream = builder.into_stream();
+                let response = response_stream.next();
+                // Timeout mechanism: Azure client will sometimes stuck on a request, but retrying that request
+                // would immediately succeed. Therefore, we use exponential backoff timeout to retry the request.
+                // (Usually, exponential backoff is used to determine the sleep time between two retries.) We
+                // start with 10.0 second timeout, and double the timeout for each failure, up to 5 failures.
+                // timeout = min(5 * (1.0+1.0)^n, self.timeout).
+                let this_timeout = (5.0 * exponential_backoff_duration_seconds(timeout_try_cnt, 1.0, self.timeout.as_secs_f64())).min(self.timeout.as_secs_f64());
+                let response = tokio::time::timeout(Duration::from_secs_f64(this_timeout), response);
+                let response = response.map(|res| {
+                    match res {
+                        Ok(Some(Ok(res))) => Ok(Some(res)),
+                        Ok(Some(Err(e)))  => Err(to_download_error(e)),
+                        Ok(None) => Ok(None),
+                        Err(_elasped) => Err(DownloadError::Timeout),
+                    }
                 });
-
-                let mut response = std::pin::pin!(response);
-
                 let mut max_keys = max_keys.map(|mk| mk.get());
                 let next_item = tokio::select! {
-                    op = response.next() => Ok(op),
+                    op = response => op,
                     _ = cancel.cancelled() => Err(DownloadError::Cancelled),
-                }?;
+                };
+
+                if let Err(DownloadError::Timeout) = &next_item {
+                    timeout_try_cnt += 1;
+                    if timeout_try_cnt <= 5 {
+                        continue;
+                    }
+                }
+
+                let next_item = next_item?;
+
+                if timeout_try_cnt >= 2 {
+                    tracing::warn!("Azure Blob Storage list timed out and succeeded after {} tries", timeout_try_cnt);
+                }
+                timeout_try_cnt = 1;
+
                 let Some(entry) = next_item else {
                     // The list is complete, so yield it.
                     break;
                 };
 
                 let mut res = Listing::default();
-                let entry = match entry {
-                    Ok(entry) => entry,
-                    Err(e) => {
-                        // The error is potentially retryable, so we must rewind the loop after yielding.
-                        yield Err(e);
-                        continue;
-                    }
-                };
                 next_marker = entry.continuation();
                 let prefix_iter = entry
                     .blobs
@@ -351,7 +372,7 @@ impl RemoteStorage for AzureBlobStorage {
                         last_modified: k.properties.last_modified.into(),
                         size: k.properties.content_length,
                     }
-                    );
+                );
 
                 for key in blob_iter {
                     res.keys.push(key);

From e939d36dd4bbf6961f3eac9b4dc3e3ff82b62694 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Fri, 22 Nov 2024 18:50:33 +0100
Subject: [PATCH 36/51] safekeeper,pageserver: fix CPU profiling allowlists
 (#9856)

## Problem

The HTTP router allowlists matched both on the path and the query
string. This meant that only `/profile/cpu` would be allowed without
auth, while `/profile/cpu?format=svg` would require auth.

Follows #9764.

## Summary of changes

* Match allowlists on URI path, rather than the entire URI.
* Fix the allowlist for Safekeeper to use `/profile/cpu` rather than the
old `/pprof/profile`.
* Just use a constant slice for the allowlist; it's only a handful of
items, and these handlers are not on hot paths.
---
 pageserver/src/http/routes.rs | 11 ++++-------
 safekeeper/src/http/routes.rs | 15 ++++-----------
 2 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 9bd1929b0b..7168850ed6 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -126,7 +126,7 @@ pub struct State {
     conf: &'static PageServerConf,
     tenant_manager: Arc<TenantManager>,
     auth: Option<Arc<SwappableJwtAuth>>,
-    allowlist_routes: Vec<Uri>,
+    allowlist_routes: &'static [&'static str],
     remote_storage: GenericRemoteStorage,
     broker_client: storage_broker::BrokerClientChannel,
     disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
@@ -147,16 +147,13 @@ impl State {
         deletion_queue_client: DeletionQueueClient,
         secondary_controller: SecondaryController,
     ) -> anyhow::Result<Self> {
-        let allowlist_routes = [
+        let allowlist_routes = &[
             "/v1/status",
             "/v1/doc",
             "/swagger.yml",
             "/metrics",
             "/profile/cpu",
-        ]
-        .iter()
-        .map(|v| v.parse().unwrap())
-        .collect::<Vec<_>>();
+        ];
         Ok(Self {
             conf,
             tenant_manager,
@@ -3155,7 +3152,7 @@ pub fn make_router(
     if auth.is_some() {
         router = router.middleware(auth_middleware(|request| {
             let state = get_state(request);
-            if state.allowlist_routes.contains(request.uri()) {
+            if state.allowlist_routes.contains(&request.uri().path()) {
                 None
             } else {
                 state.auth.as_deref()
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 9a5a1c58b6..28294abdb9 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -1,7 +1,6 @@
-use hyper::{Body, Request, Response, StatusCode, Uri};
-use once_cell::sync::Lazy;
+use hyper::{Body, Request, Response, StatusCode};
 use serde::{Deserialize, Serialize};
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 use std::fmt;
 use std::io::Write as _;
 use std::str::FromStr;
@@ -574,14 +573,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
     let mut router = endpoint::make_router();
     if conf.http_auth.is_some() {
         router = router.middleware(auth_middleware(|request| {
-            #[allow(clippy::mutable_key_type)]
-            static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> = Lazy::new(|| {
-                ["/v1/status", "/metrics", "/pprof/profile"]
-                    .iter()
-                    .map(|v| v.parse().unwrap())
-                    .collect()
-            });
-            if ALLOWLIST_ROUTES.contains(request.uri()) {
+            const ALLOWLIST_ROUTES: &[&str] = &["/v1/status", "/metrics", "/profile/cpu"];
+            if ALLOWLIST_ROUTES.contains(&request.uri().path()) {
                 None
             } else {
                 // Option<Arc<SwappableJwtAuth>> is always provided as data below, hence unwrap().

From 6f8b1eb5a6d66f111ea6143b56ab185f6c3244d6 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 22 Nov 2024 13:21:51 -0500
Subject: [PATCH 37/51] test(pageserver): add detach ancestor smoke test
 (#9842)

## Problem

Follow up to https://github.com/neondatabase/neon/pull/9682, hopefully
we can detect some issues or assure ourselves that this is ready for
production.

## Summary of changes

* Add a compaction-detach-ancestor smoke test.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/fixtures/pageserver/http.py       |  2 +-
 test_runner/fixtures/workload.py              |  5 +-
 .../regress/test_timeline_detach_ancestor.py  | 54 ++++++++++++++++++-
 3 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 98330ba350..4df624def3 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -343,7 +343,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         assert isinstance(res_json["tenant_shards"], list)
         return res_json
 
-    def tenant_get_location(self, tenant_id: TenantShardId):
+    def tenant_get_location(self, tenant_id: TenantId | TenantShardId):
         res = self.get(
             f"http://localhost:{self.port}/v1/location_config/{tenant_id}",
         )
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index 72dc102538..4c6b2b6b3e 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -94,9 +94,10 @@ class Workload:
     def __del__(self):
         self.stop()
 
-    def init(self, pageserver_id: int | None = None):
+    def init(self, pageserver_id: int | None = None, allow_recreate=False):
         endpoint = self.endpoint(pageserver_id)
-
+        if allow_recreate:
+            endpoint.safe_psql(f"DROP TABLE IF EXISTS {self.table};")
         endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);")
         endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
         last_flush_lsn_upload(
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 1547ebc35d..cd4e0a5f3b 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -23,7 +23,8 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.utils import assert_pageserver_backups_equal, wait_until
+from fixtures.utils import assert_pageserver_backups_equal, skip_in_debug_build, wait_until
+from fixtures.workload import Workload
 from requests import ReadTimeout
 
 
@@ -1550,6 +1551,57 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes(
     env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset)
 
 
+@skip_in_debug_build("only run with release build")
+def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBuilder):
+    SMOKE_CONF = {
+        # Run both gc and gc-compaction.
+        "gc_period": "5s",
+        "compaction_period": "5s",
+        # No PiTR interval and small GC horizon
+        "pitr_interval": "0s",
+        "gc_horizon": f"{1024 ** 2}",
+        "lsn_lease_length": "0s",
+        # Small checkpoint distance to create many layers
+        "checkpoint_distance": 1024**2,
+        # Compact small layers
+        "compaction_target_size": 1024**2,
+        "image_creation_threshold": 2,
+    }
+
+    env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    row_count = 10000
+    churn_rounds = 50
+
+    ps_http = env.pageserver.http_client()
+
+    workload_parent = Workload(env, tenant_id, timeline_id)
+    workload_parent.init(env.pageserver.id)
+    log.info("Writing initial data ...")
+    workload_parent.write_rows(row_count, env.pageserver.id)
+    branch_id = env.create_branch("child")
+    workload_child = Workload(env, tenant_id, branch_id, branch_name="child")
+    workload_child.init(env.pageserver.id, allow_recreate=True)
+    log.info("Writing initial data on child...")
+    workload_child.write_rows(row_count, env.pageserver.id)
+
+    for i in range(1, churn_rounds + 1):
+        if i % 10 == 0:
+            log.info(f"Running churn round {i}/{churn_rounds} ...")
+
+        workload_parent.churn_rows(row_count, env.pageserver.id)
+        workload_child.churn_rows(row_count, env.pageserver.id)
+
+    ps_http.detach_ancestor(tenant_id, branch_id)
+
+    log.info("Validating at workload end ...")
+    workload_parent.validate(env.pageserver.id)
+    workload_child.validate(env.pageserver.id)
+
+
 # TODO:
 # - branch near existing L1 boundary, image layers?
 # - investigate: why are layers started at uneven lsn? not just after branching, but in general.

From c1937d073fdf13984da1781e3ca3ef83b22ea092 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 22 Nov 2024 13:30:53 -0500
Subject: [PATCH 38/51] fix(pageserver): ensure upload happens after delete
 (#9844)

## Problem

Follow up of https://github.com/neondatabase/neon/pull/9682, that patch
didn't fully address the problem: what if shutdown fails due to whatever
reason and then we reattach the tenant? Then we will still remove the
future layer. The underlying problem is that the fix for #5878 gets
voided because of the generation optimizations.

Of course, we also need to ensure that delete happens after uploads, but
note that we only schedule deletes when there are no ongoing upload
tasks, so that's fine.

## Summary of changes

* Add a test case to reproduce the behavior (by changing the original
test case to attach the same generation).
* If layer upload happens after the deletion, drain the deletion queue
before uploading.
* If blocked_deletion is enabled, directly remove it from the
blocked_deletion queue.
* Local fs backend fix to avoid race between deletion and preload.
* test_emergency_mode does not need to wait for uploads (and it's
generally not possible to wait for uploads).
* ~~Optimize deletion executor to skip validation if there are no files
to delete.~~ this doesn't work

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/remote_storage/src/local_fs.rs           |  7 +-
 pageserver/src/deletion_queue/deleter.rs      |  2 +
 .../src/tenant/remote_timeline_client.rs      | 97 ++++++++++++++++---
 pageserver/src/tenant/timeline.rs             |  1 +
 pageserver/src/tenant/upload_queue.rs         | 23 +++--
 test_runner/fixtures/neon_fixtures.py         | 22 ++++-
 test_runner/fixtures/pageserver/http.py       |  4 +-
 .../regress/test_layers_from_future.py        | 64 ++++++++----
 .../regress/test_pageserver_generations.py    |  6 +-
 9 files changed, 184 insertions(+), 42 deletions(-)

diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 553153826e..ee2fc9d6e2 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -360,7 +360,12 @@ impl RemoteStorage for LocalFs {
             let mut objects = Vec::with_capacity(keys.len());
             for key in keys {
                 let path = key.with_base(&self.storage_root);
-                let metadata = file_metadata(&path).await?;
+                let metadata = file_metadata(&path).await;
+                if let Err(DownloadError::NotFound) = metadata {
+                    // Race: if the file is deleted between listing and metadata check, ignore it.
+                    continue;
+                }
+                let metadata = metadata?;
                 if metadata.is_dir() {
                     continue;
                 }
diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs
index 1f04bc0410..3d02387c98 100644
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -15,6 +15,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::info;
 use tracing::warn;
 use utils::backoff;
+use utils::pausable_failpoint;
 
 use crate::metrics;
 
@@ -90,6 +91,7 @@ impl Deleter {
     /// Block until everything in accumulator has been executed
     async fn flush(&mut self) -> Result<(), DeletionQueueError> {
         while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
+            pausable_failpoint!("deletion-queue-before-execute-pause");
             match self.remote_delete().await {
                 Ok(()) => {
                     // Note: we assume that the remote storage layer returns Ok(()) if some
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 377bc23542..4c88282214 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -199,7 +199,7 @@ use utils::backoff::{
 use utils::pausable_failpoint;
 use utils::shard::ShardNumber;
 
-use std::collections::{HashMap, VecDeque};
+use std::collections::{HashMap, HashSet, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex, OnceLock};
 use std::time::Duration;
@@ -223,7 +223,7 @@ use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::download::download_retry;
 use crate::tenant::storage_layer::AsLayerDesc;
-use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable};
+use crate::tenant::upload_queue::{Delete, OpType, UploadQueueStoppedDeletable};
 use crate::tenant::TIMELINES_SEGMENT_NAME;
 use crate::{
     config::PageServerConf,
@@ -1090,7 +1090,7 @@ impl RemoteTimelineClient {
             "scheduled layer file upload {layer}",
         );
 
-        let op = UploadOp::UploadLayer(layer, metadata);
+        let op = UploadOp::UploadLayer(layer, metadata, None);
         self.metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
     }
@@ -1805,7 +1805,7 @@ impl RemoteTimelineClient {
                     // have finished.
                     upload_queue.inprogress_tasks.is_empty()
                 }
-                UploadOp::Delete(_) => {
+                UploadOp::Delete(..) => {
                     // Wait for preceding uploads to finish. Concurrent deletions are OK, though.
                     upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
                 }
@@ -1833,19 +1833,32 @@ impl RemoteTimelineClient {
             }
 
             // We can launch this task. Remove it from the queue first.
-            let next_op = upload_queue.queued_operations.pop_front().unwrap();
+            let mut next_op = upload_queue.queued_operations.pop_front().unwrap();
 
             debug!("starting op: {}", next_op);
 
-            // Update the counters
-            match next_op {
-                UploadOp::UploadLayer(_, _) => {
+            // Update the counters and prepare
+            match &mut next_op {
+                UploadOp::UploadLayer(layer, meta, mode) => {
+                    if upload_queue
+                        .recently_deleted
+                        .remove(&(layer.layer_desc().layer_name().clone(), meta.generation))
+                    {
+                        *mode = Some(OpType::FlushDeletion);
+                    } else {
+                        *mode = Some(OpType::MayReorder)
+                    }
                     upload_queue.num_inprogress_layer_uploads += 1;
                 }
                 UploadOp::UploadMetadata { .. } => {
                     upload_queue.num_inprogress_metadata_uploads += 1;
                 }
-                UploadOp::Delete(_) => {
+                UploadOp::Delete(Delete { layers }) => {
+                    for (name, meta) in layers {
+                        upload_queue
+                            .recently_deleted
+                            .insert((name.clone(), meta.generation));
+                    }
                     upload_queue.num_inprogress_deletions += 1;
                 }
                 UploadOp::Barrier(sender) => {
@@ -1921,7 +1934,66 @@ impl RemoteTimelineClient {
             }
 
             let upload_result: anyhow::Result<()> = match &task.op {
-                UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
+                UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
+                    if let Some(OpType::FlushDeletion) = mode {
+                        if self.config.read().unwrap().block_deletions {
+                            // Of course, this is not efficient... but usually the queue should be empty.
+                            let mut queue_locked = self.upload_queue.lock().unwrap();
+                            let mut detected = false;
+                            if let Ok(queue) = queue_locked.initialized_mut() {
+                                for list in queue.blocked_deletions.iter_mut() {
+                                    list.layers.retain(|(name, meta)| {
+                                        if name == &layer.layer_desc().layer_name()
+                                            && meta.generation == layer_metadata.generation
+                                        {
+                                            detected = true;
+                                            // remove the layer from deletion queue
+                                            false
+                                        } else {
+                                            // keep the layer
+                                            true
+                                        }
+                                    });
+                                }
+                            }
+                            if detected {
+                                info!(
+                                    "cancelled blocked deletion of layer {} at gen {:?}",
+                                    layer.layer_desc().layer_name(),
+                                    layer_metadata.generation
+                                );
+                            }
+                        } else {
+                            // TODO: we did not guarantee that upload task starts after deletion task, so there could be possibly race conditions
+                            // that we still get the layer deleted. But this only happens if someone creates a layer immediately after it's deleted,
+                            // which is not possible in the current system.
+                            info!(
+                                "waiting for deletion queue flush to complete before uploading layer {} at gen {:?}",
+                                layer.layer_desc().layer_name(),
+                                layer_metadata.generation
+                            );
+                            {
+                                // We are going to flush, we can clean up the recently deleted list.
+                                let mut queue_locked = self.upload_queue.lock().unwrap();
+                                if let Ok(queue) = queue_locked.initialized_mut() {
+                                    queue.recently_deleted.clear();
+                                }
+                            }
+                            if let Err(e) = self.deletion_queue_client.flush_execute().await {
+                                warn!(
+                                    "failed to flush the deletion queue before uploading layer {} at gen {:?}, still proceeding to upload: {e:#} ",
+                                    layer.layer_desc().layer_name(),
+                                    layer_metadata.generation
+                                );
+                            } else {
+                                info!(
+                                    "done flushing deletion queue before uploading layer {} at gen {:?}",
+                                    layer.layer_desc().layer_name(),
+                                    layer_metadata.generation
+                                );
+                            }
+                        }
+                    }
                     let local_path = layer.local_path();
 
                     // We should only be uploading layers created by this `Tenant`'s lifetime, so
@@ -2085,7 +2157,7 @@ impl RemoteTimelineClient {
             upload_queue.inprogress_tasks.remove(&task.task_id);
 
             let lsn_update = match task.op {
-                UploadOp::UploadLayer(_, _) => {
+                UploadOp::UploadLayer(_, _, _) => {
                     upload_queue.num_inprogress_layer_uploads -= 1;
                     None
                 }
@@ -2162,7 +2234,7 @@ impl RemoteTimelineClient {
     )> {
         use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
         let res = match op {
-            UploadOp::UploadLayer(_, m) => (
+            UploadOp::UploadLayer(_, m, _) => (
                 RemoteOpFileKind::Layer,
                 RemoteOpKind::Upload,
                 RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
@@ -2259,6 +2331,7 @@ impl RemoteTimelineClient {
                         blocked_deletions: Vec::new(),
                         shutting_down: false,
                         shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+                        recently_deleted: HashSet::new(),
                     };
 
                     let upload_queue = std::mem::replace(
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0c7f3204f6..d1285e7c8a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2652,6 +2652,7 @@ impl Timeline {
         //
         // NB: generation numbers naturally protect against this because they disambiguate
         //     (1) and (4)
+        // TODO: this is basically a no-op now, should we remove it?
         self.remote_client.schedule_barrier()?;
         // Tenant::create_timeline will wait for these uploads to happen before returning, or
         // on retry.
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index f14bf2f8c3..ef3aa759f3 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -3,6 +3,7 @@ use super::storage_layer::ResidentLayer;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use std::collections::HashSet;
 use std::collections::{HashMap, VecDeque};
 use std::fmt::Debug;
 
@@ -14,7 +15,6 @@ use utils::lsn::AtomicLsn;
 use std::sync::atomic::AtomicU32;
 use utils::lsn::Lsn;
 
-#[cfg(feature = "testing")]
 use utils::generation::Generation;
 
 // clippy warns that Uninitialized is much smaller than Initialized, which wastes
@@ -38,6 +38,12 @@ impl UploadQueue {
     }
 }
 
+#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
+pub(crate) enum OpType {
+    MayReorder,
+    FlushDeletion,
+}
+
 /// This keeps track of queued and in-progress tasks.
 pub(crate) struct UploadQueueInitialized {
     /// Counter to assign task IDs
@@ -88,6 +94,9 @@ pub(crate) struct UploadQueueInitialized {
     #[cfg(feature = "testing")]
     pub(crate) dangling_files: HashMap<LayerName, Generation>,
 
+    /// Ensure we order file operations correctly.
+    pub(crate) recently_deleted: HashSet<(LayerName, Generation)>,
+
     /// Deletions that are blocked by the tenant configuration
     pub(crate) blocked_deletions: Vec<Delete>,
 
@@ -183,6 +192,7 @@ impl UploadQueue {
             queued_operations: VecDeque::new(),
             #[cfg(feature = "testing")]
             dangling_files: HashMap::new(),
+            recently_deleted: HashSet::new(),
             blocked_deletions: Vec::new(),
             shutting_down: false,
             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
@@ -224,6 +234,7 @@ impl UploadQueue {
             queued_operations: VecDeque::new(),
             #[cfg(feature = "testing")]
             dangling_files: HashMap::new(),
+            recently_deleted: HashSet::new(),
             blocked_deletions: Vec::new(),
             shutting_down: false,
             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
@@ -282,8 +293,8 @@ pub(crate) struct Delete {
 
 #[derive(Debug)]
 pub(crate) enum UploadOp {
-    /// Upload a layer file
-    UploadLayer(ResidentLayer, LayerFileMetadata),
+    /// Upload a layer file. The last field indicates the last operation for thie file.
+    UploadLayer(ResidentLayer, LayerFileMetadata, Option<OpType>),
 
     /// Upload a index_part.json file
     UploadMetadata {
@@ -305,11 +316,11 @@ pub(crate) enum UploadOp {
 impl std::fmt::Display for UploadOp {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
         match self {
-            UploadOp::UploadLayer(layer, metadata) => {
+            UploadOp::UploadLayer(layer, metadata, mode) => {
                 write!(
                     f,
-                    "UploadLayer({}, size={:?}, gen={:?})",
-                    layer, metadata.file_size, metadata.generation
+                    "UploadLayer({}, size={:?}, gen={:?}, mode={:?})",
+                    layer, metadata.file_size, metadata.generation, mode
                 )
             }
             UploadOp::UploadMetadata { uploaded, .. } => {
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index e04cadf46f..d8d2b87b4e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4942,6 +4942,7 @@ def last_flush_lsn_upload(
     timeline_id: TimelineId,
     pageserver_id: int | None = None,
     auth_token: str | None = None,
+    wait_until_uploaded: bool = True,
 ) -> Lsn:
     """
     Wait for pageserver to catch to the latest flush LSN of given endpoint,
@@ -4955,7 +4956,9 @@ def last_flush_lsn_upload(
     for tenant_shard_id, pageserver in shards:
         ps_http = pageserver.http_client(auth_token=auth_token)
         wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
-        ps_http.timeline_checkpoint(tenant_shard_id, timeline_id, wait_until_uploaded=True)
+        ps_http.timeline_checkpoint(
+            tenant_shard_id, timeline_id, wait_until_uploaded=wait_until_uploaded
+        )
     return last_flush_lsn
 
 
@@ -4980,6 +4983,7 @@ def generate_uploads_and_deletions(
     timeline_id: TimelineId | None = None,
     data: str | None = None,
     pageserver: NeonPageserver,
+    wait_until_uploaded: bool = True,
 ):
     """
     Using the environment's default tenant + timeline, generate a load pattern
@@ -5002,7 +5006,12 @@ def generate_uploads_and_deletions(
         if init:
             endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
             last_flush_lsn_upload(
-                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+                env,
+                endpoint,
+                tenant_id,
+                timeline_id,
+                pageserver_id=pageserver.id,
+                wait_until_uploaded=wait_until_uploaded,
             )
 
         def churn(data):
@@ -5025,7 +5034,12 @@ def generate_uploads_and_deletions(
             # in a state where there are "future layers" in remote storage that will generate deletions
             # after a restart.
             last_flush_lsn_upload(
-                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+                env,
+                endpoint,
+                tenant_id,
+                timeline_id,
+                pageserver_id=pageserver.id,
+                wait_until_uploaded=wait_until_uploaded,
             )
 
         # Compaction should generate some GC-elegible layers
@@ -5041,4 +5055,4 @@ def generate_uploads_and_deletions(
         # background ingest, no more uploads pending, and therefore no non-determinism
         # in subsequent actions like pageserver restarts.
         flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
-        ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=wait_until_uploaded)
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 4df624def3..56386fdd37 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -794,7 +794,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         if compact is not None:
             query["compact"] = "true" if compact else "false"
 
-        log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
+        log.info(
+            f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}, wait_until_uploaded={wait_until_uploaded}"
+        )
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
             params=query,
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index 309e0f3015..761ec7568f 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import time
 
+import pytest
 from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
@@ -19,7 +20,11 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 from fixtures.utils import query_scalar, wait_until
 
 
-def test_issue_5878(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize(
+    "attach_mode",
+    ["default_generation", "same_generation"],
+)
+def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str):
     """
     Regression test for issue https://github.com/neondatabase/neon/issues/5878 .
 
@@ -168,11 +173,32 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     tenant_conf = ps_http.tenant_config(tenant_id)
     generation_before_detach = get_generation_number()
     env.pageserver.tenant_detach(tenant_id)
-    failpoint_name = "before-delete-layer-pausable"
+    failpoint_deletion_queue = "deletion-queue-before-execute-pause"
 
-    ps_http.configure_failpoints((failpoint_name, "pause"))
-    env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
-    generation_after_reattach = get_generation_number()
+    ps_http.configure_failpoints((failpoint_deletion_queue, "pause"))
+
+    if attach_mode == "default_generation":
+        env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
+    elif attach_mode == "same_generation":
+        # Attach with the same generation number -- this is possible with timeline offload and detach ancestor
+        env.pageserver.tenant_attach(
+            tenant_id,
+            tenant_conf.tenant_specific_overrides,
+            generation=generation_before_detach,
+            # We want to avoid the generation bump and don't want to talk with the storcon
+            override_storage_controller_generation=False,
+        )
+    else:
+        raise AssertionError(f"Unknown attach_mode: {attach_mode}")
+
+    # Get it from pageserver API instead of storcon API b/c we might not have attached using the storcon
+    # API if attach_mode == "same_generation"
+    tenant_location = env.pageserver.http_client().tenant_get_location(tenant_id)
+    generation_after_reattach = tenant_location["generation"]
+
+    if attach_mode == "same_generation":
+        # The generation number should be the same as before the detach
+        assert generation_before_detach == generation_after_reattach
     wait_until_tenant_active(ps_http, tenant_id)
 
     # Ensure the IndexPart upload that unlinks the layer file finishes, i.e., doesn't clog the queue.
@@ -182,15 +208,8 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
 
     wait_until(10, 0.5, future_layer_is_gone_from_index_part)
 
-    # NB: the layer file is unlinked index part now, but, because we made the delete
-    # operation stuck, the layer file itself is still in the remote_storage
-    wait_until(
-        10,
-        0.5,
-        lambda: env.pageserver.assert_log_contains(
-            f".*{tenant_id}.*at failpoint.*{failpoint_name}"
-        ),
-    )
+    # We already make deletion stuck here, but we don't necessarily hit the failpoint
+    # because deletions are batched.
     future_layer_path = env.pageserver_remote_storage.remote_layer_path(
         tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
     )
@@ -224,11 +243,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
             break
         time.sleep(1)
 
-    # Window has passed, unstuck the delete, let upload queue drain.
+    # Window has passed, unstuck the delete, let deletion queue drain; the upload queue should
+    # have drained because we put these layer deletion operations into the deletion queue and
+    # have consumed the operation from the upload queue.
     log.info("unstuck the DELETE")
-    ps_http.configure_failpoints(("before-delete-layer-pausable", "off"))
-
+    ps_http.configure_failpoints((failpoint_deletion_queue, "off"))
     wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
+    env.pageserver.http_client().deletion_queue_flush(True)
 
     # Examine the resulting S3 state.
     log.info("integrity-check the remote storage")
@@ -247,3 +268,12 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     final_stat = future_layer_path.stat()
     log.info(f"future layer path: {future_layer_path}")
     assert final_stat.st_mtime != pre_stat.st_mtime
+
+    # Ensure no weird errors in the end...
+    wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
+
+    if attach_mode == "same_generation":
+        # we should have detected a race upload and deferred it
+        env.pageserver.assert_log_contains(
+            "waiting for deletion queue flush to complete before uploading layer"
+        )
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index d5bbfbc7fc..6ba5753420 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -459,7 +459,11 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     env.pageserver.start()
 
     # The pageserver should provide service to clients
-    generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver)
+    # Because it is in emergency mode, it will not attempt to validate deletions required by the initial barrier, and therefore
+    # other files cannot be uploaded b/c it's waiting for the initial barrier to be validated.
+    generate_uploads_and_deletions(
+        env, init=False, pageserver=env.pageserver, wait_until_uploaded=False
+    )
 
     # The pageserver should neither validate nor execute any deletions, it should have
     # loaded the DeletionLists from before though

From 3245f7b88dc0f7d73d84e545dcde38a7945b8489 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Fri, 22 Nov 2024 19:27:04 +0000
Subject: [PATCH 39/51] Rename 'installed_extensions' metric to
 'compute_installed_extensions' (#9759)

to keep it consistent with existing compute metrics.

flux-fleet change is not needed, because it doesn't have any filter by
metric name for compute metrics.
---
 compute_tools/src/installed_extensions.rs     |  2 +-
 .../regress/test_installed_extensions.py      | 20 +++++++++++++------
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs
index 6dd55855db..79d8b2ca04 100644
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -115,7 +115,7 @@ pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
 
 static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
-        "installed_extensions",
+        "compute_installed_extensions",
         "Number of databases where the version of extension is installed",
         &["extension_name", "version"]
     )
diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py
index 54ce7c8340..04ccec5875 100644
--- a/test_runner/regress/test_installed_extensions.py
+++ b/test_runner/regress/test_installed_extensions.py
@@ -99,11 +99,15 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
     res = client.metrics()
     info("Metrics: %s", res)
     m = parse_metrics(res)
-    neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"})
+    neon_m = m.query_all(
+        "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"}
+    )
     assert len(neon_m) == 1
     for sample in neon_m:
         assert sample.value == 2
-    neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"})
+    neon_m = m.query_all(
+        "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"}
+    )
     assert len(neon_m) == 1
     for sample in neon_m:
         assert sample.value == 1
@@ -116,7 +120,7 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
         try:
             res = client.metrics()
             timeout = -1
-            if len(parse_metrics(res).query_all("installed_extensions")) < 4:
+            if len(parse_metrics(res).query_all("compute_installed_extensions")) < 4:
                 # Assume that not all metrics that are collected yet
                 time.sleep(1)
                 timeout -= 1
@@ -128,17 +132,21 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
             continue
 
         assert (
-            len(parse_metrics(res).query_all("installed_extensions")) >= 4
+            len(parse_metrics(res).query_all("compute_installed_extensions")) >= 4
         ), "Not all metrics are collected"
 
         info("After restart metrics: %s", res)
         m = parse_metrics(res)
-        neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"})
+        neon_m = m.query_all(
+            "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"}
+        )
         assert len(neon_m) == 1
         for sample in neon_m:
             assert sample.value == 1
 
-        neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"})
+        neon_m = m.query_all(
+            "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"}
+        )
         assert len(neon_m) == 1
         for sample in neon_m:
             assert sample.value == 1

From 450be26bbbf2e39dd2184faed8e6601072006fb1 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 22 Nov 2024 23:47:06 +0100
Subject: [PATCH 40/51] fast imports: initial Importer and Storage changes
 (#9218)

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Stas Kelvic <stas@neon.tech>

# Context

This PR contains PoC-level changes for a product feature that allows
onboarding large databases into Neon without going through the regular
data path.

# Changes

This internal RFC provides all the context
* https://github.com/neondatabase/cloud/pull/19799

In the language of the RFC, this PR covers

* the Importer code (`fast_import`)
* all the Pageserver changes (mgmt API changes, flow implementation,
etc)
* a basic test for the Pageserver changes

# Reviewing

As acknowledged in the RFC, the code added in this PR is not ready for
general availability.
Also, the **architecture is not to be discussed in this PR**, but in the
RFC and associated Slack channel instead.

Reviewers of this PR should take that into consideration.
The quality bar to apply during review depends on what area of the code
is being reviewed:

* Importer code (`fast_import`): practically anything goes
* Core flow (`flow.rs`):
* Malicious input data must be expected and the existing threat models
apply.
* The code must not be safe to execute on *dedicated* Pageserver
instances:
* This means in particular that tenants *on other* Pageserver instances
must not be affected negatively wrt data confidentiality, integrity or
availability.
* Other code: the usual quality bar
* Pay special attention to correct use of gate guards, timeline
cancellation in all places during shutdown & migration, etc.
* Consider the broader system impact; if you find potentially
problematic interactions with Storage features that were not covered in
the RFC, bring that up during the review.

I recommend submitting three separate reviews, for the three high-level
areas with different quality bars.


# References

(Internal-only)

* refs https://github.com/neondatabase/cloud/issues/17507
* refs https://github.com/neondatabase/company_projects/issues/293
* refs https://github.com/neondatabase/company_projects/issues/309
* refs https://github.com/neondatabase/cloud/issues/20646

---------

Co-authored-by: Stas Kelvich <stas.kelvich@gmail.com>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: John Spray <john@neon.tech>
---
 Cargo.lock                                    |  47 +-
 Cargo.toml                                    |   8 +-
 compute/compute-node.Dockerfile               |  23 +-
 compute_tools/Cargo.toml                      |   7 +
 compute_tools/src/bin/fast_import.rs          | 338 ++++++++
 .../src/bin/fast_import/child_stdio_to_log.rs |  35 +
 compute_tools/src/bin/fast_import/s3_uri.rs   |  75 ++
 compute_tools/src/bin/fast_import/s5cmd.rs    |  27 +
 control_plane/src/bin/neon_local.rs           |   1 +
 libs/pageserver_api/Cargo.toml                |   1 +
 libs/pageserver_api/src/config.rs             |  13 +
 libs/pageserver_api/src/keyspace.rs           |   4 +-
 libs/pageserver_api/src/models.rs             |  41 +
 libs/postgres_initdb/Cargo.toml               |  12 +
 libs/postgres_initdb/src/lib.rs               | 103 +++
 pageserver/Cargo.toml                         |   2 +
 pageserver/src/config.rs                      |  10 +
 pageserver/src/http/openapi_spec.yml          |  30 +
 pageserver/src/http/routes.rs                 |  31 +
 pageserver/src/pgdatadir_mapping.rs           |  16 +-
 pageserver/src/task_mgr.rs                    |   2 +
 pageserver/src/tenant.rs                      | 577 ++++++++++---
 .../src/tenant/remote_timeline_client.rs      |  13 +
 .../tenant/remote_timeline_client/download.rs |   2 +-
 .../tenant/remote_timeline_client/index.rs    | 100 ++-
 pageserver/src/tenant/timeline.rs             |  28 +-
 .../src/tenant/timeline/import_pgdata.rs      | 218 +++++
 .../src/tenant/timeline/import_pgdata/flow.rs | 798 ++++++++++++++++++
 .../import_pgdata/importbucket_client.rs      | 315 +++++++
 .../import_pgdata/importbucket_format.rs      |  20 +
 .../import_pgdata/index_part_format.rs        |  68 ++
 .../timeline/import_pgdata/upcall_api.rs      | 119 +++
 pageserver/src/tenant/timeline/uninit.rs      |  36 +-
 test_runner/fixtures/common_types.py          |  23 +
 test_runner/fixtures/neon_fixtures.py         |  14 +
 test_runner/fixtures/pageserver/http.py       |  76 +-
 test_runner/fixtures/utils.py                 |   7 +
 test_runner/regress/test_import_pgdata.py     | 307 +++++++
 workspace_hack/Cargo.toml                     |   3 +-
 39 files changed, 3368 insertions(+), 182 deletions(-)
 create mode 100644 compute_tools/src/bin/fast_import.rs
 create mode 100644 compute_tools/src/bin/fast_import/child_stdio_to_log.rs
 create mode 100644 compute_tools/src/bin/fast_import/s3_uri.rs
 create mode 100644 compute_tools/src/bin/fast_import/s5cmd.rs
 create mode 100644 libs/postgres_initdb/Cargo.toml
 create mode 100644 libs/postgres_initdb/src/lib.rs
 create mode 100644 pageserver/src/tenant/timeline/import_pgdata.rs
 create mode 100644 pageserver/src/tenant/timeline/import_pgdata/flow.rs
 create mode 100644 pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
 create mode 100644 pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs
 create mode 100644 pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs
 create mode 100644 pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
 create mode 100644 test_runner/regress/test_import_pgdata.py

diff --git a/Cargo.lock b/Cargo.lock
index a25fa89c77..665aa4aecc 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -374,6 +374,28 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "aws-sdk-kms"
+version = "1.47.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "564a597a3c71a957d60a2e4c62c93d78ee5a0d636531e15b760acad983a5c18e"
+dependencies = [
+ "aws-credential-types",
+ "aws-runtime",
+ "aws-smithy-async",
+ "aws-smithy-http",
+ "aws-smithy-json",
+ "aws-smithy-runtime",
+ "aws-smithy-runtime-api",
+ "aws-smithy-types",
+ "aws-types",
+ "bytes",
+ "http 0.2.9",
+ "once_cell",
+ "regex-lite",
+ "tracing",
+]
+
 [[package]]
 name = "aws-sdk-s3"
 version = "1.52.0"
@@ -590,9 +612,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime"
-version = "1.7.1"
+version = "1.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d1ce695746394772e7000b39fe073095db6d45a862d0767dd5ad0ac0d7f8eb87"
+checksum = "a065c0fe6fdbdf9f11817eb68582b2ab4aff9e9c39e986ae48f7ec576c6322db"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-http",
@@ -1235,6 +1257,10 @@ name = "compute_tools"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "aws-config",
+ "aws-sdk-kms",
+ "aws-sdk-s3",
+ "base64 0.13.1",
  "bytes",
  "camino",
  "cfg-if",
@@ -1252,13 +1278,16 @@ dependencies = [
  "opentelemetry",
  "opentelemetry_sdk",
  "postgres",
+ "postgres_initdb",
  "prometheus",
  "regex",
  "remote_storage",
  "reqwest 0.12.4",
  "rlimit",
  "rust-ini",
+ "serde",
  "serde_json",
+ "serde_with",
  "signal-hook",
  "tar",
  "thiserror",
@@ -3712,6 +3741,7 @@ dependencies = [
  "num_cpus",
  "once_cell",
  "pageserver_api",
+ "pageserver_client",
  "pageserver_compaction",
  "pin-project-lite",
  "postgres",
@@ -3720,6 +3750,7 @@ dependencies = [
  "postgres_backend",
  "postgres_connection",
  "postgres_ffi",
+ "postgres_initdb",
  "pq_proto",
  "procfs",
  "rand 0.8.5",
@@ -4195,6 +4226,17 @@ dependencies = [
  "utils",
 ]
 
+[[package]]
+name = "postgres_initdb"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "camino",
+ "thiserror",
+ "tokio",
+ "workspace_hack",
+]
+
 [[package]]
 name = "powerfmt"
 version = "0.2.0"
@@ -7504,6 +7546,7 @@ dependencies = [
  "anyhow",
  "axum",
  "axum-core",
+ "base64 0.13.1",
  "base64 0.21.1",
  "base64ct",
  "bytes",
diff --git a/Cargo.toml b/Cargo.toml
index aac19a4122..e3dc5b97f8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,6 +34,7 @@ members = [
     "libs/vm_monitor",
     "libs/walproposer",
     "libs/wal_decoder",
+    "libs/postgres_initdb",
 ]
 
 [workspace.package]
@@ -57,6 +58,7 @@ async-trait = "0.1"
 aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] }
 aws-sdk-s3 = "1.52"
 aws-sdk-iam = "1.46.0"
+aws-sdk-kms = "1.47.0"
 aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
 aws-smithy-types = "1.2"
 aws-credential-types = "1.2.0"
@@ -73,7 +75,7 @@ bytes = "1.0"
 camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
-clap = { version = "4.0", features = ["derive"] }
+clap = { version = "4.0", features = ["derive", "env"] }
 comfy-table = "7.1"
 const_format = "0.2"
 crc32c = "0.6"
@@ -154,7 +156,7 @@ sentry = { version = "0.32", default-features = false, features = ["backtrace",
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_path_to_error = "0.1"
-serde_with = "2.0"
+serde_with = { version = "2.0", features = [ "base64" ] }
 serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
@@ -213,12 +215,14 @@ tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", br
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
+pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
+postgres_initdb = { path = "./libs/postgres_initdb" }
 pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
 remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
 safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 32405ece86..7c21c67a0a 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1243,7 +1243,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
 
 #########################################################################################
 #
-# Compile and run the Neon-specific `compute_ctl` binary
+# Compile and run the Neon-specific `compute_ctl` and `fast_import` binaries
 #
 #########################################################################################
 FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
@@ -1264,6 +1264,7 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de
 FROM debian:$DEBIAN_FLAVOR AS compute-tools-image
 
 COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
+COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import
 
 #########################################################################################
 #
@@ -1458,6 +1459,7 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
 
 COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
+COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import
 
 # pgbouncer and its config
 COPY --from=pgbouncer         /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
@@ -1533,6 +1535,25 @@ RUN apt update && \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
     localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
 
+# s5cmd 2.2.2 from https://github.com/peak/s5cmd/releases/tag/v2.2.2
+# used by fast_import
+ARG TARGETARCH
+ADD https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_linux_$TARGETARCH.deb /tmp/s5cmd.deb
+RUN set -ex; \
+    \
+    # Determine the expected checksum based on TARGETARCH
+    if [ "${TARGETARCH}" = "amd64" ]; then \
+        CHECKSUM="392c385320cd5ffa435759a95af77c215553d967e4b1c0fffe52e4f14c29cf85"; \
+    elif [ "${TARGETARCH}" = "arm64" ]; then \
+        CHECKSUM="939bee3cf4b5604ddb00e67f8c157b91d7c7a5b553d1fbb6890fad32894b7b46"; \
+    else \
+        echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \
+    fi; \
+    \
+    # Compute and validate the checksum
+    echo "${CHECKSUM}  /tmp/s5cmd.deb" | sha256sum -c -
+RUN dpkg -i /tmp/s5cmd.deb && rm /tmp/s5cmd.deb
+
 ENV LANG=en_US.utf8
 USER postgres
 ENTRYPOINT ["/usr/local/bin/compute_ctl"]
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 0bf4ed53d6..c0c390caef 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -10,6 +10,10 @@ default = []
 testing = []
 
 [dependencies]
+base64.workspace = true
+aws-config.workspace = true
+aws-sdk-s3.workspace = true
+aws-sdk-kms.workspace = true
 anyhow.workspace = true
 camino.workspace = true
 chrono.workspace = true
@@ -27,6 +31,8 @@ opentelemetry.workspace = true
 opentelemetry_sdk.workspace = true
 postgres.workspace = true
 regex.workspace = true
+serde.workspace = true
+serde_with.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
 tar.workspace = true
@@ -43,6 +49,7 @@ thiserror.workspace = true
 url.workspace = true
 prometheus.workspace = true
 
+postgres_initdb.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs
new file mode 100644
index 0000000000..3b0b990df2
--- /dev/null
+++ b/compute_tools/src/bin/fast_import.rs
@@ -0,0 +1,338 @@
+//! This program dumps a remote Postgres database into a local Postgres database
+//! and uploads the resulting PGDATA into object storage for import into a Timeline.
+//!
+//! # Context, Architecture, Design
+//!
+//! See cloud.git Fast Imports RFC (<https://github.com/neondatabase/cloud/pull/19799>)
+//! for the full picture.
+//! The RFC describing the storage pieces of importing the PGDATA dump into a Timeline
+//! is publicly accessible at <https://github.com/neondatabase/neon/pull/9538>.
+//!
+//! # This is a Prototype!
+//!
+//! This program is part of a prototype feature and not yet used in production.
+//!
+//! The cloud.git RFC contains lots of suggestions for improving e2e throughput
+//! of this step of the timeline import process.
+//!
+//! # Local Testing
+//!
+//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build.
+//! - Build the image with the following command:
+//!
+//! ```bash
+//! docker buildx build --build-arg DEBIAN_FLAVOR=bullseye-slim --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)"  -t localhost:3030/localregistry/compute-node-v14:latest -f compute/Dockerfile.com
+//! docker push localhost:3030/localregistry/compute-node-v14:latest
+//! ```
+
+use anyhow::Context;
+use aws_config::BehaviorVersion;
+use camino::{Utf8Path, Utf8PathBuf};
+use clap::Parser;
+use nix::unistd::Pid;
+use tracing::{info, info_span, warn, Instrument};
+use utils::fs_ext::is_directory_empty;
+
+#[path = "fast_import/child_stdio_to_log.rs"]
+mod child_stdio_to_log;
+#[path = "fast_import/s3_uri.rs"]
+mod s3_uri;
+#[path = "fast_import/s5cmd.rs"]
+mod s5cmd;
+
+#[derive(clap::Parser)]
+struct Args {
+    #[clap(long)]
+    working_directory: Utf8PathBuf,
+    #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")]
+    s3_prefix: s3_uri::S3Uri,
+    #[clap(long)]
+    pg_bin_dir: Utf8PathBuf,
+    #[clap(long)]
+    pg_lib_dir: Utf8PathBuf,
+}
+
+#[serde_with::serde_as]
+#[derive(serde::Deserialize)]
+struct Spec {
+    encryption_secret: EncryptionSecret,
+    #[serde_as(as = "serde_with::base64::Base64")]
+    source_connstring_ciphertext_base64: Vec<u8>,
+}
+
+#[derive(serde::Deserialize)]
+enum EncryptionSecret {
+    #[allow(clippy::upper_case_acronyms)]
+    KMS { key_id: String },
+}
+
+#[tokio::main]
+pub(crate) async fn main() -> anyhow::Result<()> {
+    utils::logging::init(
+        utils::logging::LogFormat::Plain,
+        utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+        utils::logging::Output::Stdout,
+    )?;
+
+    info!("starting");
+
+    let Args {
+        working_directory,
+        s3_prefix,
+        pg_bin_dir,
+        pg_lib_dir,
+    } = Args::parse();
+
+    let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
+
+    let spec: Spec = {
+        let spec_key = s3_prefix.append("/spec.json");
+        let s3_client = aws_sdk_s3::Client::new(&aws_config);
+        let object = s3_client
+            .get_object()
+            .bucket(&spec_key.bucket)
+            .key(spec_key.key)
+            .send()
+            .await
+            .context("get spec from s3")?
+            .body
+            .collect()
+            .await
+            .context("download spec body")?;
+        serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
+    };
+
+    match tokio::fs::create_dir(&working_directory).await {
+        Ok(()) => {}
+        Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
+            if !is_directory_empty(&working_directory)
+                .await
+                .context("check if working directory is empty")?
+            {
+                anyhow::bail!("working directory is not empty");
+            } else {
+                // ok
+            }
+        }
+        Err(e) => return Err(anyhow::Error::new(e).context("create working directory")),
+    }
+
+    let pgdata_dir = working_directory.join("pgdata");
+    tokio::fs::create_dir(&pgdata_dir)
+        .await
+        .context("create pgdata directory")?;
+
+    //
+    // Setup clients
+    //
+    let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
+    let kms_client = aws_sdk_kms::Client::new(&aws_config);
+
+    //
+    //  Initialize pgdata
+    //
+    let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded
+    postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
+        superuser,
+        locale: "en_US.UTF-8", // XXX: this shouldn't be hard-coded,
+        pg_version: 140000, // XXX: this shouldn't be hard-coded but derived from which compute image we're running in
+        initdb_bin: pg_bin_dir.join("initdb").as_ref(),
+        library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
+        pgdata: &pgdata_dir,
+    })
+    .await
+    .context("initdb")?;
+
+    let nproc = num_cpus::get();
+
+    //
+    // Launch postgres process
+    //
+    let mut postgres_proc = tokio::process::Command::new(pg_bin_dir.join("postgres"))
+        .arg("-D")
+        .arg(&pgdata_dir)
+        .args(["-c", "wal_level=minimal"])
+        .args(["-c", "shared_buffers=10GB"])
+        .args(["-c", "max_wal_senders=0"])
+        .args(["-c", "fsync=off"])
+        .args(["-c", "full_page_writes=off"])
+        .args(["-c", "synchronous_commit=off"])
+        .args(["-c", "maintenance_work_mem=8388608"])
+        .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")])
+        .args(["-c", &format!("max_parallel_workers={nproc}")])
+        .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
+        .args(["-c", &format!("max_worker_processes={nproc}")])
+        .args(["-c", "effective_io_concurrency=100"])
+        .env_clear()
+        .stdout(std::process::Stdio::piped())
+        .stderr(std::process::Stdio::piped())
+        .spawn()
+        .context("spawn postgres")?;
+
+    info!("spawned postgres, waiting for it to become ready");
+    tokio::spawn(
+        child_stdio_to_log::relay_process_output(
+            postgres_proc.stdout.take(),
+            postgres_proc.stderr.take(),
+        )
+        .instrument(info_span!("postgres")),
+    );
+    let restore_pg_connstring =
+        format!("host=localhost port=5432 user={superuser} dbname=postgres");
+    loop {
+        let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await;
+        if res.is_ok() {
+            info!("postgres is ready, could connect to it");
+            break;
+        }
+    }
+
+    //
+    // Decrypt connection string
+    //
+    let source_connection_string = {
+        match spec.encryption_secret {
+            EncryptionSecret::KMS { key_id } => {
+                let mut output = kms_client
+                    .decrypt()
+                    .key_id(key_id)
+                    .ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
+                        spec.source_connstring_ciphertext_base64,
+                    ))
+                    .send()
+                    .await
+                    .context("decrypt source connection string")?;
+                let plaintext = output
+                    .plaintext
+                    .take()
+                    .context("get plaintext source connection string")?;
+                String::from_utf8(plaintext.into_inner())
+                    .context("parse source connection string as utf8")?
+            }
+        }
+    };
+
+    //
+    // Start the work
+    //
+
+    let dumpdir = working_directory.join("dumpdir");
+
+    let common_args = [
+        // schema mapping (prob suffices to specify them on one side)
+        "--no-owner".to_string(),
+        "--no-privileges".to_string(),
+        "--no-publications".to_string(),
+        "--no-security-labels".to_string(),
+        "--no-subscriptions".to_string(),
+        "--no-tablespaces".to_string(),
+        // format
+        "--format".to_string(),
+        "directory".to_string(),
+        // concurrency
+        "--jobs".to_string(),
+        num_cpus::get().to_string(),
+        // progress updates
+        "--verbose".to_string(),
+    ];
+
+    info!("dump into the working directory");
+    {
+        let mut pg_dump = tokio::process::Command::new(pg_bin_dir.join("pg_dump"))
+            .args(&common_args)
+            .arg("-f")
+            .arg(&dumpdir)
+            .arg("--no-sync")
+            // POSITIONAL args
+            // source db (db name included in connection string)
+            .arg(&source_connection_string)
+            // how we run it
+            .env_clear()
+            .kill_on_drop(true)
+            .stdout(std::process::Stdio::piped())
+            .stderr(std::process::Stdio::piped())
+            .spawn()
+            .context("spawn pg_dump")?;
+
+        info!(pid=%pg_dump.id().unwrap(), "spawned pg_dump");
+
+        tokio::spawn(
+            child_stdio_to_log::relay_process_output(pg_dump.stdout.take(), pg_dump.stderr.take())
+                .instrument(info_span!("pg_dump")),
+        );
+
+        let st = pg_dump.wait().await.context("wait for pg_dump")?;
+        info!(status=?st, "pg_dump exited");
+        if !st.success() {
+            warn!(status=%st, "pg_dump failed, restore will likely fail as well");
+        }
+    }
+
+    // TODO: do it in a streaming way, plenty of internal research done on this already
+    // TODO: do the unlogged table trick
+
+    info!("restore from working directory into vanilla postgres");
+    {
+        let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore"))
+            .args(&common_args)
+            .arg("-d")
+            .arg(&restore_pg_connstring)
+            // POSITIONAL args
+            .arg(&dumpdir)
+            // how we run it
+            .env_clear()
+            .kill_on_drop(true)
+            .stdout(std::process::Stdio::piped())
+            .stderr(std::process::Stdio::piped())
+            .spawn()
+            .context("spawn pg_restore")?;
+
+        info!(pid=%pg_restore.id().unwrap(), "spawned pg_restore");
+        tokio::spawn(
+            child_stdio_to_log::relay_process_output(
+                pg_restore.stdout.take(),
+                pg_restore.stderr.take(),
+            )
+            .instrument(info_span!("pg_restore")),
+        );
+        let st = pg_restore.wait().await.context("wait for pg_restore")?;
+        info!(status=?st, "pg_restore exited");
+        if !st.success() {
+            warn!(status=%st, "pg_restore failed, restore will likely fail as well");
+        }
+    }
+
+    info!("shutdown postgres");
+    {
+        nix::sys::signal::kill(
+            Pid::from_raw(
+                i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"),
+            ),
+            nix::sys::signal::SIGTERM,
+        )
+        .context("signal postgres to shut down")?;
+        postgres_proc
+            .wait()
+            .await
+            .context("wait for postgres to shut down")?;
+    }
+
+    info!("upload pgdata");
+    s5cmd::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/"))
+        .await
+        .context("sync dump directory to destination")?;
+
+    info!("write status");
+    {
+        let status_dir = working_directory.join("status");
+        std::fs::create_dir(&status_dir).context("create status directory")?;
+        let status_file = status_dir.join("status");
+        std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
+            .context("write status file")?;
+        s5cmd::sync(&status_file, &s3_prefix.append("/status/pgdata"))
+            .await
+            .context("sync status directory to destination")?;
+    }
+
+    Ok(())
+}
diff --git a/compute_tools/src/bin/fast_import/child_stdio_to_log.rs b/compute_tools/src/bin/fast_import/child_stdio_to_log.rs
new file mode 100644
index 0000000000..6724ef9bed
--- /dev/null
+++ b/compute_tools/src/bin/fast_import/child_stdio_to_log.rs
@@ -0,0 +1,35 @@
+use tokio::io::{AsyncBufReadExt, BufReader};
+use tokio::process::{ChildStderr, ChildStdout};
+use tracing::info;
+
+/// Asynchronously relays the output from a child process's `stdout` and `stderr` to the tracing log.
+/// Each line is read and logged individually, with lossy UTF-8 conversion.
+///
+/// # Arguments
+///
+/// * `stdout`: An `Option<ChildStdout>` from the child process.
+/// * `stderr`: An `Option<ChildStderr>` from the child process.
+///
+pub(crate) async fn relay_process_output(stdout: Option<ChildStdout>, stderr: Option<ChildStderr>) {
+    let stdout_fut = async {
+        if let Some(stdout) = stdout {
+            let reader = BufReader::new(stdout);
+            let mut lines = reader.lines();
+            while let Ok(Some(line)) = lines.next_line().await {
+                info!(fd = "stdout", "{}", line);
+            }
+        }
+    };
+
+    let stderr_fut = async {
+        if let Some(stderr) = stderr {
+            let reader = BufReader::new(stderr);
+            let mut lines = reader.lines();
+            while let Ok(Some(line)) = lines.next_line().await {
+                info!(fd = "stderr", "{}", line);
+            }
+        }
+    };
+
+    tokio::join!(stdout_fut, stderr_fut);
+}
diff --git a/compute_tools/src/bin/fast_import/s3_uri.rs b/compute_tools/src/bin/fast_import/s3_uri.rs
new file mode 100644
index 0000000000..52bbef420f
--- /dev/null
+++ b/compute_tools/src/bin/fast_import/s3_uri.rs
@@ -0,0 +1,75 @@
+use anyhow::Result;
+use std::str::FromStr;
+
+/// Struct to hold parsed S3 components
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct S3Uri {
+    pub bucket: String,
+    pub key: String,
+}
+
+impl FromStr for S3Uri {
+    type Err = anyhow::Error;
+
+    /// Parse an S3 URI into a bucket and key
+    fn from_str(uri: &str) -> Result<Self> {
+        // Ensure the URI starts with "s3://"
+        if !uri.starts_with("s3://") {
+            return Err(anyhow::anyhow!("Invalid S3 URI scheme"));
+        }
+
+        // Remove the "s3://" prefix
+        let stripped_uri = &uri[5..];
+
+        // Split the remaining string into bucket and key parts
+        if let Some((bucket, key)) = stripped_uri.split_once('/') {
+            Ok(S3Uri {
+                bucket: bucket.to_string(),
+                key: key.to_string(),
+            })
+        } else {
+            Err(anyhow::anyhow!(
+                "Invalid S3 URI format, missing bucket or key"
+            ))
+        }
+    }
+}
+
+impl S3Uri {
+    pub fn append(&self, suffix: &str) -> Self {
+        Self {
+            bucket: self.bucket.clone(),
+            key: format!("{}{}", self.key, suffix),
+        }
+    }
+}
+
+impl std::fmt::Display for S3Uri {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "s3://{}/{}", self.bucket, self.key)
+    }
+}
+
+impl clap::builder::TypedValueParser for S3Uri {
+    type Value = Self;
+
+    fn parse_ref(
+        &self,
+        _cmd: &clap::Command,
+        _arg: Option<&clap::Arg>,
+        value: &std::ffi::OsStr,
+    ) -> Result<Self::Value, clap::Error> {
+        let value_str = value.to_str().ok_or_else(|| {
+            clap::Error::raw(
+                clap::error::ErrorKind::InvalidUtf8,
+                "Invalid UTF-8 sequence",
+            )
+        })?;
+        S3Uri::from_str(value_str).map_err(|e| {
+            clap::Error::raw(
+                clap::error::ErrorKind::InvalidValue,
+                format!("Failed to parse S3 URI: {}", e),
+            )
+        })
+    }
+}
diff --git a/compute_tools/src/bin/fast_import/s5cmd.rs b/compute_tools/src/bin/fast_import/s5cmd.rs
new file mode 100644
index 0000000000..d2d9a79736
--- /dev/null
+++ b/compute_tools/src/bin/fast_import/s5cmd.rs
@@ -0,0 +1,27 @@
+use anyhow::Context;
+use camino::Utf8Path;
+
+use super::s3_uri::S3Uri;
+
+pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> {
+    let mut builder = tokio::process::Command::new("s5cmd");
+    // s5cmd uses aws-sdk-go v1, hence doesn't support AWS_ENDPOINT_URL
+    if let Some(val) = std::env::var_os("AWS_ENDPOINT_URL") {
+        builder.arg("--endpoint-url").arg(val);
+    }
+    builder
+        .arg("sync")
+        .arg(local.as_str())
+        .arg(remote.to_string());
+    let st = builder
+        .spawn()
+        .context("spawn s5cmd")?
+        .wait()
+        .await
+        .context("wait for s5cmd")?;
+    if st.success() {
+        Ok(())
+    } else {
+        Err(anyhow::anyhow!("s5cmd failed"))
+    }
+}
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index c4063bbd1a..1ea443b026 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1153,6 +1153,7 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
                 timeline_info.timeline_id
             );
         }
+        // TODO: rename to import-basebackup-plus-wal
         TimelineCmd::Import(args) => {
             let tenant_id = get_tenant_id(args.tenant_id, env)?;
             let timeline_id = args.timeline_id;
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index 8710904cec..79da05da6c 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -33,6 +33,7 @@ remote_storage.workspace = true
 postgres_backend.workspace = true
 nix = {workspace = true, optional = true}
 reqwest.workspace = true
+rand.workspace = true
 
 [dev-dependencies]
 bincode.workspace = true
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index ee20613d6d..7666728427 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -97,6 +97,15 @@ pub struct ConfigToml {
     pub control_plane_api: Option<reqwest::Url>,
     pub control_plane_api_token: Option<String>,
     pub control_plane_emergency_mode: bool,
+    /// Unstable feature: subject to change or removal without notice.
+    /// See <https://github.com/neondatabase/neon/pull/9218>.
+    pub import_pgdata_upcall_api: Option<reqwest::Url>,
+    /// Unstable feature: subject to change or removal without notice.
+    /// See <https://github.com/neondatabase/neon/pull/9218>.
+    pub import_pgdata_upcall_api_token: Option<String>,
+    /// Unstable feature: subject to change or removal without notice.
+    /// See <https://github.com/neondatabase/neon/pull/9218>.
+    pub import_pgdata_aws_endpoint_url: Option<reqwest::Url>,
     pub heatmap_upload_concurrency: usize,
     pub secondary_download_concurrency: usize,
     pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
@@ -386,6 +395,10 @@ impl Default for ConfigToml {
             control_plane_api_token: (None),
             control_plane_emergency_mode: (false),
 
+            import_pgdata_upcall_api: (None),
+            import_pgdata_upcall_api_token: (None),
+            import_pgdata_aws_endpoint_url: (None),
+
             heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
             secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
 
diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
index 401887d362..c55b9e9484 100644
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -48,7 +48,7 @@ pub struct ShardedRange<'a> {
 
 // Calculate the size of a range within the blocks of the same relation, or spanning only the
 // top page in the previous relation's space.
-fn contiguous_range_len(range: &Range<Key>) -> u32 {
+pub fn contiguous_range_len(range: &Range<Key>) -> u32 {
     debug_assert!(is_contiguous_range(range));
     if range.start.field6 == 0xffffffff {
         range.end.field6 + 1
@@ -67,7 +67,7 @@ fn contiguous_range_len(range: &Range<Key>) -> u32 {
 /// This matters, because:
 /// - Within such ranges, keys are used contiguously.  Outside such ranges it is sparse.
 /// - Within such ranges, we may calculate distances using simple subtraction of field6.
-fn is_contiguous_range(range: &Range<Key>) -> bool {
+pub fn is_contiguous_range(range: &Range<Key>) -> bool {
     range.start.field1 == range.end.field1
         && range.start.field2 == range.end.field2
         && range.start.field3 == range.end.field3
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 0dfa1ba817..1b86bfd91a 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -2,6 +2,8 @@ pub mod detach_ancestor;
 pub mod partitioning;
 pub mod utilization;
 
+#[cfg(feature = "testing")]
+use camino::Utf8PathBuf;
 pub use utilization::PageserverUtilization;
 
 use std::{
@@ -227,6 +229,9 @@ pub enum TimelineCreateRequestMode {
         // we continue to accept it by having it here.
         pg_version: Option<u32>,
     },
+    ImportPgdata {
+        import_pgdata: TimelineCreateRequestModeImportPgdata,
+    },
     // NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap.
     // (serde picks the first matching enum variant, in declaration order).
     Bootstrap {
@@ -236,6 +241,42 @@ pub enum TimelineCreateRequestMode {
     },
 }
 
+#[derive(Serialize, Deserialize, Clone)]
+pub struct TimelineCreateRequestModeImportPgdata {
+    pub location: ImportPgdataLocation,
+    pub idempotency_key: ImportPgdataIdempotencyKey,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub enum ImportPgdataLocation {
+    #[cfg(feature = "testing")]
+    LocalFs { path: Utf8PathBuf },
+    AwsS3 {
+        region: String,
+        bucket: String,
+        /// A better name for this would be `prefix`; changing requires coordination with cplane.
+        /// See <https://github.com/neondatabase/cloud/issues/20646>.
+        key: String,
+    },
+}
+
+#[derive(Serialize, Deserialize, Clone)]
+#[serde(transparent)]
+pub struct ImportPgdataIdempotencyKey(pub String);
+
+impl ImportPgdataIdempotencyKey {
+    pub fn random() -> Self {
+        use rand::{distributions::Alphanumeric, Rng};
+        Self(
+            rand::thread_rng()
+                .sample_iter(&Alphanumeric)
+                .take(20)
+                .map(char::from)
+                .collect(),
+        )
+    }
+}
+
 #[derive(Serialize, Deserialize, Clone)]
 pub struct LsnLeaseRequest {
     pub lsn: Lsn,
diff --git a/libs/postgres_initdb/Cargo.toml b/libs/postgres_initdb/Cargo.toml
new file mode 100644
index 0000000000..1605279bce
--- /dev/null
+++ b/libs/postgres_initdb/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "postgres_initdb"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+tokio.workspace = true
+camino.workspace = true
+thiserror.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/libs/postgres_initdb/src/lib.rs b/libs/postgres_initdb/src/lib.rs
new file mode 100644
index 0000000000..2f072354fb
--- /dev/null
+++ b/libs/postgres_initdb/src/lib.rs
@@ -0,0 +1,103 @@
+//! The canonical way we run `initdb` in Neon.
+//!
+//! initdb has implicit defaults that are dependent on the environment, e.g., locales & collations.
+//!
+//! This module's job is to eliminate the environment-dependence as much as possible.
+
+use std::fmt;
+
+use camino::Utf8Path;
+
+pub struct RunInitdbArgs<'a> {
+    pub superuser: &'a str,
+    pub locale: &'a str,
+    pub initdb_bin: &'a Utf8Path,
+    pub pg_version: u32,
+    pub library_search_path: &'a Utf8Path,
+    pub pgdata: &'a Utf8Path,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+    Spawn(std::io::Error),
+    Failed {
+        status: std::process::ExitStatus,
+        stderr: Vec<u8>,
+    },
+    WaitOutput(std::io::Error),
+    Other(anyhow::Error),
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Error::Spawn(e) => write!(f, "Error spawning command: {:?}", e),
+            Error::Failed { status, stderr } => write!(
+                f,
+                "Command failed with status {:?}: {}",
+                status,
+                String::from_utf8_lossy(stderr)
+            ),
+            Error::WaitOutput(e) => write!(f, "Error waiting for command output: {:?}", e),
+            Error::Other(e) => write!(f, "Error: {:?}", e),
+        }
+    }
+}
+
+pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> {
+    let RunInitdbArgs {
+        superuser,
+        locale,
+        initdb_bin: initdb_bin_path,
+        pg_version,
+        library_search_path,
+        pgdata,
+    } = args;
+    let mut initdb_command = tokio::process::Command::new(initdb_bin_path);
+    initdb_command
+        .args(["--pgdata", pgdata.as_ref()])
+        .args(["--username", superuser])
+        .args(["--encoding", "utf8"])
+        .args(["--locale", locale])
+        .arg("--no-instructions")
+        .arg("--no-sync")
+        .env_clear()
+        .env("LD_LIBRARY_PATH", library_search_path)
+        .env("DYLD_LIBRARY_PATH", library_search_path)
+        .stdin(std::process::Stdio::null())
+        // stdout invocation produces the same output every time, we don't need it
+        .stdout(std::process::Stdio::null())
+        // we would be interested in the stderr output, if there was any
+        .stderr(std::process::Stdio::piped());
+
+    // Before version 14, only the libc provide was available.
+    if pg_version > 14 {
+        // Version 17 brought with it a builtin locale provider which only provides
+        // C and C.UTF-8. While being safer for collation purposes since it is
+        // guaranteed to be consistent throughout a major release, it is also more
+        // performant.
+        let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
+
+        initdb_command.args(["--locale-provider", locale_provider]);
+    }
+
+    let initdb_proc = initdb_command.spawn().map_err(Error::Spawn)?;
+
+    // Ideally we'd select here with the cancellation token, but the problem is that
+    // we can't safely terminate initdb: it launches processes of its own, and killing
+    // initdb doesn't kill them. After we return from this function, we want the target
+    // directory to be able to be cleaned up.
+    // See https://github.com/neondatabase/neon/issues/6385
+    let initdb_output = initdb_proc
+        .wait_with_output()
+        .await
+        .map_err(Error::WaitOutput)?;
+    if !initdb_output.status.success() {
+        return Err(Error::Failed {
+            status: initdb_output.status,
+            stderr: initdb_output.stderr,
+        });
+    }
+
+    Ok(())
+}
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 143d8236df..140b287ccc 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -43,6 +43,7 @@ postgres.workspace = true
 postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
+postgres_initdb.workspace = true
 rand.workspace = true
 range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
@@ -68,6 +69,7 @@ url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
 pageserver_api.workspace = true
+pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
 pageserver_compaction.workspace = true
 postgres_connection.workspace = true
 postgres_ffi.workspace = true
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index f7be6ecaab..59ea6fb941 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -144,6 +144,10 @@ pub struct PageServerConf {
     /// JWT token for use with the control plane API.
     pub control_plane_api_token: Option<SecretString>,
 
+    pub import_pgdata_upcall_api: Option<Url>,
+    pub import_pgdata_upcall_api_token: Option<SecretString>,
+    pub import_pgdata_aws_endpoint_url: Option<Url>,
+
     /// If true, pageserver will make best-effort to operate without a control plane: only
     /// for use in major incidents.
     pub control_plane_emergency_mode: bool,
@@ -328,6 +332,9 @@ impl PageServerConf {
             control_plane_api,
             control_plane_api_token,
             control_plane_emergency_mode,
+            import_pgdata_upcall_api,
+            import_pgdata_upcall_api_token,
+            import_pgdata_aws_endpoint_url,
             heatmap_upload_concurrency,
             secondary_download_concurrency,
             ingest_batch_size,
@@ -383,6 +390,9 @@ impl PageServerConf {
             timeline_offloading,
             ephemeral_bytes_per_memory_kb,
             server_side_batch_timeout,
+            import_pgdata_upcall_api,
+            import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from),
+            import_pgdata_aws_endpoint_url,
 
             // ------------------------------------------------------------
             // fields that require additional validation or custom handling
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 2bc7f5ad39..7fb9247feb 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -623,6 +623,8 @@ paths:
                 existing_initdb_timeline_id:
                   type: string
                   format: hex
+                import_pgdata:
+                  $ref: "#/components/schemas/TimelineCreateRequestImportPgdata"
       responses:
         "201":
           description: Timeline was created, or already existed with matching parameters
@@ -979,6 +981,34 @@ components:
           $ref: "#/components/schemas/TenantConfig"
         effective_config:
           $ref: "#/components/schemas/TenantConfig"
+    TimelineCreateRequestImportPgdata:
+      type: object
+      required:
+        - location
+        - idempotency_key
+      properties:
+        idempotency_key:
+          type: string
+        location:
+          $ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocation"
+    TimelineCreateRequestImportPgdataLocation:
+      type: object
+      properties:
+        AwsS3:
+          $ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocationAwsS3"
+    TimelineCreateRequestImportPgdataLocationAwsS3:
+      type: object
+      properties:
+        region:
+          type: string
+        bucket:
+          type: string
+        key:
+          type: string
+      required:
+        - region
+        - bucket
+        - key
     TimelineInfo:
       type: object
       required:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 7168850ed6..ceb1c3b012 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -40,6 +40,7 @@ use pageserver_api::models::TenantSorting;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::TimelineArchivalConfigRequest;
 use pageserver_api::models::TimelineCreateRequestMode;
+use pageserver_api::models::TimelineCreateRequestModeImportPgdata;
 use pageserver_api::models::TimelinesInfoAndOffloaded;
 use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::TopTenantShardsRequest;
@@ -81,6 +82,7 @@ use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::storage_layer::LayerName;
+use crate::tenant::timeline::import_pgdata;
 use crate::tenant::timeline::offload::offload_timeline;
 use crate::tenant::timeline::offload::OffloadError;
 use crate::tenant::timeline::CompactFlags;
@@ -580,6 +582,35 @@ async fn timeline_create_handler(
             ancestor_timeline_id,
             ancestor_start_lsn,
         }),
+        TimelineCreateRequestMode::ImportPgdata {
+            import_pgdata:
+                TimelineCreateRequestModeImportPgdata {
+                    location,
+                    idempotency_key,
+                },
+        } => tenant::CreateTimelineParams::ImportPgdata(tenant::CreateTimelineParamsImportPgdata {
+            idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new(
+                idempotency_key.0,
+            ),
+            new_timeline_id,
+            location: {
+                use import_pgdata::index_part_format::Location;
+                use pageserver_api::models::ImportPgdataLocation;
+                match location {
+                    #[cfg(feature = "testing")]
+                    ImportPgdataLocation::LocalFs { path } => Location::LocalFs { path },
+                    ImportPgdataLocation::AwsS3 {
+                        region,
+                        bucket,
+                        key,
+                    } => Location::AwsS3 {
+                        region,
+                        bucket,
+                        key,
+                    },
+                }
+            },
+        }),
     };
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 5995d1cc57..f4f184be5a 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -2276,9 +2276,9 @@ impl<'a> Version<'a> {
 //--- Metadata structs stored in key-value pairs in the repository.
 
 #[derive(Debug, Serialize, Deserialize)]
-struct DbDirectory {
+pub(crate) struct DbDirectory {
     // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist)
-    dbdirs: HashMap<(Oid, Oid), bool>,
+    pub(crate) dbdirs: HashMap<(Oid, Oid), bool>,
 }
 
 // The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of
@@ -2287,8 +2287,8 @@ struct DbDirectory {
 // "pg_twophsae/0000000A000002E4".
 
 #[derive(Debug, Serialize, Deserialize)]
-struct TwoPhaseDirectory {
-    xids: HashSet<TransactionId>,
+pub(crate) struct TwoPhaseDirectory {
+    pub(crate) xids: HashSet<TransactionId>,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -2297,12 +2297,12 @@ struct TwoPhaseDirectoryV17 {
 }
 
 #[derive(Debug, Serialize, Deserialize, Default)]
-struct RelDirectory {
+pub(crate) struct RelDirectory {
     // Set of relations that exist. (relfilenode, forknum)
     //
     // TODO: Store it as a btree or radix tree or something else that spans multiple
     // key-value pairs, if you have a lot of relations
-    rels: HashSet<(Oid, u8)>,
+    pub(crate) rels: HashSet<(Oid, u8)>,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -2311,9 +2311,9 @@ struct RelSizeEntry {
 }
 
 #[derive(Debug, Serialize, Deserialize, Default)]
-struct SlruSegmentDirectory {
+pub(crate) struct SlruSegmentDirectory {
     // Set of SLRU segments that exist.
-    segments: HashSet<u32>,
+    pub(crate) segments: HashSet<u32>,
 }
 
 #[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 6a4e90dd55..622738022a 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -381,6 +381,8 @@ pub enum TaskKind {
     UnitTest,
 
     DetachAncestor,
+
+    ImportPgdata,
 }
 
 #[derive(Default)]
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 2e5f69e3c9..0214ee68fa 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -43,7 +43,9 @@ use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
+use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
+use timeline::ShutdownMode;
 use tokio::io::BufReader;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
@@ -373,7 +375,6 @@ pub struct Tenant {
 
     l0_flush_global_state: L0FlushGlobalState,
 }
-
 impl std::fmt::Debug for Tenant {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{} ({})", self.tenant_shard_id, self.current_state())
@@ -860,6 +861,7 @@ impl Debug for SetStoppingError {
 pub(crate) enum CreateTimelineParams {
     Bootstrap(CreateTimelineParamsBootstrap),
     Branch(CreateTimelineParamsBranch),
+    ImportPgdata(CreateTimelineParamsImportPgdata),
 }
 
 #[derive(Debug)]
@@ -877,7 +879,14 @@ pub(crate) struct CreateTimelineParamsBranch {
     pub(crate) ancestor_start_lsn: Option<Lsn>,
 }
 
-/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in  [`Tenant::start_creating_timeline`].
+#[derive(Debug)]
+pub(crate) struct CreateTimelineParamsImportPgdata {
+    pub(crate) new_timeline_id: TimelineId,
+    pub(crate) location: import_pgdata::index_part_format::Location,
+    pub(crate) idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
+}
+
+/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in  [`Tenant::start_creating_timeline`] in  [`Tenant::start_creating_timeline`].
 ///
 /// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`].
 ///
@@ -907,19 +916,50 @@ pub(crate) enum CreateTimelineIdempotency {
         ancestor_timeline_id: TimelineId,
         ancestor_start_lsn: Lsn,
     },
+    ImportPgdata(CreatingTimelineIdempotencyImportPgdata),
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub(crate) struct CreatingTimelineIdempotencyImportPgdata {
+    idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
 }
 
 /// What is returned by [`Tenant::start_creating_timeline`].
 #[must_use]
-enum StartCreatingTimelineResult<'t> {
-    CreateGuard(TimelineCreateGuard<'t>),
+enum StartCreatingTimelineResult {
+    CreateGuard(TimelineCreateGuard),
     Idempotent(Arc<Timeline>),
 }
 
+enum TimelineInitAndSyncResult {
+    ReadyToActivate(Arc<Timeline>),
+    NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata),
+}
+
+impl TimelineInitAndSyncResult {
+    fn ready_to_activate(self) -> Option<Arc<Timeline>> {
+        match self {
+            Self::ReadyToActivate(timeline) => Some(timeline),
+            _ => None,
+        }
+    }
+}
+
+#[must_use]
+struct TimelineInitAndSyncNeedsSpawnImportPgdata {
+    timeline: Arc<Timeline>,
+    import_pgdata: import_pgdata::index_part_format::Root,
+    guard: TimelineCreateGuard,
+}
+
 /// What is returned by [`Tenant::create_timeline`].
 enum CreateTimelineResult {
     Created(Arc<Timeline>),
     Idempotent(Arc<Timeline>),
+    /// IMPORTANT: This [`Arc<Timeline>`] object is not in [`Tenant::timelines`] when
+    /// we return this result, nor will this concrete object ever be added there.
+    /// Cf method comment on [`Tenant::create_timeline_import_pgdata`].
+    ImportSpawned(Arc<Timeline>),
 }
 
 impl CreateTimelineResult {
@@ -927,18 +967,19 @@ impl CreateTimelineResult {
         match self {
             Self::Created(_) => "Created",
             Self::Idempotent(_) => "Idempotent",
+            Self::ImportSpawned(_) => "ImportSpawned",
         }
     }
     fn timeline(&self) -> &Arc<Timeline> {
         match self {
-            Self::Created(t) | Self::Idempotent(t) => t,
+            Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t,
         }
     }
     /// Unit test timelines aren't activated, test has to do it if it needs to.
     #[cfg(test)]
     fn into_timeline_for_test(self) -> Arc<Timeline> {
         match self {
-            Self::Created(t) | Self::Idempotent(t) => t,
+            Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t,
         }
     }
 }
@@ -962,33 +1003,13 @@ pub enum CreateTimelineError {
 }
 
 #[derive(thiserror::Error, Debug)]
-enum InitdbError {
-    Other(anyhow::Error),
+pub enum InitdbError {
+    #[error("Operation was cancelled")]
     Cancelled,
-    Spawn(std::io::Result<()>),
-    Failed(std::process::ExitStatus, Vec<u8>),
-}
-
-impl fmt::Display for InitdbError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        match self {
-            InitdbError::Cancelled => write!(f, "Operation was cancelled"),
-            InitdbError::Spawn(e) => write!(f, "Spawn error: {:?}", e),
-            InitdbError::Failed(status, stderr) => write!(
-                f,
-                "Command failed with status {:?}: {}",
-                status,
-                String::from_utf8_lossy(stderr)
-            ),
-            InitdbError::Other(e) => write!(f, "Error: {:?}", e),
-        }
-    }
-}
-
-impl From<std::io::Error> for InitdbError {
-    fn from(error: std::io::Error) -> Self {
-        InitdbError::Spawn(Err(error))
-    }
+    #[error(transparent)]
+    Other(anyhow::Error),
+    #[error(transparent)]
+    Inner(postgres_initdb::Error),
 }
 
 enum CreateTimelineCause {
@@ -996,6 +1017,15 @@ enum CreateTimelineCause {
     Delete,
 }
 
+enum LoadTimelineCause {
+    Attach,
+    Unoffload,
+    ImportPgdata {
+        create_guard: TimelineCreateGuard,
+        activate: ActivateTimelineArgs,
+    },
+}
+
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum GcError {
     // The tenant is shutting down
@@ -1072,24 +1102,35 @@ impl Tenant {
     /// it is marked as Active.
     #[allow(clippy::too_many_arguments)]
     async fn timeline_init_and_sync(
-        &self,
+        self: &Arc<Self>,
         timeline_id: TimelineId,
         resources: TimelineResources,
-        index_part: IndexPart,
+        mut index_part: IndexPart,
         metadata: TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
-        _ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+        cause: LoadTimelineCause,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<TimelineInitAndSyncResult> {
         let tenant_id = self.tenant_shard_id;
 
-        let idempotency = if metadata.ancestor_timeline().is_none() {
-            CreateTimelineIdempotency::Bootstrap {
-                pg_version: metadata.pg_version(),
+        let import_pgdata = index_part.import_pgdata.take();
+        let idempotency = match &import_pgdata {
+            Some(import_pgdata) => {
+                CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata {
+                    idempotency_key: import_pgdata.idempotency_key().clone(),
+                })
             }
-        } else {
-            CreateTimelineIdempotency::Branch {
-                ancestor_timeline_id: metadata.ancestor_timeline().unwrap(),
-                ancestor_start_lsn: metadata.ancestor_lsn(),
+            None => {
+                if metadata.ancestor_timeline().is_none() {
+                    CreateTimelineIdempotency::Bootstrap {
+                        pg_version: metadata.pg_version(),
+                    }
+                } else {
+                    CreateTimelineIdempotency::Branch {
+                        ancestor_timeline_id: metadata.ancestor_timeline().unwrap(),
+                        ancestor_start_lsn: metadata.ancestor_lsn(),
+                    }
+                }
             }
         };
 
@@ -1121,39 +1162,91 @@ impl Tenant {
                 format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
             })?;
 
-        {
-            // avoiding holding it across awaits
-            let mut timelines_accessor = self.timelines.lock().unwrap();
-            match timelines_accessor.entry(timeline_id) {
-                // We should never try and load the same timeline twice during startup
-                Entry::Occupied(_) => {
-                    unreachable!(
-                        "Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
-                    );
+        match import_pgdata {
+            Some(import_pgdata) if !import_pgdata.is_done() => {
+                match cause {
+                    LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (),
+                    LoadTimelineCause::ImportPgdata { .. } => {
+                        unreachable!("ImportPgdata should not be reloading timeline import is done and persisted as such in s3")
+                    }
                 }
-                Entry::Vacant(v) => {
-                    v.insert(Arc::clone(&timeline));
-                    timeline.maybe_spawn_flush_loop();
+                let mut guard = self.timelines_creating.lock().unwrap();
+                if !guard.insert(timeline_id) {
+                    // We should never try and load the same timeline twice during startup
+                    unreachable!("Timeline {tenant_id}/{timeline_id} is already being created")
                 }
+                let timeline_create_guard = TimelineCreateGuard {
+                    _tenant_gate_guard: self.gate.enter()?,
+                    owning_tenant: self.clone(),
+                    timeline_id,
+                    idempotency,
+                    // The users of this specific return value don't need the timline_path in there.
+                    timeline_path: timeline
+                        .conf
+                        .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id),
+                };
+                Ok(TimelineInitAndSyncResult::NeedsSpawnImportPgdata(
+                    TimelineInitAndSyncNeedsSpawnImportPgdata {
+                        timeline,
+                        import_pgdata,
+                        guard: timeline_create_guard,
+                    },
+                ))
             }
-        };
+            Some(_) | None => {
+                {
+                    let mut timelines_accessor = self.timelines.lock().unwrap();
+                    match timelines_accessor.entry(timeline_id) {
+                        // We should never try and load the same timeline twice during startup
+                        Entry::Occupied(_) => {
+                            unreachable!(
+                            "Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
+                        );
+                        }
+                        Entry::Vacant(v) => {
+                            v.insert(Arc::clone(&timeline));
+                            timeline.maybe_spawn_flush_loop();
+                        }
+                    }
+                }
 
-        // Sanity check: a timeline should have some content.
-        anyhow::ensure!(
-            ancestor.is_some()
-                || timeline
-                    .layers
-                    .read()
-                    .await
-                    .layer_map()
-                    .expect("currently loading, layer manager cannot be shutdown already")
-                    .iter_historic_layers()
-                    .next()
-                    .is_some(),
-            "Timeline has no ancestor and no layer files"
-        );
+                // Sanity check: a timeline should have some content.
+                anyhow::ensure!(
+                    ancestor.is_some()
+                        || timeline
+                            .layers
+                            .read()
+                            .await
+                            .layer_map()
+                            .expect("currently loading, layer manager cannot be shutdown already")
+                            .iter_historic_layers()
+                            .next()
+                            .is_some(),
+                    "Timeline has no ancestor and no layer files"
+                );
 
-        Ok(())
+                match cause {
+                    LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (),
+                    LoadTimelineCause::ImportPgdata {
+                        create_guard,
+                        activate,
+                    } => {
+                        // TODO: see the comment in the task code above how I'm not so certain
+                        // it is safe to activate here because of concurrent shutdowns.
+                        match activate {
+                            ActivateTimelineArgs::Yes { broker_client } => {
+                                info!("activating timeline after reload from pgdata import task");
+                                timeline.activate(self.clone(), broker_client, None, ctx);
+                            }
+                            ActivateTimelineArgs::No => (),
+                        }
+                        drop(create_guard);
+                    }
+                }
+
+                Ok(TimelineInitAndSyncResult::ReadyToActivate(timeline))
+            }
+        }
     }
 
     /// Attach a tenant that's available in cloud storage.
@@ -1578,24 +1671,46 @@ impl Tenant {
             }
 
             // TODO again handle early failure
-            self.load_remote_timeline(
-                timeline_id,
-                index_part,
-                remote_metadata,
-                TimelineResources {
-                    remote_client,
-                    timeline_get_throttle: self.timeline_get_throttle.clone(),
-                    l0_flush_global_state: self.l0_flush_global_state.clone(),
-                },
-                ctx,
-            )
-            .await
-            .with_context(|| {
-                format!(
-                    "failed to load remote timeline {} for tenant {}",
-                    timeline_id, self.tenant_shard_id
+            let effect = self
+                .load_remote_timeline(
+                    timeline_id,
+                    index_part,
+                    remote_metadata,
+                    TimelineResources {
+                        remote_client,
+                        timeline_get_throttle: self.timeline_get_throttle.clone(),
+                        l0_flush_global_state: self.l0_flush_global_state.clone(),
+                    },
+                    LoadTimelineCause::Attach,
+                    ctx,
                 )
-            })?;
+                .await
+                .with_context(|| {
+                    format!(
+                        "failed to load remote timeline {} for tenant {}",
+                        timeline_id, self.tenant_shard_id
+                    )
+                })?;
+
+            match effect {
+                TimelineInitAndSyncResult::ReadyToActivate(_) => {
+                    // activation happens later, on Tenant::activate
+                }
+                TimelineInitAndSyncResult::NeedsSpawnImportPgdata(
+                    TimelineInitAndSyncNeedsSpawnImportPgdata {
+                        timeline,
+                        import_pgdata,
+                        guard,
+                    },
+                ) => {
+                    tokio::task::spawn(self.clone().create_timeline_import_pgdata_task(
+                        timeline,
+                        import_pgdata,
+                        ActivateTimelineArgs::No,
+                        guard,
+                    ));
+                }
+            }
         }
 
         // Walk through deleted timelines, resume deletion
@@ -1719,13 +1834,14 @@ impl Tenant {
 
     #[instrument(skip_all, fields(timeline_id=%timeline_id))]
     async fn load_remote_timeline(
-        &self,
+        self: &Arc<Self>,
         timeline_id: TimelineId,
         index_part: IndexPart,
         remote_metadata: TimelineMetadata,
         resources: TimelineResources,
+        cause: LoadTimelineCause,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<TimelineInitAndSyncResult> {
         span::debug_assert_current_span_has_tenant_id();
 
         info!("downloading index file for timeline {}", timeline_id);
@@ -1752,6 +1868,7 @@ impl Tenant {
             index_part,
             remote_metadata,
             ancestor,
+            cause,
             ctx,
         )
         .await
@@ -1938,6 +2055,7 @@ impl Tenant {
                     TimelineArchivalError::Other(anyhow::anyhow!("Timeline already exists"))
                 }
                 TimelineExclusionError::Other(e) => TimelineArchivalError::Other(e),
+                TimelineExclusionError::ShuttingDown => TimelineArchivalError::Cancelled,
             })?;
 
         let timeline_preload = self
@@ -1976,6 +2094,7 @@ impl Tenant {
             index_part,
             remote_metadata,
             timeline_resources,
+            LoadTimelineCause::Unoffload,
             &ctx,
         )
         .await
@@ -2213,7 +2332,7 @@ impl Tenant {
     ///
     /// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys.
     pub(crate) async fn create_empty_timeline(
-        &self,
+        self: &Arc<Self>,
         new_timeline_id: TimelineId,
         initdb_lsn: Lsn,
         pg_version: u32,
@@ -2263,7 +2382,7 @@ impl Tenant {
     // Our current tests don't need the background loops.
     #[cfg(test)]
     pub async fn create_test_timeline(
-        &self,
+        self: &Arc<Self>,
         new_timeline_id: TimelineId,
         initdb_lsn: Lsn,
         pg_version: u32,
@@ -2302,7 +2421,7 @@ impl Tenant {
     #[cfg(test)]
     #[allow(clippy::too_many_arguments)]
     pub async fn create_test_timeline_with_layers(
-        &self,
+        self: &Arc<Self>,
         new_timeline_id: TimelineId,
         initdb_lsn: Lsn,
         pg_version: u32,
@@ -2439,6 +2558,16 @@ impl Tenant {
                 self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx)
                     .await?
             }
+            CreateTimelineParams::ImportPgdata(params) => {
+                self.create_timeline_import_pgdata(
+                    params,
+                    ActivateTimelineArgs::Yes {
+                        broker_client: broker_client.clone(),
+                    },
+                    ctx,
+                )
+                .await?
+            }
         };
 
         // At this point we have dropped our guard on [`Self::timelines_creating`], and
@@ -2481,11 +2610,202 @@ impl Tenant {
                 );
                 timeline
             }
+            CreateTimelineResult::ImportSpawned(timeline) => {
+                info!("import task spawned, timeline will become visible and activated once the import is done");
+                timeline
+            }
         };
 
         Ok(activated_timeline)
     }
 
+    /// The returned [`Arc<Timeline>`] is NOT in the [`Tenant::timelines`] map until the import
+    /// completes in the background. A DIFFERENT [`Arc<Timeline>`] will be inserted into the
+    /// [`Tenant::timelines`] map when the import completes.
+    /// We only return an [`Arc<Timeline>`] here so the API handler can create a [`pageserver_api::models::TimelineInfo`]
+    /// for the response.
+    async fn create_timeline_import_pgdata(
+        self: &Arc<Tenant>,
+        params: CreateTimelineParamsImportPgdata,
+        activate: ActivateTimelineArgs,
+        ctx: &RequestContext,
+    ) -> Result<CreateTimelineResult, CreateTimelineError> {
+        let CreateTimelineParamsImportPgdata {
+            new_timeline_id,
+            location,
+            idempotency_key,
+        } = params;
+
+        let started_at = chrono::Utc::now().naive_utc();
+
+        //
+        // There's probably a simpler way to upload an index part, but, remote_timeline_client
+        // is the canonical way we do it.
+        // - create an empty timeline in-memory
+        // - use its remote_timeline_client to do the upload
+        // - dispose of the uninit timeline
+        // - keep the creation guard alive
+
+        let timeline_create_guard = match self
+            .start_creating_timeline(
+                new_timeline_id,
+                CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata {
+                    idempotency_key: idempotency_key.clone(),
+                }),
+            )
+            .await?
+        {
+            StartCreatingTimelineResult::CreateGuard(guard) => guard,
+            StartCreatingTimelineResult::Idempotent(timeline) => {
+                return Ok(CreateTimelineResult::Idempotent(timeline))
+            }
+        };
+
+        let mut uninit_timeline = {
+            let this = &self;
+            let initdb_lsn = Lsn(0);
+            let _ctx = ctx;
+            async move {
+                let new_metadata = TimelineMetadata::new(
+                    // Initialize disk_consistent LSN to 0, The caller must import some data to
+                    // make it valid, before calling finish_creation()
+                    Lsn(0),
+                    None,
+                    None,
+                    Lsn(0),
+                    initdb_lsn,
+                    initdb_lsn,
+                    15,
+                );
+                this.prepare_new_timeline(
+                    new_timeline_id,
+                    &new_metadata,
+                    timeline_create_guard,
+                    initdb_lsn,
+                    None,
+                )
+                .await
+            }
+        }
+        .await?;
+
+        let in_progress = import_pgdata::index_part_format::InProgress {
+            idempotency_key,
+            location,
+            started_at,
+        };
+        let index_part = import_pgdata::index_part_format::Root::V1(
+            import_pgdata::index_part_format::V1::InProgress(in_progress),
+        );
+        uninit_timeline
+            .raw_timeline()
+            .unwrap()
+            .remote_client
+            .schedule_index_upload_for_import_pgdata_state_update(Some(index_part.clone()))?;
+
+        // wait_completion happens in caller
+
+        let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself();
+
+        tokio::spawn(self.clone().create_timeline_import_pgdata_task(
+            timeline.clone(),
+            index_part,
+            activate,
+            timeline_create_guard,
+        ));
+
+        // NB: the timeline doesn't exist in self.timelines at this point
+        Ok(CreateTimelineResult::ImportSpawned(timeline))
+    }
+
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))]
+    async fn create_timeline_import_pgdata_task(
+        self: Arc<Tenant>,
+        timeline: Arc<Timeline>,
+        index_part: import_pgdata::index_part_format::Root,
+        activate: ActivateTimelineArgs,
+        timeline_create_guard: TimelineCreateGuard,
+    ) {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+        info!("starting");
+        scopeguard::defer! {info!("exiting")};
+
+        let res = self
+            .create_timeline_import_pgdata_task_impl(
+                timeline,
+                index_part,
+                activate,
+                timeline_create_guard,
+            )
+            .await;
+        if let Err(err) = &res {
+            error!(?err, "task failed");
+            // TODO sleep & retry, sensitive to tenant shutdown
+            // TODO: allow timeline deletion requests => should cancel the task
+        }
+    }
+
+    async fn create_timeline_import_pgdata_task_impl(
+        self: Arc<Tenant>,
+        timeline: Arc<Timeline>,
+        index_part: import_pgdata::index_part_format::Root,
+        activate: ActivateTimelineArgs,
+        timeline_create_guard: TimelineCreateGuard,
+    ) -> Result<(), anyhow::Error> {
+        let ctx = RequestContext::new(TaskKind::ImportPgdata, DownloadBehavior::Warn);
+
+        info!("importing pgdata");
+        import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone())
+            .await
+            .context("import")?;
+        info!("import done");
+
+        //
+        // Reload timeline from remote.
+        // This proves that the remote state is attachable, and it reuses the code.
+        //
+        // TODO: think about whether this is safe to do with concurrent Tenant::shutdown.
+        // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit.
+        // But our activate() call might launch new background tasks after Tenant::shutdown
+        // already went past shutting down the Tenant::timelines, which this timeline here is no part of.
+        // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting
+        // down while bootstrapping/branching + activating), but, the race condition is much more likely
+        // to manifest because of the long runtime of this import task.
+
+        //        in theory this shouldn't even .await anything except for coop yield
+        info!("shutting down timeline");
+        timeline.shutdown(ShutdownMode::Hard).await;
+        info!("timeline shut down, reloading from remote");
+        // TODO: we can't do the following check because create_timeline_import_pgdata must return an Arc<Timeline>
+        // let Some(timeline) = Arc::into_inner(timeline) else {
+        //     anyhow::bail!("implementation error: timeline that we shut down was still referenced from somewhere");
+        // };
+        let timeline_id = timeline.timeline_id;
+
+        // load from object storage like Tenant::attach does
+        let resources = self.build_timeline_resources(timeline_id);
+        let index_part = resources
+            .remote_client
+            .download_index_file(&self.cancel)
+            .await?;
+        let index_part = match index_part {
+            MaybeDeletedIndexPart::Deleted(_) => {
+                // likely concurrent delete call, cplane should prevent this
+                anyhow::bail!("index part says deleted but we are not done creating yet, this should not happen but")
+            }
+            MaybeDeletedIndexPart::IndexPart(p) => p,
+        };
+        let metadata = index_part.metadata.clone();
+        self
+            .load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{
+                create_guard: timeline_create_guard, activate, }, &ctx)
+            .await?
+            .ready_to_activate()
+            .context("implementation error: reloaded timeline still needs import after import reported success")?;
+
+        anyhow::Ok(())
+    }
+
     pub(crate) async fn delete_timeline(
         self: Arc<Self>,
         timeline_id: TimelineId,
@@ -3337,6 +3657,13 @@ where
     Ok(result)
 }
 
+enum ActivateTimelineArgs {
+    Yes {
+        broker_client: storage_broker::BrokerClientChannel,
+    },
+    No,
+}
+
 impl Tenant {
     pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
         self.tenant_conf.load().tenant_conf.clone()
@@ -3520,6 +3847,7 @@ impl Tenant {
     /// `validate_ancestor == false` is used when a timeline is created for deletion
     /// and we might not have the ancestor present anymore which is fine for to be
     /// deleted timelines.
+    #[allow(clippy::too_many_arguments)]
     fn create_timeline_struct(
         &self,
         new_timeline_id: TimelineId,
@@ -4283,16 +4611,17 @@ impl Tenant {
     /// If the timeline was already created in the meantime, we check whether this
     /// request conflicts or is idempotent , based on `state`.
     async fn start_creating_timeline(
-        &self,
+        self: &Arc<Self>,
         new_timeline_id: TimelineId,
         idempotency: CreateTimelineIdempotency,
-    ) -> Result<StartCreatingTimelineResult<'_>, CreateTimelineError> {
+    ) -> Result<StartCreatingTimelineResult, CreateTimelineError> {
         let allow_offloaded = false;
         match self.create_timeline_create_guard(new_timeline_id, idempotency, allow_offloaded) {
             Ok(create_guard) => {
                 pausable_failpoint!("timeline-creation-after-uninit");
                 Ok(StartCreatingTimelineResult::CreateGuard(create_guard))
             }
+            Err(TimelineExclusionError::ShuttingDown) => Err(CreateTimelineError::ShuttingDown),
             Err(TimelineExclusionError::AlreadyCreating) => {
                 // Creation is in progress, we cannot create it again, and we cannot
                 // check if this request matches the existing one, so caller must try
@@ -4582,7 +4911,7 @@ impl Tenant {
         &'a self,
         new_timeline_id: TimelineId,
         new_metadata: &TimelineMetadata,
-        create_guard: TimelineCreateGuard<'a>,
+        create_guard: TimelineCreateGuard,
         start_lsn: Lsn,
         ancestor: Option<Arc<Timeline>>,
     ) -> anyhow::Result<UninitializedTimeline<'a>> {
@@ -4642,7 +4971,7 @@ impl Tenant {
     /// The `allow_offloaded` parameter controls whether to tolerate the existence of
     /// offloaded timelines or not.
     fn create_timeline_create_guard(
-        &self,
+        self: &Arc<Self>,
         timeline_id: TimelineId,
         idempotency: CreateTimelineIdempotency,
         allow_offloaded: bool,
@@ -4902,48 +5231,16 @@ async fn run_initdb(
 
     let _permit = INIT_DB_SEMAPHORE.acquire().await;
 
-    let mut initdb_command = tokio::process::Command::new(&initdb_bin_path);
-    initdb_command
-        .args(["--pgdata", initdb_target_dir.as_ref()])
-        .args(["--username", &conf.superuser])
-        .args(["--encoding", "utf8"])
-        .args(["--locale", &conf.locale])
-        .arg("--no-instructions")
-        .arg("--no-sync")
-        .env_clear()
-        .env("LD_LIBRARY_PATH", &initdb_lib_dir)
-        .env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
-        .stdin(std::process::Stdio::null())
-        // stdout invocation produces the same output every time, we don't need it
-        .stdout(std::process::Stdio::null())
-        // we would be interested in the stderr output, if there was any
-        .stderr(std::process::Stdio::piped());
-
-    // Before version 14, only the libc provide was available.
-    if pg_version > 14 {
-        // Version 17 brought with it a builtin locale provider which only provides
-        // C and C.UTF-8. While being safer for collation purposes since it is
-        // guaranteed to be consistent throughout a major release, it is also more
-        // performant.
-        let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
-
-        initdb_command.args(["--locale-provider", locale_provider]);
-    }
-
-    let initdb_proc = initdb_command.spawn()?;
-
-    // Ideally we'd select here with the cancellation token, but the problem is that
-    // we can't safely terminate initdb: it launches processes of its own, and killing
-    // initdb doesn't kill them. After we return from this function, we want the target
-    // directory to be able to be cleaned up.
-    // See https://github.com/neondatabase/neon/issues/6385
-    let initdb_output = initdb_proc.wait_with_output().await?;
-    if !initdb_output.status.success() {
-        return Err(InitdbError::Failed(
-            initdb_output.status,
-            initdb_output.stderr,
-        ));
-    }
+    let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
+        superuser: &conf.superuser,
+        locale: &conf.locale,
+        initdb_bin: &initdb_bin_path,
+        pg_version,
+        library_search_path: &initdb_lib_dir,
+        pgdata: initdb_target_dir,
+    })
+    .await
+    .map_err(InitdbError::Inner);
 
     // This isn't true cancellation support, see above. Still return an error to
     // excercise the cancellation code path.
@@ -4951,7 +5248,7 @@ async fn run_initdb(
         return Err(InitdbError::Cancelled);
     }
 
-    Ok(())
+    res
 }
 
 /// Dump contents of a layer file to stdout.
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 4c88282214..007bd3eef0 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -244,6 +244,7 @@ use self::index::IndexPart;
 use super::config::AttachedLocationConfig;
 use super::metadata::MetadataUpdate;
 use super::storage_layer::{Layer, LayerName, ResidentLayer};
+use super::timeline::import_pgdata;
 use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
 use super::{DeleteTimelineError, Generation};
 
@@ -813,6 +814,18 @@ impl RemoteTimelineClient {
         Ok(need_wait)
     }
 
+    /// Launch an index-file upload operation in the background, setting `import_pgdata` field.
+    pub(crate) fn schedule_index_upload_for_import_pgdata_state_update(
+        self: &Arc<Self>,
+        state: Option<import_pgdata::index_part_format::Root>,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        upload_queue.dirty.import_pgdata = state;
+        self.schedule_index_upload(upload_queue)?;
+        Ok(())
+    }
+
     ///
     /// Launch an index-file upload operation in the background, if necessary.
     ///
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index efcd20d1bf..d632e595ad 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -706,7 +706,7 @@ where
     .and_then(|x| x)
 }
 
-async fn download_retry_forever<T, O, F>(
+pub(crate) async fn download_retry_forever<T, O, F>(
     op: O,
     description: &str,
     cancel: &CancellationToken,
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index d8a881a2c4..506990fb2f 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -12,6 +12,7 @@ use utils::id::TimelineId;
 
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerName;
+use crate::tenant::timeline::import_pgdata;
 use crate::tenant::Generation;
 use pageserver_api::shard::ShardIndex;
 
@@ -37,6 +38,13 @@ pub struct IndexPart {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub archived_at: Option<NaiveDateTime>,
 
+    /// This field supports import-from-pgdata ("fast imports" platform feature).
+    /// We don't currently use fast imports, so, this field is None for all production timelines.
+    /// See <https://github.com/neondatabase/neon/pull/9218> for more information.
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub import_pgdata: Option<import_pgdata::index_part_format::Root>,
+
     /// Per layer file name metadata, which can be present for a present or missing layer file.
     ///
     /// Older versions of `IndexPart` will not have this property or have only a part of metadata
@@ -90,10 +98,11 @@ impl IndexPart {
     /// - 7: metadata_bytes is no longer written, but still read
     /// - 8: added `archived_at`
     /// - 9: +gc_blocking
-    const LATEST_VERSION: usize = 9;
+    /// - 10: +import_pgdata
+    const LATEST_VERSION: usize = 10;
 
     // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
 
     pub const FILE_NAME: &'static str = "index_part.json";
 
@@ -108,6 +117,7 @@ impl IndexPart {
             lineage: Default::default(),
             gc_blocking: None,
             last_aux_file_policy: None,
+            import_pgdata: None,
         }
     }
 
@@ -381,6 +391,7 @@ mod tests {
             lineage: Lineage::default(),
             gc_blocking: None,
             last_aux_file_policy: None,
+            import_pgdata: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -425,6 +436,7 @@ mod tests {
             lineage: Lineage::default(),
             gc_blocking: None,
             last_aux_file_policy: None,
+            import_pgdata: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -470,6 +482,7 @@ mod tests {
             lineage: Lineage::default(),
             gc_blocking: None,
             last_aux_file_policy: None,
+            import_pgdata: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -518,6 +531,7 @@ mod tests {
             lineage: Lineage::default(),
             gc_blocking: None,
             last_aux_file_policy: None,
+            import_pgdata: None,
         };
 
         let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -561,6 +575,7 @@ mod tests {
             lineage: Lineage::default(),
             gc_blocking: None,
             last_aux_file_policy: None,
+            import_pgdata: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -607,6 +622,7 @@ mod tests {
             },
             gc_blocking: None,
             last_aux_file_policy: None,
+            import_pgdata: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -658,6 +674,7 @@ mod tests {
             },
             gc_blocking: None,
             last_aux_file_policy: Some(AuxFilePolicy::V2),
+            import_pgdata: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -714,6 +731,7 @@ mod tests {
             lineage: Default::default(),
             gc_blocking: None,
             last_aux_file_policy: Default::default(),
+            import_pgdata: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -771,6 +789,7 @@ mod tests {
             lineage: Default::default(),
             gc_blocking: None,
             last_aux_file_policy: Default::default(),
+            import_pgdata: None,
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -833,6 +852,83 @@ mod tests {
             }),
             last_aux_file_policy: Default::default(),
             archived_at: None,
+            import_pgdata: None,
+        };
+
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
+        assert_eq!(part, expected);
+    }
+
+    #[test]
+    fn v10_importpgdata_is_parsed() {
+        let example = r#"{
+            "version": 10,
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
+            },
+            "disk_consistent_lsn":"0/16960E8",
+            "metadata": {
+                "disk_consistent_lsn": "0/16960E8",
+                "prev_record_lsn": "0/1696070",
+                "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
+                "ancestor_lsn": "0/0",
+                "latest_gc_cutoff_lsn": "0/1696070",
+                "initdb_lsn": "0/1696070",
+                "pg_version": 14
+            },
+            "gc_blocking": {
+                "started_at": "2024-07-19T09:00:00.123",
+                "reasons": ["DetachAncestor"]
+            },
+            "import_pgdata": {
+                "V1": {
+                    "Done": {
+                        "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5",
+                        "started_at": "2024-11-13T09:23:42.123",
+                        "finished_at": "2024-11-13T09:42:23.123"
+                    }
+                }
+            }
+        }"#;
+
+        let expected = IndexPart {
+            version: 10,
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                    file_size: 25600000,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                    file_size: 9007199254741001,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                })
+            ]),
+            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
+            metadata: TimelineMetadata::new(
+                Lsn::from_str("0/16960E8").unwrap(),
+                Some(Lsn::from_str("0/1696070").unwrap()),
+                Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
+                Lsn::INVALID,
+                Lsn::from_str("0/1696070").unwrap(),
+                Lsn::from_str("0/1696070").unwrap(),
+                14,
+            ).with_recalculated_checksum().unwrap(),
+            deleted_at: None,
+            lineage: Default::default(),
+            gc_blocking: Some(GcBlocking {
+                started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
+                reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
+            }),
+            last_aux_file_policy: Default::default(),
+            archived_at: None,
+            import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{
+                started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"),
+                finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"),
+                idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
+            })))
         };
 
         let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d1285e7c8a..4881be33a6 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4,6 +4,7 @@ pub mod delete;
 pub(crate) mod detach_ancestor;
 mod eviction_task;
 pub(crate) mod handle;
+pub(crate) mod import_pgdata;
 mod init;
 pub mod layer_manager;
 pub(crate) mod logical_size;
@@ -2708,20 +2709,23 @@ impl Timeline {
                 {
                     Some(cancel) => cancel.cancel(),
                     None => {
-                        let state = self.current_state();
-                        if matches!(
-                            state,
-                            TimelineState::Broken { .. } | TimelineState::Stopping
-                        ) {
-
-                            // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
-                            // Don't make noise.
-                        } else {
-                            warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
-                            debug_assert!(false);
+                        match self.current_state() {
+                            TimelineState::Broken { .. } | TimelineState::Stopping => {
+                                // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
+                                // Don't make noise.
+                            }
+                            TimelineState::Loading => {
+                                // Import does not return an activated timeline.
+                                info!("discarding priority boost for logical size calculation because timeline is not yet active");
+                            }
+                            TimelineState::Active => {
+                                // activation should be setting the once cell
+                                warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
+                                debug_assert!(false);
+                            }
                         }
                     }
-                };
+                }
             }
         }
 
diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs
new file mode 100644
index 0000000000..de56468580
--- /dev/null
+++ b/pageserver/src/tenant/timeline/import_pgdata.rs
@@ -0,0 +1,218 @@
+use std::sync::Arc;
+
+use anyhow::{bail, Context};
+use remote_storage::RemotePath;
+use tokio_util::sync::CancellationToken;
+use tracing::{info, info_span, Instrument};
+use utils::lsn::Lsn;
+
+use crate::{context::RequestContext, tenant::metadata::TimelineMetadata};
+
+use super::Timeline;
+
+mod flow;
+mod importbucket_client;
+mod importbucket_format;
+pub(crate) mod index_part_format;
+pub(crate) mod upcall_api;
+
+pub async fn doit(
+    timeline: &Arc<Timeline>,
+    index_part: index_part_format::Root,
+    ctx: &RequestContext,
+    cancel: CancellationToken,
+) -> anyhow::Result<()> {
+    let index_part_format::Root::V1(v1) = index_part;
+    let index_part_format::InProgress {
+        location,
+        idempotency_key,
+        started_at,
+    } = match v1 {
+        index_part_format::V1::Done(_) => return Ok(()),
+        index_part_format::V1::InProgress(in_progress) => in_progress,
+    };
+
+    let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?;
+
+    info!("get spec early so we know we'll be able to upcall when done");
+    let Some(spec) = storage.get_spec().await? else {
+        bail!("spec not found")
+    };
+
+    let upcall_client =
+        upcall_api::Client::new(timeline.conf, cancel.clone()).context("create upcall client")?;
+
+    //
+    // send an early progress update to clean up k8s job early and generate potentially useful logs
+    //
+    info!("send early progress update");
+    upcall_client
+        .send_progress_until_success(&spec)
+        .instrument(info_span!("early_progress_update"))
+        .await?;
+
+    let status_prefix = RemotePath::from_string("status").unwrap();
+
+    //
+    // See if shard is done.
+    // TODO: incorporate generations into status key for split brain safety. Figure out together with checkpointing.
+    //
+    let shard_status_key =
+        status_prefix.join(format!("shard-{}", timeline.tenant_shard_id.shard_slug()));
+    let shard_status: Option<importbucket_format::ShardStatus> =
+        storage.get_json(&shard_status_key).await?;
+    info!(?shard_status, "peeking shard status");
+    if shard_status.map(|st| st.done).unwrap_or(false) {
+        info!("shard status indicates that the shard is done, skipping import");
+    } else {
+        // TODO: checkpoint the progress into the IndexPart instead of restarting
+        // from the beginning.
+
+        //
+        // Wipe the slate clean - the flow does not allow resuming.
+        // We can implement resuming in the future by checkpointing the progress into the IndexPart.
+        //
+        info!("wipe the slate clean");
+        {
+            // TODO: do we need to hold GC lock for this?
+            let mut guard = timeline.layers.write().await;
+            assert!(
+                guard.layer_map()?.open_layer.is_none(),
+                "while importing, there should be no in-memory layer" // this just seems like a good place to assert it
+            );
+            let all_layers_keys = guard.all_persistent_layers();
+            let all_layers: Vec<_> = all_layers_keys
+                .iter()
+                .map(|key| guard.get_from_key(key))
+                .collect();
+            let open = guard.open_mut().context("open_mut")?;
+
+            timeline.remote_client.schedule_gc_update(&all_layers)?;
+            open.finish_gc_timeline(&all_layers);
+        }
+
+        //
+        // Wait for pgdata to finish uploading
+        //
+        info!("wait for pgdata to reach status 'done'");
+        let pgdata_status_key = status_prefix.join("pgdata");
+        loop {
+            let res = async {
+                let pgdata_status: Option<importbucket_format::PgdataStatus> = storage
+                    .get_json(&pgdata_status_key)
+                    .await
+                    .context("get pgdata status")?;
+                info!(?pgdata_status, "peeking pgdata status");
+                if pgdata_status.map(|st| st.done).unwrap_or(false) {
+                    Ok(())
+                } else {
+                    Err(anyhow::anyhow!("pgdata not done yet"))
+                }
+            }
+            .await;
+            match res {
+                Ok(_) => break,
+                Err(err) => {
+                    info!(?err, "indefintely waiting for pgdata to finish");
+                    if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled())
+                        .await
+                        .is_ok()
+                    {
+                        bail!("cancelled while waiting for pgdata");
+                    }
+                }
+            }
+        }
+
+        //
+        // Do the import
+        //
+        info!("do the import");
+        let control_file = storage.get_control_file().await?;
+        let base_lsn = control_file.base_lsn();
+
+        info!("update TimelineMetadata based on LSNs from control file");
+        {
+            let pg_version = control_file.pg_version();
+            let _ctx: &RequestContext = ctx;
+            async move {
+                // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
+                // checkpoint record, and prev_record_lsn should point to its beginning.
+                // We should read the real end of the record from the WAL, but here we
+                // just fake it.
+                let disk_consistent_lsn = Lsn(base_lsn.0 + 8);
+                let prev_record_lsn = base_lsn;
+                let metadata = TimelineMetadata::new(
+                    disk_consistent_lsn,
+                    Some(prev_record_lsn),
+                    None,     // no ancestor
+                    Lsn(0),   // no ancestor lsn
+                    base_lsn, // latest_gc_cutoff_lsn
+                    base_lsn, // initdb_lsn
+                    pg_version,
+                );
+
+                let _start_lsn = disk_consistent_lsn + 1;
+
+                timeline
+                    .remote_client
+                    .schedule_index_upload_for_full_metadata_update(&metadata)?;
+
+                timeline.remote_client.wait_completion().await?;
+
+                anyhow::Ok(())
+            }
+        }
+        .await?;
+
+        flow::run(
+            timeline.clone(),
+            base_lsn,
+            control_file,
+            storage.clone(),
+            ctx,
+        )
+        .await?;
+
+        //
+        // Communicate that shard is done.
+        //
+        storage
+            .put_json(
+                &shard_status_key,
+                &importbucket_format::ShardStatus { done: true },
+            )
+            .await
+            .context("put shard status")?;
+    }
+
+    //
+    // Ensure at-least-once deliver of the upcall to cplane
+    // before we mark the task as done and never come here again.
+    //
+    info!("send final progress update");
+    upcall_client
+        .send_progress_until_success(&spec)
+        .instrument(info_span!("final_progress_update"))
+        .await?;
+
+    //
+    // Mark as done in index_part.
+    // This makes subsequent timeline loads enter the normal load code path
+    // instead of spawning the import task and calling this here function.
+    //
+    info!("mark import as complete in index part");
+    timeline
+        .remote_client
+        .schedule_index_upload_for_import_pgdata_state_update(Some(index_part_format::Root::V1(
+            index_part_format::V1::Done(index_part_format::Done {
+                idempotency_key,
+                started_at,
+                finished_at: chrono::Utc::now().naive_utc(),
+            }),
+        )))?;
+
+    timeline.remote_client.wait_completion().await?;
+
+    Ok(())
+}
diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
new file mode 100644
index 0000000000..cbd4168c06
--- /dev/null
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -0,0 +1,798 @@
+//! Import a PGDATA directory into an empty root timeline.
+//!
+//! This module is adapted hackathon code by Heikki and Stas.
+//! Other code in the parent module was written by Christian as part of a customer PoC.
+//!
+//! The hackathon code was producing image layer files as a free-standing program.
+//!
+//! It has been modified to
+//! - run inside a running Pageserver, within the proper lifecycles of Timeline -> Tenant(Shard)
+//! - => sharding-awareness: produce image layers with only the data relevant for this shard
+//! - => S3 as the source for the PGDATA instead of local filesystem
+//!
+//! TODOs before productionization:
+//! - ChunkProcessingJob size / ImportJob::total_size does not account for sharding.
+//!   => produced image layers likely too small.
+//! - ChunkProcessingJob should cut up an ImportJob to hit exactly target image layer size.
+//! - asserts / unwraps need to be replaced with errors
+//! - don't trust remote objects will be small (=prevent OOMs in those cases)
+//!     - limit all in-memory buffers in size, or download to disk and read from there
+//! - limit task concurrency
+//! - generally play nice with other tenants in the system
+//!   - importbucket is different bucket than main pageserver storage, so, should be fine wrt S3 rate limits
+//!   - but concerns like network bandwidth, local disk write bandwidth, local disk capacity, etc
+//! - integrate with layer eviction system
+//! - audit for Tenant::cancel nor Timeline::cancel responsivity
+//! - audit for Tenant/Timeline gate holding (we spawn tokio tasks during this flow!)
+//!
+//! An incomplete set of TODOs from the Hackathon:
+//! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest)
+
+use std::sync::Arc;
+
+use anyhow::{bail, ensure};
+use bytes::Bytes;
+
+use itertools::Itertools;
+use pageserver_api::{
+    key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY},
+    reltag::RelTag,
+    shard::ShardIdentity,
+};
+use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, BLCKSZ};
+use tokio::task::JoinSet;
+use tracing::{debug, info_span, instrument, Instrument};
+
+use crate::{
+    assert_u64_eq_usize::UsizeIsU64,
+    pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory},
+};
+use crate::{
+    context::{DownloadBehavior, RequestContext},
+    pgdatadir_mapping::{DbDirectory, RelDirectory},
+    task_mgr::TaskKind,
+    tenant::storage_layer::{ImageLayerWriter, Layer},
+};
+
+use pageserver_api::key::Key;
+use pageserver_api::key::{
+    slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, CHECKPOINT_KEY, CONTROLFILE_KEY,
+    TWOPHASEDIR_KEY,
+};
+use pageserver_api::keyspace::singleton_range;
+use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range};
+use pageserver_api::reltag::SlruKind;
+use utils::bin_ser::BeSer;
+use utils::lsn::Lsn;
+
+use std::collections::HashSet;
+use std::ops::Range;
+
+use super::{
+    importbucket_client::{ControlFile, RemoteStorageWrapper},
+    Timeline,
+};
+
+use remote_storage::RemotePath;
+
+pub async fn run(
+    timeline: Arc<Timeline>,
+    pgdata_lsn: Lsn,
+    control_file: ControlFile,
+    storage: RemoteStorageWrapper,
+    ctx: &RequestContext,
+) -> anyhow::Result<()> {
+    Flow {
+        timeline,
+        pgdata_lsn,
+        control_file,
+        tasks: Vec::new(),
+        storage,
+    }
+    .run(ctx)
+    .await
+}
+
+struct Flow {
+    timeline: Arc<Timeline>,
+    pgdata_lsn: Lsn,
+    control_file: ControlFile,
+    tasks: Vec<AnyImportTask>,
+    storage: RemoteStorageWrapper,
+}
+
+impl Flow {
+    /// Perform the ingestion into [`Self::timeline`].
+    /// Assumes the timeline is empty (= no layers).
+    pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> {
+        let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align();
+
+        self.pgdata_lsn = pgdata_lsn;
+
+        let datadir = PgDataDir::new(&self.storage).await?;
+
+        // Import dbdir (00:00:00 keyspace)
+        // This is just constructed here, but will be written to the image layer in the first call to import_db()
+        let dbdir_buf = Bytes::from(DbDirectory::ser(&DbDirectory {
+            dbdirs: datadir
+                .dbs
+                .iter()
+                .map(|db| ((db.spcnode, db.dboid), true))
+                .collect(),
+        })?);
+        self.tasks
+            .push(ImportSingleKeyTask::new(DBDIR_KEY, dbdir_buf).into());
+
+        // Import databases (00:spcnode:dbnode keyspace for each db)
+        for db in datadir.dbs {
+            self.import_db(&db).await?;
+        }
+
+        // Import SLRUs
+
+        // pg_xact (01:00 keyspace)
+        self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact"))
+            .await?;
+        // pg_multixact/members (01:01 keyspace)
+        self.import_slru(
+            SlruKind::MultiXactMembers,
+            &self.storage.pgdata().join("pg_multixact/members"),
+        )
+        .await?;
+        // pg_multixact/offsets (01:02 keyspace)
+        self.import_slru(
+            SlruKind::MultiXactOffsets,
+            &self.storage.pgdata().join("pg_multixact/offsets"),
+        )
+        .await?;
+
+        // Import pg_twophase.
+        // TODO: as empty
+        let twophasedir_buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
+            xids: HashSet::new(),
+        })?;
+        self.tasks
+            .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                TWOPHASEDIR_KEY,
+                Bytes::from(twophasedir_buf),
+            )));
+
+        // Controlfile, checkpoint
+        self.tasks
+            .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                CONTROLFILE_KEY,
+                self.control_file.control_file_buf().clone(),
+            )));
+
+        let checkpoint_buf = self
+            .control_file
+            .control_file_data()
+            .checkPointCopy
+            .encode()?;
+        self.tasks
+            .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                CHECKPOINT_KEY,
+                checkpoint_buf,
+            )));
+
+        // Assigns parts of key space to later parallel jobs
+        let mut last_end_key = Key::MIN;
+        let mut current_chunk = Vec::new();
+        let mut current_chunk_size: usize = 0;
+        let mut parallel_jobs = Vec::new();
+        for task in std::mem::take(&mut self.tasks).into_iter() {
+            if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 {
+                let key_range = last_end_key..task.key_range().start;
+                parallel_jobs.push(ChunkProcessingJob::new(
+                    key_range.clone(),
+                    std::mem::take(&mut current_chunk),
+                    &self,
+                ));
+                last_end_key = key_range.end;
+                current_chunk_size = 0;
+            }
+            current_chunk_size += task.total_size();
+            current_chunk.push(task);
+        }
+        parallel_jobs.push(ChunkProcessingJob::new(
+            last_end_key..Key::MAX,
+            current_chunk,
+            &self,
+        ));
+
+        // Start all jobs simultaneosly
+        let mut work = JoinSet::new();
+        // TODO: semaphore?
+        for job in parallel_jobs {
+            let ctx: RequestContext =
+                ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error);
+            work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job")));
+        }
+        let mut results = Vec::new();
+        while let Some(result) = work.join_next().await {
+            match result {
+                Ok(res) => {
+                    results.push(res);
+                }
+                Err(_joinset_err) => {
+                    results.push(Err(anyhow::anyhow!(
+                        "parallel job panicked or cancelled, check pageserver logs"
+                    )));
+                }
+            }
+        }
+
+        if results.iter().all(|r| r.is_ok()) {
+            Ok(())
+        } else {
+            let mut msg = String::new();
+            for result in results {
+                if let Err(err) = result {
+                    msg.push_str(&format!("{err:?}\n\n"));
+                }
+            }
+            bail!("Some parallel jobs failed:\n\n{msg}");
+        }
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))]
+    async fn import_db(&mut self, db: &PgDataDirDb) -> anyhow::Result<()> {
+        debug!("start");
+        scopeguard::defer! {
+            debug!("return");
+        }
+
+        // Import relmap (00:spcnode:dbnode:00:*:00)
+        let relmap_key = relmap_file_key(db.spcnode, db.dboid);
+        debug!("Constructing relmap entry, key {relmap_key}");
+        let relmap_path = db.path.join("pg_filenode.map");
+        let relmap_buf = self.storage.get(&relmap_path).await?;
+        self.tasks
+            .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                relmap_key, relmap_buf,
+            )));
+
+        // Import reldir (00:spcnode:dbnode:00:*:01)
+        let reldir_key = rel_dir_to_key(db.spcnode, db.dboid);
+        debug!("Constructing reldirs entry, key {reldir_key}");
+        let reldir_buf = RelDirectory::ser(&RelDirectory {
+            rels: db
+                .files
+                .iter()
+                .map(|f| (f.rel_tag.relnode, f.rel_tag.forknum))
+                .collect(),
+        })?;
+        self.tasks
+            .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                reldir_key,
+                Bytes::from(reldir_buf),
+            )));
+
+        // Import data (00:spcnode:dbnode:reloid:fork:blk) and set sizes for each last
+        // segment in a given relation (00:spcnode:dbnode:reloid:fork:ff)
+        for file in &db.files {
+            debug!(%file.path, %file.filesize, "importing file");
+            let len = file.filesize;
+            ensure!(len % 8192 == 0);
+            let start_blk: u32 = file.segno * (1024 * 1024 * 1024 / 8192);
+            let start_key = rel_block_to_key(file.rel_tag, start_blk);
+            let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32);
+            self.tasks
+                .push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new(
+                    *self.timeline.get_shard_identity(),
+                    start_key..end_key,
+                    &file.path,
+                    self.storage.clone(),
+                )));
+
+            // Set relsize for the last segment (00:spcnode:dbnode:reloid:fork:ff)
+            if let Some(nblocks) = file.nblocks {
+                let size_key = rel_size_to_key(file.rel_tag);
+                //debug!("Setting relation size (path={path}, rel_tag={rel_tag}, segno={segno}) to {nblocks}, key {size_key}");
+                let buf = nblocks.to_le_bytes();
+                self.tasks
+                    .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                        size_key,
+                        Bytes::from(buf.to_vec()),
+                    )));
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> {
+        let segments = self.storage.listfilesindir(path).await?;
+        let segments: Vec<(String, u32, usize)> = segments
+            .into_iter()
+            .filter_map(|(path, size)| {
+                let filename = path.object_name()?;
+                let segno = u32::from_str_radix(filename, 16).ok()?;
+                Some((filename.to_string(), segno, size))
+            })
+            .collect();
+
+        // Write SlruDir
+        let slrudir_key = slru_dir_to_key(kind);
+        let segnos: HashSet<u32> = segments
+            .iter()
+            .map(|(_path, segno, _size)| *segno)
+            .collect();
+        let slrudir = SlruSegmentDirectory { segments: segnos };
+        let slrudir_buf = SlruSegmentDirectory::ser(&slrudir)?;
+        self.tasks
+            .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                slrudir_key,
+                Bytes::from(slrudir_buf),
+            )));
+
+        for (segpath, segno, size) in segments {
+            // SlruSegBlocks for each segment
+            let p = path.join(&segpath);
+            let file_size = size;
+            ensure!(file_size % 8192 == 0);
+            let nblocks = u32::try_from(file_size / 8192)?;
+            let start_key = slru_block_to_key(kind, segno, 0);
+            let end_key = slru_block_to_key(kind, segno, nblocks);
+            debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment");
+            self.tasks
+                .push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new(
+                    *self.timeline.get_shard_identity(),
+                    start_key..end_key,
+                    &p,
+                    self.storage.clone(),
+                )));
+
+            // Followed by SlruSegSize
+            let segsize_key = slru_segment_size_to_key(kind, segno);
+            let segsize_buf = nblocks.to_le_bytes();
+            self.tasks
+                .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                    segsize_key,
+                    Bytes::copy_from_slice(&segsize_buf),
+                )));
+        }
+        Ok(())
+    }
+}
+
+//
+// dbdir iteration tools
+//
+
+struct PgDataDir {
+    pub dbs: Vec<PgDataDirDb>, // spcnode, dboid, path
+}
+
+struct PgDataDirDb {
+    pub spcnode: u32,
+    pub dboid: u32,
+    pub path: RemotePath,
+    pub files: Vec<PgDataDirDbFile>,
+}
+
+struct PgDataDirDbFile {
+    pub path: RemotePath,
+    pub rel_tag: RelTag,
+    pub segno: u32,
+    pub filesize: usize,
+    // Cummulative size of the given fork, set only for the last segment of that fork
+    pub nblocks: Option<usize>,
+}
+
+impl PgDataDir {
+    async fn new(storage: &RemoteStorageWrapper) -> anyhow::Result<Self> {
+        let datadir_path = storage.pgdata();
+        // Import ordinary databases, DEFAULTTABLESPACE_OID is smaller than GLOBALTABLESPACE_OID, so import them first
+        // Traverse database in increasing oid order
+
+        let basedir = &datadir_path.join("base");
+        let db_oids: Vec<_> = storage
+            .listdir(basedir)
+            .await?
+            .into_iter()
+            .filter_map(|path| path.object_name().and_then(|name| name.parse::<u32>().ok()))
+            .sorted()
+            .collect();
+        debug!(?db_oids, "found databases");
+        let mut databases = Vec::new();
+        for dboid in db_oids {
+            databases.push(
+                PgDataDirDb::new(
+                    storage,
+                    &basedir.join(dboid.to_string()),
+                    pg_constants::DEFAULTTABLESPACE_OID,
+                    dboid,
+                    &datadir_path,
+                )
+                .await?,
+            );
+        }
+
+        // special case for global catalogs
+        databases.push(
+            PgDataDirDb::new(
+                storage,
+                &datadir_path.join("global"),
+                postgres_ffi::pg_constants::GLOBALTABLESPACE_OID,
+                0,
+                &datadir_path,
+            )
+            .await?,
+        );
+
+        databases.sort_by_key(|db| (db.spcnode, db.dboid));
+
+        Ok(Self { dbs: databases })
+    }
+}
+
+impl PgDataDirDb {
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%dboid, %db_path))]
+    async fn new(
+        storage: &RemoteStorageWrapper,
+        db_path: &RemotePath,
+        spcnode: u32,
+        dboid: u32,
+        datadir_path: &RemotePath,
+    ) -> anyhow::Result<Self> {
+        let mut files: Vec<PgDataDirDbFile> = storage
+            .listfilesindir(db_path)
+            .await?
+            .into_iter()
+            .filter_map(|(path, size)| {
+                debug!(%path, %size, "found file in dbdir");
+                path.object_name().and_then(|name| {
+                    // returns (relnode, forknum, segno)
+                    parse_relfilename(name).ok().map(|x| (size, x))
+                })
+            })
+            .sorted_by_key(|(_, relfilename)| *relfilename)
+            .map(|(filesize, (relnode, forknum, segno))| {
+                let rel_tag = RelTag {
+                    spcnode,
+                    dbnode: dboid,
+                    relnode,
+                    forknum,
+                };
+
+                let path = datadir_path.join(rel_tag.to_segfile_name(segno));
+                assert!(filesize % BLCKSZ as usize == 0); // TODO: this should result in an error
+                let nblocks = filesize / BLCKSZ as usize;
+
+                PgDataDirDbFile {
+                    path,
+                    filesize,
+                    rel_tag,
+                    segno,
+                    nblocks: Some(nblocks), // first non-cummulative sizes
+                }
+            })
+            .collect();
+
+        // Set cummulative sizes. Do all of that math here, so that later we could easier
+        // parallelize over segments and know with which segments we need to write relsize
+        // entry.
+        let mut cumulative_nblocks: usize = 0;
+        let mut prev_rel_tag: Option<RelTag> = None;
+        for i in 0..files.len() {
+            if prev_rel_tag == Some(files[i].rel_tag) {
+                cumulative_nblocks += files[i].nblocks.unwrap();
+            } else {
+                cumulative_nblocks = files[i].nblocks.unwrap();
+            }
+
+            files[i].nblocks = if i == files.len() - 1 || files[i + 1].rel_tag != files[i].rel_tag {
+                Some(cumulative_nblocks)
+            } else {
+                None
+            };
+
+            prev_rel_tag = Some(files[i].rel_tag);
+        }
+
+        Ok(PgDataDirDb {
+            files,
+            path: db_path.clone(),
+            spcnode,
+            dboid,
+        })
+    }
+}
+
+trait ImportTask {
+    fn key_range(&self) -> Range<Key>;
+
+    fn total_size(&self) -> usize {
+        // TODO: revisit this
+        if is_contiguous_range(&self.key_range()) {
+            contiguous_range_len(&self.key_range()) as usize * 8192
+        } else {
+            u32::MAX as usize
+        }
+    }
+
+    async fn doit(
+        self,
+        layer_writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize>;
+}
+
+struct ImportSingleKeyTask {
+    key: Key,
+    buf: Bytes,
+}
+
+impl ImportSingleKeyTask {
+    fn new(key: Key, buf: Bytes) -> Self {
+        ImportSingleKeyTask { key, buf }
+    }
+}
+
+impl ImportTask for ImportSingleKeyTask {
+    fn key_range(&self) -> Range<Key> {
+        singleton_range(self.key)
+    }
+
+    async fn doit(
+        self,
+        layer_writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        layer_writer.put_image(self.key, self.buf, ctx).await?;
+        Ok(1)
+    }
+}
+
+struct ImportRelBlocksTask {
+    shard_identity: ShardIdentity,
+    key_range: Range<Key>,
+    path: RemotePath,
+    storage: RemoteStorageWrapper,
+}
+
+impl ImportRelBlocksTask {
+    fn new(
+        shard_identity: ShardIdentity,
+        key_range: Range<Key>,
+        path: &RemotePath,
+        storage: RemoteStorageWrapper,
+    ) -> Self {
+        ImportRelBlocksTask {
+            shard_identity,
+            key_range,
+            path: path.clone(),
+            storage,
+        }
+    }
+}
+
+impl ImportTask for ImportRelBlocksTask {
+    fn key_range(&self) -> Range<Key> {
+        self.key_range.clone()
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%self.path))]
+    async fn doit(
+        self,
+        layer_writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        debug!("Importing relation file");
+
+        let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?;
+        let (rel_tag_end, end_blk) = self.key_range.end.to_rel_block()?;
+        assert_eq!(rel_tag, rel_tag_end);
+
+        let ranges = (start_blk..end_blk)
+            .enumerate()
+            .filter_map(|(i, blknum)| {
+                let key = rel_block_to_key(rel_tag, blknum);
+                if self.shard_identity.is_key_disposable(&key) {
+                    return None;
+                }
+                let file_offset = i.checked_mul(8192).unwrap();
+                Some((
+                    vec![key],
+                    file_offset,
+                    file_offset.checked_add(8192).unwrap(),
+                ))
+            })
+            .coalesce(|(mut acc, acc_start, acc_end), (mut key, start, end)| {
+                assert_eq!(key.len(), 1);
+                assert!(!acc.is_empty());
+                assert!(acc_end > acc_start);
+                if acc_end == start /* TODO additional max range check here, to limit memory consumption per task to X */ {
+                    acc.push(key.pop().unwrap());
+                    Ok((acc, acc_start, end))
+                } else {
+                    Err(((acc, acc_start, acc_end), (key, start, end)))
+                }
+            });
+
+        let mut nimages = 0;
+        for (keys, range_start, range_end) in ranges {
+            let range_buf = self
+                .storage
+                .get_range(&self.path, range_start.into_u64(), range_end.into_u64())
+                .await?;
+            let mut buf = Bytes::from(range_buf);
+            // TODO: batched writes
+            for key in keys {
+                let image = buf.split_to(8192);
+                layer_writer.put_image(key, image, ctx).await?;
+                nimages += 1;
+            }
+        }
+
+        Ok(nimages)
+    }
+}
+
+struct ImportSlruBlocksTask {
+    shard_identity: ShardIdentity,
+    key_range: Range<Key>,
+    path: RemotePath,
+    storage: RemoteStorageWrapper,
+}
+
+impl ImportSlruBlocksTask {
+    fn new(
+        shard_identity: ShardIdentity,
+        key_range: Range<Key>,
+        path: &RemotePath,
+        storage: RemoteStorageWrapper,
+    ) -> Self {
+        ImportSlruBlocksTask {
+            shard_identity,
+            key_range,
+            path: path.clone(),
+            storage,
+        }
+    }
+}
+
+impl ImportTask for ImportSlruBlocksTask {
+    fn key_range(&self) -> Range<Key> {
+        self.key_range.clone()
+    }
+
+    async fn doit(
+        self,
+        layer_writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        debug!("Importing SLRU segment file {}", self.path);
+        let buf = self.storage.get(&self.path).await?;
+
+        let (kind, segno, start_blk) = self.key_range.start.to_slru_block()?;
+        let (_kind, _segno, end_blk) = self.key_range.end.to_slru_block()?;
+        let mut blknum = start_blk;
+        let mut nimages = 0;
+        let mut file_offset = 0;
+        while blknum < end_blk {
+            let key = slru_block_to_key(kind, segno, blknum);
+            assert!(
+                !self.shard_identity.is_key_disposable(&key),
+                "SLRU keys need to go into every shard"
+            );
+            let buf = &buf[file_offset..(file_offset + 8192)];
+            file_offset += 8192;
+            layer_writer
+                .put_image(key, Bytes::copy_from_slice(buf), ctx)
+                .await?;
+            blknum += 1;
+            nimages += 1;
+        }
+        Ok(nimages)
+    }
+}
+
+enum AnyImportTask {
+    SingleKey(ImportSingleKeyTask),
+    RelBlocks(ImportRelBlocksTask),
+    SlruBlocks(ImportSlruBlocksTask),
+}
+
+impl ImportTask for AnyImportTask {
+    fn key_range(&self) -> Range<Key> {
+        match self {
+            Self::SingleKey(t) => t.key_range(),
+            Self::RelBlocks(t) => t.key_range(),
+            Self::SlruBlocks(t) => t.key_range(),
+        }
+    }
+    /// returns the number of images put into the `layer_writer`
+    async fn doit(
+        self,
+        layer_writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        match self {
+            Self::SingleKey(t) => t.doit(layer_writer, ctx).await,
+            Self::RelBlocks(t) => t.doit(layer_writer, ctx).await,
+            Self::SlruBlocks(t) => t.doit(layer_writer, ctx).await,
+        }
+    }
+}
+
+impl From<ImportSingleKeyTask> for AnyImportTask {
+    fn from(t: ImportSingleKeyTask) -> Self {
+        Self::SingleKey(t)
+    }
+}
+
+impl From<ImportRelBlocksTask> for AnyImportTask {
+    fn from(t: ImportRelBlocksTask) -> Self {
+        Self::RelBlocks(t)
+    }
+}
+
+impl From<ImportSlruBlocksTask> for AnyImportTask {
+    fn from(t: ImportSlruBlocksTask) -> Self {
+        Self::SlruBlocks(t)
+    }
+}
+
+struct ChunkProcessingJob {
+    timeline: Arc<Timeline>,
+    range: Range<Key>,
+    tasks: Vec<AnyImportTask>,
+
+    pgdata_lsn: Lsn,
+}
+
+impl ChunkProcessingJob {
+    fn new(range: Range<Key>, tasks: Vec<AnyImportTask>, env: &Flow) -> Self {
+        assert!(env.pgdata_lsn.is_valid());
+        Self {
+            timeline: env.timeline.clone(),
+            range,
+            tasks,
+            pgdata_lsn: env.pgdata_lsn,
+        }
+    }
+
+    async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> {
+        let mut writer = ImageLayerWriter::new(
+            self.timeline.conf,
+            self.timeline.timeline_id,
+            self.timeline.tenant_shard_id,
+            &self.range,
+            self.pgdata_lsn,
+            ctx,
+        )
+        .await?;
+
+        let mut nimages = 0;
+        for task in self.tasks {
+            nimages += task.doit(&mut writer, ctx).await?;
+        }
+
+        let resident_layer = if nimages > 0 {
+            let (desc, path) = writer.finish(ctx).await?;
+            Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?
+        } else {
+            // dropping the writer cleans up
+            return Ok(());
+        };
+
+        // this is sharing the same code as create_image_layers
+        let mut guard = self.timeline.layers.write().await;
+        guard
+            .open_mut()?
+            .track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics);
+        crate::tenant::timeline::drop_wlock(guard);
+
+        // Schedule the layer for upload but don't add barriers such as
+        // wait for completion or index upload, so we don't inhibit upload parallelism.
+        // TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?)
+        // TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level.
+        self.timeline
+            .remote_client
+            .schedule_layer_file_upload(resident_layer)?;
+
+        Ok(())
+    }
+}
diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
new file mode 100644
index 0000000000..8d5ab1780f
--- /dev/null
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
@@ -0,0 +1,315 @@
+use std::{ops::Bound, sync::Arc};
+
+use anyhow::Context;
+use bytes::Bytes;
+use postgres_ffi::ControlFileData;
+use remote_storage::{
+    Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath,
+};
+use serde::de::DeserializeOwned;
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, info, instrument};
+use utils::lsn::Lsn;
+
+use crate::{assert_u64_eq_usize::U64IsUsize, config::PageServerConf};
+
+use super::{importbucket_format, index_part_format};
+
+pub async fn new(
+    conf: &'static PageServerConf,
+    location: &index_part_format::Location,
+    cancel: CancellationToken,
+) -> Result<RemoteStorageWrapper, anyhow::Error> {
+    // FIXME: we probably want some timeout, and we might be able to assume the max file
+    // size on S3 is 1GiB (postgres segment size). But the problem is that the individual
+    // downloaders don't know enough about concurrent downloads to make a guess on the
+    // expected bandwidth and resulting best timeout.
+    let timeout = std::time::Duration::from_secs(24 * 60 * 60);
+    let location_storage = match location {
+        #[cfg(feature = "testing")]
+        index_part_format::Location::LocalFs { path } => {
+            GenericRemoteStorage::LocalFs(remote_storage::LocalFs::new(path.clone(), timeout)?)
+        }
+        index_part_format::Location::AwsS3 {
+            region,
+            bucket,
+            key,
+        } => {
+            // TODO: think about security implications of letting the client specify the bucket & prefix.
+            // It's the most flexible right now, but, possibly we want to move bucket name into PS conf
+            // and force the timeline_id into the prefix?
+            GenericRemoteStorage::AwsS3(Arc::new(
+                remote_storage::S3Bucket::new(
+                    &remote_storage::S3Config {
+                        bucket_name: bucket.clone(),
+                        prefix_in_bucket: Some(key.clone()),
+                        bucket_region: region.clone(),
+                        endpoint: conf
+                            .import_pgdata_aws_endpoint_url
+                            .clone()
+                            .map(|url| url.to_string()), //  by specifying None here, remote_storage/aws-sdk-rust will infer from env
+                        concurrency_limit: 100.try_into().unwrap(), // TODO: think about this
+                        max_keys_per_list_response: Some(1000),     // TODO: think about this
+                        upload_storage_class: None,                 // irrelevant
+                    },
+                    timeout,
+                )
+                .await
+                .context("setup s3 bucket")?,
+            ))
+        }
+    };
+    let storage_wrapper = RemoteStorageWrapper::new(location_storage, cancel);
+    Ok(storage_wrapper)
+}
+
+/// Wrap [`remote_storage`] APIs to make it look a bit more like a filesystem API
+/// such as [`tokio::fs`], which was used in the original implementation of the import code.
+#[derive(Clone)]
+pub struct RemoteStorageWrapper {
+    storage: GenericRemoteStorage,
+    cancel: CancellationToken,
+}
+
+impl RemoteStorageWrapper {
+    pub fn new(storage: GenericRemoteStorage, cancel: CancellationToken) -> Self {
+        Self { storage, cancel }
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
+    pub async fn listfilesindir(
+        &self,
+        path: &RemotePath,
+    ) -> Result<Vec<(RemotePath, usize)>, DownloadError> {
+        assert!(
+            path.object_name().is_some(),
+            "must specify dirname, without trailing slash"
+        );
+        let path = path.add_trailing_slash();
+
+        let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
+            || async {
+                let Listing { keys, prefixes: _ } = self
+                    .storage
+                    .list(
+                        Some(&path),
+                        remote_storage::ListingMode::WithDelimiter,
+                        None,
+                        &self.cancel,
+                    )
+                    .await?;
+                let res = keys
+                    .into_iter()
+                    .map(|ListingObject { key, size, .. }| (key, size.into_usize()))
+                    .collect();
+                Ok(res)
+            },
+            &format!("listfilesindir {path:?}"),
+            &self.cancel,
+        )
+        .await;
+        debug!(?res, "returning");
+        res
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
+    pub async fn listdir(&self, path: &RemotePath) -> Result<Vec<RemotePath>, DownloadError> {
+        assert!(
+            path.object_name().is_some(),
+            "must specify dirname, without trailing slash"
+        );
+        let path = path.add_trailing_slash();
+
+        let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
+            || async {
+                let Listing { keys, prefixes } = self
+                    .storage
+                    .list(
+                        Some(&path),
+                        remote_storage::ListingMode::WithDelimiter,
+                        None,
+                        &self.cancel,
+                    )
+                    .await?;
+                let res = keys
+                    .into_iter()
+                    .map(|ListingObject { key, .. }| key)
+                    .chain(prefixes.into_iter())
+                    .collect();
+                Ok(res)
+            },
+            &format!("listdir {path:?}"),
+            &self.cancel,
+        )
+        .await;
+        debug!(?res, "returning");
+        res
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
+    pub async fn get(&self, path: &RemotePath) -> Result<Bytes, DownloadError> {
+        let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
+            || async {
+                let Download {
+                    download_stream, ..
+                } = self
+                    .storage
+                    .download(path, &DownloadOpts::default(), &self.cancel)
+                    .await?;
+                let mut reader = tokio_util::io::StreamReader::new(download_stream);
+
+                // XXX optimize this, can we get the capacity hint from somewhere?
+                let mut buf = Vec::new();
+                tokio::io::copy_buf(&mut reader, &mut buf).await?;
+                Ok(Bytes::from(buf))
+            },
+            &format!("download {path:?}"),
+            &self.cancel,
+        )
+        .await;
+        debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done");
+        res
+    }
+
+    pub async fn get_spec(&self) -> Result<Option<importbucket_format::Spec>, anyhow::Error> {
+        self.get_json(&RemotePath::from_string("spec.json").unwrap())
+            .await
+            .context("get spec")
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
+    pub async fn get_json<T: DeserializeOwned>(
+        &self,
+        path: &RemotePath,
+    ) -> Result<Option<T>, DownloadError> {
+        let buf = match self.get(path).await {
+            Ok(buf) => buf,
+            Err(DownloadError::NotFound) => return Ok(None),
+            Err(err) => return Err(err),
+        };
+        let res = serde_json::from_slice(&buf)
+            .context("serialize")
+            // TODO: own error type
+            .map_err(DownloadError::Other)?;
+        Ok(Some(res))
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
+    pub async fn put_json<T>(&self, path: &RemotePath, value: &T) -> anyhow::Result<()>
+    where
+        T: serde::Serialize,
+    {
+        let buf = serde_json::to_vec(value)?;
+        let bytes = Bytes::from(buf);
+        utils::backoff::retry(
+            || async {
+                let size = bytes.len();
+                let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
+                self.storage
+                    .upload_storage_object(bytes, size, path, &self.cancel)
+                    .await
+            },
+            remote_storage::TimeoutOrCancel::caused_by_cancel,
+            1,
+            u32::MAX,
+            &format!("put json {path}"),
+            &self.cancel,
+        )
+        .await
+        .expect("practically infinite retries")
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
+    pub async fn get_range(
+        &self,
+        path: &RemotePath,
+        start_inclusive: u64,
+        end_exclusive: u64,
+    ) -> Result<Vec<u8>, DownloadError> {
+        let len = end_exclusive
+            .checked_sub(start_inclusive)
+            .unwrap()
+            .into_usize();
+        let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
+            || async {
+                let Download {
+                    download_stream, ..
+                } = self
+                    .storage
+                    .download(
+                        path,
+                        &DownloadOpts {
+                            etag: None,
+                            byte_start: Bound::Included(start_inclusive),
+                            byte_end: Bound::Excluded(end_exclusive)
+                        },
+                        &self.cancel)
+                    .await?;
+                let mut reader = tokio_util::io::StreamReader::new(download_stream);
+
+                let mut buf = Vec::with_capacity(len);
+                tokio::io::copy_buf(&mut reader, &mut buf).await?;
+                Ok(buf)
+            },
+            &format!("download range len=0x{len:x} [0x{start_inclusive:x},0x{end_exclusive:x}) from {path:?}"),
+            &self.cancel,
+        )
+        .await;
+        debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done");
+        res
+    }
+
+    pub fn pgdata(&self) -> RemotePath {
+        RemotePath::from_string("pgdata").unwrap()
+    }
+
+    pub async fn get_control_file(&self) -> Result<ControlFile, anyhow::Error> {
+        let control_file_path = self.pgdata().join("global/pg_control");
+        info!("get control file from {control_file_path}");
+        let control_file_buf = self.get(&control_file_path).await?;
+        ControlFile::new(control_file_buf)
+    }
+}
+
+pub struct ControlFile {
+    control_file_data: ControlFileData,
+    control_file_buf: Bytes,
+}
+
+impl ControlFile {
+    pub(crate) fn new(control_file_buf: Bytes) -> Result<Self, anyhow::Error> {
+        // XXX ControlFileData is version-specific, we're always using v14 here. v17 had changes.
+        let control_file_data = ControlFileData::decode(&control_file_buf)?;
+        let control_file = ControlFile {
+            control_file_data,
+            control_file_buf,
+        };
+        control_file.try_pg_version()?; // so that we can offer infallible pg_version()
+        Ok(control_file)
+    }
+    pub(crate) fn base_lsn(&self) -> Lsn {
+        Lsn(self.control_file_data.checkPoint).align()
+    }
+    pub(crate) fn pg_version(&self) -> u32 {
+        self.try_pg_version()
+            .expect("prepare() checks that try_pg_version doesn't error")
+    }
+    pub(crate) fn control_file_data(&self) -> &ControlFileData {
+        &self.control_file_data
+    }
+    pub(crate) fn control_file_buf(&self) -> &Bytes {
+        &self.control_file_buf
+    }
+    fn try_pg_version(&self) -> anyhow::Result<u32> {
+        Ok(match self.control_file_data.catalog_version_no {
+            // thesea are from catversion.h
+            202107181 => 14,
+            202209061 => 15,
+            202307071 => 16,
+            /* XXX pg17 */
+            catversion => {
+                anyhow::bail!("unrecognized catalog version {catversion}")
+            }
+        })
+    }
+}
diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs
new file mode 100644
index 0000000000..04ba3c6f1f
--- /dev/null
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs
@@ -0,0 +1,20 @@
+use serde::{Deserialize, Serialize};
+
+#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
+pub struct PgdataStatus {
+    pub done: bool,
+    // TODO: remaining fields
+}
+
+#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
+pub struct ShardStatus {
+    pub done: bool,
+    // TODO: remaining fields
+}
+
+// TODO: dedupe with fast_import code
+#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
+pub struct Spec {
+    pub project_id: String,
+    pub branch_id: String,
+}
diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs
new file mode 100644
index 0000000000..310d97a6a9
--- /dev/null
+++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs
@@ -0,0 +1,68 @@
+use serde::{Deserialize, Serialize};
+
+#[cfg(feature = "testing")]
+use camino::Utf8PathBuf;
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub enum Root {
+    V1(V1),
+}
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub enum V1 {
+    InProgress(InProgress),
+    Done(Done),
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+#[serde(transparent)]
+pub struct IdempotencyKey(String);
+
+impl IdempotencyKey {
+    pub fn new(s: String) -> Self {
+        Self(s)
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub struct InProgress {
+    pub idempotency_key: IdempotencyKey,
+    pub location: Location,
+    pub started_at: chrono::NaiveDateTime,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub struct Done {
+    pub idempotency_key: IdempotencyKey,
+    pub started_at: chrono::NaiveDateTime,
+    pub finished_at: chrono::NaiveDateTime,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub enum Location {
+    #[cfg(feature = "testing")]
+    LocalFs { path: Utf8PathBuf },
+    AwsS3 {
+        region: String,
+        bucket: String,
+        key: String,
+    },
+}
+
+impl Root {
+    pub fn is_done(&self) -> bool {
+        match self {
+            Root::V1(v1) => match v1 {
+                V1::Done(_) => true,
+                V1::InProgress(_) => false,
+            },
+        }
+    }
+    pub fn idempotency_key(&self) -> &IdempotencyKey {
+        match self {
+            Root::V1(v1) => match v1 {
+                V1::InProgress(in_progress) => &in_progress.idempotency_key,
+                V1::Done(done) => &done.idempotency_key,
+            },
+        }
+    }
+}
diff --git a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
new file mode 100644
index 0000000000..c5210f9a30
--- /dev/null
+++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
@@ -0,0 +1,119 @@
+//! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate.
+use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt};
+use serde::{Deserialize, Serialize};
+use tokio_util::sync::CancellationToken;
+use tracing::error;
+
+use crate::config::PageServerConf;
+use reqwest::Method;
+
+use super::importbucket_format::Spec;
+
+pub struct Client {
+    base_url: String,
+    authorization_header: Option<String>,
+    client: reqwest::Client,
+    cancel: CancellationToken,
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+#[derive(Serialize, Deserialize, Debug)]
+struct ImportProgressRequest {
+    // no fields yet, not sure if there every will be any
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+struct ImportProgressResponse {
+    // we don't care
+}
+
+impl Client {
+    pub fn new(conf: &PageServerConf, cancel: CancellationToken) -> anyhow::Result<Self> {
+        let Some(ref base_url) = conf.import_pgdata_upcall_api else {
+            anyhow::bail!("import_pgdata_upcall_api is not configured")
+        };
+        Ok(Self {
+            base_url: base_url.to_string(),
+            client: reqwest::Client::new(),
+            cancel,
+            authorization_header: conf
+                .import_pgdata_upcall_api_token
+                .as_ref()
+                .map(|secret_string| secret_string.get_contents())
+                .map(|jwt| format!("Bearer {jwt}")),
+        })
+    }
+
+    fn start_request<U: reqwest::IntoUrl>(
+        &self,
+        method: Method,
+        uri: U,
+    ) -> reqwest::RequestBuilder {
+        let req = self.client.request(method, uri);
+        if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        }
+    }
+
+    async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
+        &self,
+        method: Method,
+        uri: U,
+        body: B,
+    ) -> Result<reqwest::Response> {
+        self.start_request(method, uri)
+            .json(&body)
+            .send()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
+        &self,
+        method: Method,
+        uri: U,
+        body: B,
+    ) -> Result<reqwest::Response> {
+        let res = self.request_noerror(method, uri, body).await?;
+        let response = res.error_from_body().await?;
+        Ok(response)
+    }
+
+    pub async fn send_progress_once(&self, spec: &Spec) -> Result<()> {
+        let url = format!(
+            "{}/projects/{}/branches/{}/import_progress",
+            self.base_url, spec.project_id, spec.branch_id
+        );
+        let ImportProgressResponse {} = self
+            .request(Method::POST, url, &ImportProgressRequest {})
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)?;
+        Ok(())
+    }
+
+    pub async fn send_progress_until_success(&self, spec: &Spec) -> anyhow::Result<()> {
+        loop {
+            match self.send_progress_once(spec).await {
+                Ok(()) => return Ok(()),
+                Err(Error::Cancelled) => return Err(anyhow::anyhow!("cancelled")),
+                Err(err) => {
+                    error!(?err, "error sending progress, retrying");
+                    if tokio::time::timeout(
+                        std::time::Duration::from_secs(10),
+                        self.cancel.cancelled(),
+                    )
+                    .await
+                    .is_ok()
+                    {
+                        anyhow::bail!("cancelled while sending early progress update");
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs
index a93bdde3f8..80a09b4840 100644
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -3,7 +3,7 @@ use std::{collections::hash_map::Entry, fs, sync::Arc};
 use anyhow::Context;
 use camino::Utf8PathBuf;
 use tracing::{error, info, info_span};
-use utils::{fs_ext, id::TimelineId, lsn::Lsn};
+use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard};
 
 use crate::{
     context::RequestContext,
@@ -23,14 +23,14 @@ use super::Timeline;
 pub struct UninitializedTimeline<'t> {
     pub(crate) owning_tenant: &'t Tenant,
     timeline_id: TimelineId,
-    raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
+    raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
 }
 
 impl<'t> UninitializedTimeline<'t> {
     pub(crate) fn new(
         owning_tenant: &'t Tenant,
         timeline_id: TimelineId,
-        raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
+        raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
     ) -> Self {
         Self {
             owning_tenant,
@@ -87,6 +87,10 @@ impl<'t> UninitializedTimeline<'t> {
         }
     }
 
+    pub(crate) fn finish_creation_myself(&mut self) -> (Arc<Timeline>, TimelineCreateGuard) {
+        self.raw_timeline.take().expect("already checked")
+    }
+
     /// Prepares timeline data by loading it from the basebackup archive.
     pub(crate) async fn import_basebackup_from_tar(
         self,
@@ -167,9 +171,10 @@ pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) {
 /// A guard for timeline creations in process: as long as this object exists, the timeline ID
 /// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline.
 #[must_use]
-pub(crate) struct TimelineCreateGuard<'t> {
-    owning_tenant: &'t Tenant,
-    timeline_id: TimelineId,
+pub(crate) struct TimelineCreateGuard {
+    pub(crate) _tenant_gate_guard: GateGuard,
+    pub(crate) owning_tenant: Arc<Tenant>,
+    pub(crate) timeline_id: TimelineId,
     pub(crate) timeline_path: Utf8PathBuf,
     pub(crate) idempotency: CreateTimelineIdempotency,
 }
@@ -184,20 +189,27 @@ pub(crate) enum TimelineExclusionError {
     },
     #[error("Already creating")]
     AlreadyCreating,
+    #[error("Shutting down")]
+    ShuttingDown,
 
     // e.g. I/O errors, or some failure deep in postgres initdb
     #[error(transparent)]
     Other(#[from] anyhow::Error),
 }
 
-impl<'t> TimelineCreateGuard<'t> {
+impl TimelineCreateGuard {
     pub(crate) fn new(
-        owning_tenant: &'t Tenant,
+        owning_tenant: &Arc<Tenant>,
         timeline_id: TimelineId,
         timeline_path: Utf8PathBuf,
         idempotency: CreateTimelineIdempotency,
         allow_offloaded: bool,
     ) -> Result<Self, TimelineExclusionError> {
+        let _tenant_gate_guard = owning_tenant
+            .gate
+            .enter()
+            .map_err(|_| TimelineExclusionError::ShuttingDown)?;
+
         // Lock order: this is the only place we take both locks.  During drop() we only
         // lock creating_timelines
         let timelines = owning_tenant.timelines.lock().unwrap();
@@ -225,8 +237,12 @@ impl<'t> TimelineCreateGuard<'t> {
             return Err(TimelineExclusionError::AlreadyCreating);
         }
         creating_timelines.insert(timeline_id);
+        drop(creating_timelines);
+        drop(timelines_offloaded);
+        drop(timelines);
         Ok(Self {
-            owning_tenant,
+            _tenant_gate_guard,
+            owning_tenant: Arc::clone(owning_tenant),
             timeline_id,
             timeline_path,
             idempotency,
@@ -234,7 +250,7 @@ impl<'t> TimelineCreateGuard<'t> {
     }
 }
 
-impl Drop for TimelineCreateGuard<'_> {
+impl Drop for TimelineCreateGuard {
     fn drop(&mut self) {
         self.owning_tenant
             .timelines_creating
diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py
index c73d5411fa..6c22b31e00 100644
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -190,6 +190,25 @@ class TenantTimelineId:
         )
 
 
+@dataclass
+class ShardIndex:
+    shard_number: int
+    shard_count: int
+
+    # cf impl Display for ShardIndex
+    @override
+    def __str__(self) -> str:
+        return f"{self.shard_number:02x}{self.shard_count:02x}"
+
+    @classmethod
+    def parse(cls: type[ShardIndex], input: str) -> ShardIndex:
+        assert len(input) == 4
+        return cls(
+            shard_number=int(input[0:2], 16),
+            shard_count=int(input[2:4], 16),
+        )
+
+
 class TenantShardId:
     def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int):
         self.tenant_id = tenant_id
@@ -222,6 +241,10 @@ class TenantShardId:
             # Unsharded case: equivalent of Rust TenantShardId::unsharded(tenant_id)
             return str(self.tenant_id)
 
+    @property
+    def shard_index(self) -> ShardIndex:
+        return ShardIndex(self.shard_number, self.shard_count)
+
     @override
     def __repr__(self):
         return self.__str__()
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d8d2b87b4e..78e2422171 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1883,6 +1883,20 @@ class NeonStorageController(MetricsGetter, LogUtils):
         response.raise_for_status()
         log.info(f"tenant_create success: {response.json()}")
 
+    def timeline_create(
+        self,
+        tenant_id: TenantId,
+        body: dict[str, Any],
+    ):
+        response = self.request(
+            "POST",
+            f"{self.api}/v1/tenant/{tenant_id}/timeline",
+            json=body,
+            headers=self.headers(TokenScope.PAGE_SERVER_API),
+        )
+        response.raise_for_status()
+        log.info(f"timeline_create success: {response.json()}")
+
     def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
         """
         :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 56386fdd37..4cf3ece396 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -1,5 +1,9 @@
 from __future__ import annotations
 
+import dataclasses
+import json
+import random
+import string
 import time
 from collections import defaultdict
 from dataclasses import dataclass
@@ -10,7 +14,14 @@ import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
-from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId
+from fixtures.common_types import (
+    Id,
+    Lsn,
+    TenantId,
+    TenantShardId,
+    TimelineArchivalState,
+    TimelineId,
+)
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pg_version import PgVersion
@@ -24,6 +35,69 @@ class PageserverApiException(Exception):
         self.status_code = status_code
 
 
+@dataclass
+class ImportPgdataIdemptencyKey:
+    key: str
+
+    @staticmethod
+    def random() -> ImportPgdataIdemptencyKey:
+        return ImportPgdataIdemptencyKey(
+            "".join(random.choices(string.ascii_letters + string.digits, k=20))
+        )
+
+
+@dataclass
+class LocalFs:
+    path: str
+
+
+@dataclass
+class AwsS3:
+    region: str
+    bucket: str
+    key: str
+
+
+@dataclass
+class ImportPgdataLocation:
+    LocalFs: None | LocalFs = None
+    AwsS3: None | AwsS3 = None
+
+
+@dataclass
+class TimelineCreateRequestModeImportPgdata:
+    location: ImportPgdataLocation
+    idempotency_key: ImportPgdataIdemptencyKey
+
+
+@dataclass
+class TimelineCreateRequestMode:
+    Branch: None | dict[str, Any] = None
+    Bootstrap: None | dict[str, Any] = None
+    ImportPgdata: None | TimelineCreateRequestModeImportPgdata = None
+
+
+@dataclass
+class TimelineCreateRequest:
+    new_timeline_id: TimelineId
+    mode: TimelineCreateRequestMode
+
+    def to_json(self) -> str:
+        class EnhancedJSONEncoder(json.JSONEncoder):
+            def default(self, o):
+                if dataclasses.is_dataclass(o) and not isinstance(o, type):
+                    return dataclasses.asdict(o)
+                elif isinstance(o, Id):
+                    return o.id.hex()
+                return super().default(o)
+
+        # mode is flattened
+        this = dataclasses.asdict(self)
+        mode = this.pop("mode")
+        this.update(mode)
+        return json.dumps(self, cls=EnhancedJSONEncoder)
+
+
 class TimelineCreate406(PageserverApiException):
     def __init__(self, res: requests.Response):
         assert res.status_code == 406
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 010801be6c..30720e648d 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -674,6 +674,13 @@ def run_only_on_default_postgres(reason: str):
     )
 
 
+def run_only_on_postgres(versions: Iterable[PgVersion], reason: str):
+    return pytest.mark.skipif(
+        PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) not in versions,
+        reason=reason,
+    )
+
+
 def skip_in_debug_build(reason: str):
     return pytest.mark.skipif(
         os.getenv("BUILD_TYPE", "debug") == "debug",
diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py
new file mode 100644
index 0000000000..29229b73c1
--- /dev/null
+++ b/test_runner/regress/test_import_pgdata.py
@@ -0,0 +1,307 @@
+import json
+import re
+import time
+from enum import Enum
+
+import psycopg2
+import psycopg2.errors
+import pytest
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder, VanillaPostgres
+from fixtures.pageserver.http import (
+    ImportPgdataIdemptencyKey,
+    PageserverApiException,
+)
+from fixtures.pg_version import PgVersion
+from fixtures.remote_storage import RemoteStorageKind
+from fixtures.utils import run_only_on_postgres
+from pytest_httpserver import HTTPServer
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
+
+num_rows = 1000
+
+
+class RelBlockSize(Enum):
+    ONE_STRIPE_SIZE = 1
+    TWO_STRPES_PER_SHARD = 2
+    MULTIPLE_RELATION_SEGMENTS = 3
+
+
+smoke_params = [
+    # unsharded (the stripe size needs to be given for rel block size calculations)
+    *[(None, 1024, s) for s in RelBlockSize],
+    # many shards, small stripe size to speed up test
+    *[(8, 1024, s) for s in RelBlockSize],
+]
+
+
+@run_only_on_postgres(
+    [PgVersion.V14, PgVersion.V15, PgVersion.V16],
+    "newer control file catalog version and struct format isn't supported",
+)
+@pytest.mark.parametrize("shard_count,stripe_size,rel_block_size", smoke_params)
+def test_pgdata_import_smoke(
+    vanilla_pg: VanillaPostgres,
+    neon_env_builder: NeonEnvBuilder,
+    shard_count: int | None,
+    stripe_size: int,
+    rel_block_size: RelBlockSize,
+    make_httpserver: HTTPServer,
+):
+    #
+    # Setup fake control plane for import progress
+    #
+    def handler(request: Request) -> Response:
+        log.info(f"control plane request: {request.json}")
+        return Response(json.dumps({}), status=200)
+
+    cplane_mgmt_api_server = make_httpserver
+    cplane_mgmt_api_server.expect_request(re.compile(".*")).respond_with_handler(handler)
+
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+    env = neon_env_builder.init_start()
+
+    env.pageserver.patch_config_toml_nonrecursive(
+        {
+            "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api"
+        }
+    )
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    #
+    # Put data in vanilla pg
+    #
+
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
+
+    log.info("create relblock data")
+    if rel_block_size == RelBlockSize.ONE_STRIPE_SIZE:
+        target_relblock_size = stripe_size * 8192
+    elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD:
+        target_relblock_size = (shard_count or 1) * stripe_size * 8192 * 2
+    elif rel_block_size == RelBlockSize.MULTIPLE_RELATION_SEGMENTS:
+        target_relblock_size = int(((2.333 * 1024 * 1024 * 1024) // 8192) * 8192)
+    else:
+        raise ValueError
+
+    # fillfactor so we don't need to produce that much data
+    # 900 byte per row is > 10% => 1 row per page
+    vanilla_pg.safe_psql("""create table t (data char(900)) with (fillfactor = 10)""")
+
+    nrows = 0
+    while True:
+        relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')")
+        log.info(
+            f"relblock size: {relblock_size/8192} pages (target: {target_relblock_size//8192}) pages"
+        )
+        if relblock_size >= target_relblock_size:
+            break
+        addrows = int((target_relblock_size - relblock_size) // 8192)
+        assert addrows >= 1, "forward progress"
+        vanilla_pg.safe_psql(f"insert into t select generate_series({nrows+1}, {nrows + addrows})")
+        nrows += addrows
+    expect_nrows = nrows
+    expect_sum = (
+        (nrows) * (nrows + 1) // 2
+    )  # https://stackoverflow.com/questions/43901484/sum-of-the-integers-from-1-to-n
+
+    def validate_vanilla_equivalence(ep):
+        # TODO: would be nicer to just compare pgdump
+        assert ep.safe_psql("select count(*), sum(data::bigint)::bigint from t") == [
+            (expect_nrows, expect_sum)
+        ]
+
+    validate_vanilla_equivalence(vanilla_pg)
+
+    vanilla_pg.stop()
+
+    #
+    # We have a Postgres data directory now.
+    # Make a localfs remote storage that looks like how after `fast_import` ran.
+    # TODO: actually exercise fast_import here
+    # TODO: test s3 remote storage
+    #
+    importbucket = neon_env_builder.repo_dir / "importbucket"
+    importbucket.mkdir()
+    # what cplane writes before scheduling fast_import
+    specpath = importbucket / "spec.json"
+    specpath.write_text(json.dumps({"branch_id": "somebranch", "project_id": "someproject"}))
+    # what fast_import writes
+    vanilla_pg.pgdatadir.rename(importbucket / "pgdata")
+    statusdir = importbucket / "status"
+    statusdir.mkdir()
+    (statusdir / "pgdata").write_text(json.dumps({"done": True}))
+
+    #
+    # Do the import
+    #
+
+    tenant_id = TenantId.generate()
+    env.storage_controller.tenant_create(
+        tenant_id, shard_count=shard_count, shard_stripe_size=stripe_size
+    )
+
+    timeline_id = TimelineId.generate()
+    log.info("starting import")
+    start = time.monotonic()
+
+    idempotency = ImportPgdataIdemptencyKey.random()
+    log.info(f"idempotency key {idempotency}")
+    # TODO: teach neon_local CLI about the idempotency & 429 error so we can run inside the loop
+    # and check for 429
+
+    import_branch_name = "imported"
+    env.storage_controller.timeline_create(
+        tenant_id,
+        {
+            "new_timeline_id": str(timeline_id),
+            "import_pgdata": {
+                "idempotency_key": str(idempotency),
+                "location": {"LocalFs": {"path": str(importbucket.absolute())}},
+            },
+        },
+    )
+    env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id)
+
+    while True:
+        locations = env.storage_controller.locate(tenant_id)
+        active_count = 0
+        for location in locations:
+            shard_id = TenantShardId.parse(location["shard_id"])
+            ps = env.get_pageserver(location["node_id"])
+            try:
+                detail = ps.http_client().timeline_detail(shard_id, timeline_id)
+                state = detail["state"]
+                log.info(f"shard {shard_id} state: {state}")
+                if state == "Active":
+                    active_count += 1
+            except PageserverApiException as e:
+                if e.status_code == 404:
+                    log.info("not found, import is in progress")
+                    continue
+                elif e.status_code == 429:
+                    log.info("import is in progress")
+                    continue
+                else:
+                    raise
+
+            shard_status_file = statusdir / f"shard-{shard_id.shard_index}"
+            if state == "Active":
+                shard_status_file_contents = (
+                    shard_status_file.read_text()
+                )  # Active state implies import is done
+                shard_status = json.loads(shard_status_file_contents)
+                assert shard_status["done"] is True
+
+        if active_count == len(locations):
+            log.info("all shards are active")
+            break
+        time.sleep(1)
+
+    import_duration = time.monotonic() - start
+    log.info(f"import complete; duration={import_duration:.2f}s")
+
+    #
+    # Get some timeline details for later.
+    #
+    locations = env.storage_controller.locate(tenant_id)
+    [shard_zero] = [
+        loc for loc in locations if TenantShardId.parse(loc["shard_id"]).shard_number == 0
+    ]
+    shard_zero_ps = env.get_pageserver(shard_zero["node_id"])
+    shard_zero_http = shard_zero_ps.http_client()
+    shard_zero_timeline_info = shard_zero_http.timeline_detail(shard_zero["shard_id"], timeline_id)
+    initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"])
+    latest_gc_cutoff_lsn = Lsn(shard_zero_timeline_info["latest_gc_cutoff_lsn"])
+    last_record_lsn = Lsn(shard_zero_timeline_info["last_record_lsn"])
+    disk_consistent_lsn = Lsn(shard_zero_timeline_info["disk_consistent_lsn"])
+    _remote_consistent_lsn = Lsn(shard_zero_timeline_info["remote_consistent_lsn"])
+    remote_consistent_lsn_visible = Lsn(shard_zero_timeline_info["remote_consistent_lsn_visible"])
+    # assert remote_consistent_lsn_visible == remote_consistent_lsn TODO: this fails initially and after restart, presumably because `UploadQueue::clean.1` is still `None`
+    assert remote_consistent_lsn_visible == disk_consistent_lsn
+    assert initdb_lsn == latest_gc_cutoff_lsn
+    assert disk_consistent_lsn == initdb_lsn + 8
+    assert last_record_lsn == disk_consistent_lsn
+    # TODO: assert these values are the same everywhere
+
+    #
+    # Validate the resulting remote storage state.
+    #
+
+    #
+    # Validate the imported data
+    #
+
+    ro_endpoint = env.endpoints.create_start(
+        branch_name=import_branch_name, endpoint_id="ro", tenant_id=tenant_id, lsn=last_record_lsn
+    )
+
+    validate_vanilla_equivalence(ro_endpoint)
+
+    # ensure the import survives restarts
+    ro_endpoint.stop()
+    env.pageserver.stop(immediate=True)
+    env.pageserver.start()
+    ro_endpoint.start()
+    validate_vanilla_equivalence(ro_endpoint)
+
+    #
+    # validate the layer files in each shard only have the shard-specific data
+    # (the implementation would be functional but not efficient without this characteristic)
+    #
+
+    shards = env.storage_controller.locate(tenant_id)
+    for shard in shards:
+        shard_ps = env.get_pageserver(shard["node_id"])
+        result = shard_ps.timeline_scan_no_disposable_keys(shard["shard_id"], timeline_id)
+        assert result.tally.disposable_count == 0
+        assert (
+            result.tally.not_disposable_count > 0
+        ), "sanity check, each shard should have some data"
+
+    #
+    # validate that we can write
+    #
+    rw_endpoint = env.endpoints.create_start(
+        branch_name=import_branch_name, endpoint_id="rw", tenant_id=tenant_id
+    )
+    rw_endpoint.safe_psql("create table othertable(values text)")
+    rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()"))
+
+    # TODO: consider using `class Workload` here
+    # to do compaction and whatnot?
+
+    #
+    # validate that we can branch (important use case)
+    #
+
+    # ... at the tip
+    _ = env.create_branch(
+        new_branch_name="br-tip",
+        ancestor_branch_name=import_branch_name,
+        tenant_id=tenant_id,
+        ancestor_start_lsn=rw_lsn,
+    )
+    br_tip_endpoint = env.endpoints.create_start(
+        branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id
+    )
+    validate_vanilla_equivalence(br_tip_endpoint)
+    br_tip_endpoint.safe_psql("select * from othertable")
+
+    # ... at the initdb lsn
+    _ = env.create_branch(
+        new_branch_name="br-initdb",
+        ancestor_branch_name=import_branch_name,
+        tenant_id=tenant_id,
+        ancestor_start_lsn=initdb_lsn,
+    )
+    br_initdb_endpoint = env.endpoints.create_start(
+        branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id
+    )
+    validate_vanilla_equivalence(br_initdb_endpoint)
+    with pytest.raises(psycopg2.errors.UndefinedTable):
+        br_initdb_endpoint.safe_psql("select * from othertable")
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 667d54df02..a73d9d6352 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -19,7 +19,8 @@ ahash = { version = "0.8" }
 anyhow = { version = "1", features = ["backtrace"] }
 axum = { version = "0.7", features = ["ws"] }
 axum-core = { version = "0.4", default-features = false, features = ["tracing"] }
-base64 = { version = "0.21", features = ["alloc"] }
+base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] }
+base64-647d43efb71741da = { package = "base64", version = "0.21", features = ["alloc"] }
 base64ct = { version = "1", default-features = false, features = ["std"] }
 bytes = { version = "1", features = ["serde"] }
 camino = { version = "1", default-features = false, features = ["serde1"] }

From 6f7aeaa1c563906fe14a4f62870d9fcd62436e02 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 25 Nov 2024 09:01:05 +0000
Subject: [PATCH 41/51] test_runner: use LFC by default (#8613)

## Problem
LFC is not enabled by default in tests, but it is enabled in production.
This increases the risk of errors in the production environment, which
were not found during the routine workflow.
However, enabling LFC for all the tests may overload the disk on our
servers and increase the number of failures.
So, we try enabling  LFC in one case to evaluate the possible risk.

## Summary of changes
A new environment variable, USE_LFC is introduced. If it is set to true,
LFC is enabled by default in all the tests.
In our workflow, we enable LFC for PG17, release, x86-64, and disabled
for all other combinations.

---------

Co-authored-by: Alexey Masterov <alexeymasterov@neon.tech>
Co-authored-by: a-masterov <72613290+a-masterov@users.noreply.github.com>
---
 .github/workflows/_build-and-test-locally.yml |  9 +-
 .github/workflows/build_and_test.yml          |  9 +-
 .../ingest_regress_test_result-new-format.py  |  4 +
 test_runner/fixtures/neon_fixtures.py         | 82 +++++++++++++++++--
 test_runner/fixtures/parametrize.py           |  1 +
 test_runner/fixtures/utils.py                 | 21 +++++
 test_runner/fixtures/workload.py              |  2 +-
 test_runner/regress/test_combocid.py          | 18 +---
 .../regress/test_explain_with_lfc_stats.py    |  5 +-
 test_runner/regress/test_hot_standby.py       |  2 +-
 test_runner/regress/test_lfc_resize.py        | 15 ++--
 .../test_lfc_working_set_approximation.py     |  9 +-
 test_runner/regress/test_local_file_cache.py  |  6 +-
 .../regress/test_logical_replication.py       | 12 ++-
 test_runner/regress/test_oid_overflow.py      |  2 +-
 test_runner/regress/test_read_validation.py   |  2 +-
 test_runner/regress/test_readonly_node.py     |  2 +-
 .../regress/test_timeline_detach_ancestor.py  |  2 +-
 test_runner/regress/test_vm_bits.py           |  2 +-
 test_runner/regress/test_wal_acceptor.py      |  2 +-
 20 files changed, 158 insertions(+), 49 deletions(-)

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 8e28049888..bdf7c07c6a 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -19,8 +19,8 @@ on:
         description: 'debug or release'
         required: true
         type: string
-      pg-versions:
-        description: 'a json array of postgres versions to run regression tests on'
+      test-cfg:
+        description: 'a json object of postgres versions and lfc states to run regression tests on'
         required: true
         type: string
 
@@ -276,14 +276,14 @@ jobs:
       options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
     strategy:
       fail-fast: false
-      matrix:
-        pg_version: ${{ fromJson(inputs.pg-versions) }}
+      matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }}
     steps:
       - uses: actions/checkout@v4
         with:
           submodules: true
 
       - name: Pytest regression tests
+        continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }}
         uses: ./.github/actions/run-python-test-set
         timeout-minutes: 60
         with:
@@ -300,6 +300,7 @@ jobs:
           CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
           BUILD_TAG: ${{ inputs.build-tag }}
           PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
 
       # Temporary disable this step until we figure out why it's so flaky
       # Ref https://github.com/neondatabase/neon/issues/4540
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 89fd2d0d17..9830c2a0c9 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -253,7 +253,14 @@ jobs:
       build-tag: ${{ needs.tag.outputs.build-tag }}
       build-type: ${{ matrix.build-type }}
       # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
-      pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }}
+      # run without LFC on v17 release only
+      test-cfg: |
+        ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"},
+                                                {"pg_version":"v15", "lfc_state": "without-lfc"},
+                                                {"pg_version":"v16", "lfc_state": "without-lfc"},
+                                                {"pg_version":"v17", "lfc_state": "without-lfc"},
+                                                {"pg_version":"v17", "lfc_state": "with-lfc"}]'
+                                           || '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }}
     secrets: inherit
 
   # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py
index c99cfa2b01..064c516718 100644
--- a/scripts/ingest_regress_test_result-new-format.py
+++ b/scripts/ingest_regress_test_result-new-format.py
@@ -31,6 +31,7 @@ CREATE TABLE IF NOT EXISTS results (
     duration     INT NOT NULL,
     flaky        BOOLEAN NOT NULL,
     arch         arch DEFAULT 'X64',
+    lfc          BOOLEAN DEFAULT false NOT NULL,
     build_type   TEXT NOT NULL,
     pg_version   INT NOT NULL,
     run_id       BIGINT NOT NULL,
@@ -54,6 +55,7 @@ class Row:
     duration: int
     flaky: bool
     arch: str
+    lfc: bool
     build_type: str
     pg_version: int
     run_id: int
@@ -132,6 +134,7 @@ def ingest_test_result(
             if p["name"].startswith("__")
         }
         arch = parameters.get("arch", "UNKNOWN").strip("'")
+        lfc = parameters.get("lfc", "False") == "True"
 
         build_type, pg_version, unparametrized_name = parse_test_name(test["name"])
         labels = {label["name"]: label["value"] for label in test["labels"]}
@@ -145,6 +148,7 @@ def ingest_test_result(
             duration=test["time"]["duration"],
             flaky=test["flaky"] or test["retriesStatusChange"],
             arch=arch,
+            lfc=lfc,
             build_type=build_type,
             pg_version=pg_version,
             run_id=run_id,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 78e2422171..07d442b4a6 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -90,10 +90,12 @@ from fixtures.safekeeper.utils import wait_walreceivers_absent
 from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
     COMPONENT_BINARIES,
+    USE_LFC,
     allure_add_grafana_links,
     assert_no_errors,
     get_dir_size,
     print_gc_result,
+    size_to_bytes,
     subprocess_capture,
     wait_until,
 )
@@ -3742,12 +3744,45 @@ class Endpoint(PgProtocol, LogUtils):
         self.pgdata_dir = self.env.repo_dir / path
         self.logfile = self.endpoint_path() / "compute.log"
 
-        config_lines = config_lines or []
-
         # set small 'max_replication_write_lag' to enable backpressure
         # and make tests more stable.
         config_lines = ["max_replication_write_lag=15MB"] + config_lines
 
+        # Delete file cache if it exists (and we're recreating the endpoint)
+        if USE_LFC:
+            if (lfc_path := Path(self.lfc_path())).exists():
+                lfc_path.unlink()
+            else:
+                lfc_path.parent.mkdir(parents=True, exist_ok=True)
+            for line in config_lines:
+                if (
+                    line.find("neon.max_file_cache_size") > -1
+                    or line.find("neon.file_cache_size_limit") > -1
+                ):
+                    m = re.search(r"=\s*(\S+)", line)
+                    assert m is not None, f"malformed config line {line}"
+                    size = m.group(1)
+                    assert size_to_bytes(size) >= size_to_bytes(
+                        "1MB"
+                    ), "LFC size cannot be set less than 1MB"
+            # shared_buffers = 512kB to make postgres use LFC intensively
+            # neon.max_file_cache_size and neon.file_cache size limit are
+            # set to 1MB because small LFC is better for testing (helps to find more problems)
+            config_lines = [
+                "shared_buffers = 512kB",
+                f"neon.file_cache_path = '{self.lfc_path()}'",
+                "neon.max_file_cache_size = 1MB",
+                "neon.file_cache_size_limit = 1MB",
+            ] + config_lines
+        else:
+            for line in config_lines:
+                assert (
+                    line.find("neon.max_file_cache_size") == -1
+                ), "Setting LFC parameters is not allowed when LFC is disabled"
+                assert (
+                    line.find("neon.file_cache_size_limit") == -1
+                ), "Setting LFC parameters is not allowed when LFC is disabled"
+
         self.config(config_lines)
 
         return self
@@ -3781,6 +3816,9 @@ class Endpoint(PgProtocol, LogUtils):
             basebackup_request_tries=basebackup_request_tries,
         )
         self._running.release(1)
+        self.log_config_value("shared_buffers")
+        self.log_config_value("neon.max_file_cache_size")
+        self.log_config_value("neon.file_cache_size_limit")
 
         return self
 
@@ -3806,6 +3844,10 @@ class Endpoint(PgProtocol, LogUtils):
         """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)"""
         return self.endpoint_path() / "postgresql.conf"
 
+    def lfc_path(self) -> Path:
+        """Path to the lfc file"""
+        return self.endpoint_path() / "file_cache" / "file.cache"
+
     def config(self, lines: list[str]) -> Self:
         """
         Add lines to postgresql.conf.
@@ -3984,16 +4026,46 @@ class Endpoint(PgProtocol, LogUtils):
         assert self.pgdata_dir is not None  # please mypy
         return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024
 
-    def clear_shared_buffers(self, cursor: Any | None = None):
+    def clear_buffers(self, cursor: Any | None = None):
         """
         Best-effort way to clear postgres buffers. Pinned buffers will not be 'cleared.'
-
-        Might also clear LFC.
+        It clears LFC as well by setting neon.file_cache_size_limit to 0 and then returning it to the previous value,
+        if LFC is enabled
         """
         if cursor is not None:
             cursor.execute("select clear_buffer_cache()")
+            if not USE_LFC:
+                return
+            cursor.execute("SHOW neon.file_cache_size_limit")
+            res = cursor.fetchone()
+            assert res, "Cannot get neon.file_cache_size_limit"
+            file_cache_size_limit = res[0]
+            if file_cache_size_limit == 0:
+                return
+            cursor.execute("ALTER SYSTEM SET neon.file_cache_size_limit=0")
+            cursor.execute("SELECT pg_reload_conf()")
+            cursor.execute(f"ALTER SYSTEM SET neon.file_cache_size_limit='{file_cache_size_limit}'")
+            cursor.execute("SELECT pg_reload_conf()")
         else:
             self.safe_psql("select clear_buffer_cache()")
+            if not USE_LFC:
+                return
+            file_cache_size_limit = self.safe_psql_scalar(
+                "SHOW neon.file_cache_size_limit", log_query=False
+            )
+            if file_cache_size_limit == 0:
+                return
+            self.safe_psql("ALTER SYSTEM SET neon.file_cache_size_limit=0")
+            self.safe_psql("SELECT pg_reload_conf()")
+            self.safe_psql(f"ALTER SYSTEM SET neon.file_cache_size_limit='{file_cache_size_limit}'")
+            self.safe_psql("SELECT pg_reload_conf()")
+
+    def log_config_value(self, param):
+        """
+        Writes the config value param to log
+        """
+        res = self.safe_psql_scalar(f"SHOW {param}", log_query=False)
+        log.info("%s = %s", param, res)
 
 
 class EndpointFactory:
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index 2c6adb8a33..f57c0f801f 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -116,5 +116,6 @@ def pytest_runtest_makereport(*args, **kwargs):
     }.get(os.uname().machine, "UNKNOWN")
     arch = os.getenv("RUNNER_ARCH", uname_m)
     allure.dynamic.parameter("__arch", arch)
+    allure.dynamic.parameter("__lfc", os.getenv("USE_LFC") != "false")
 
     yield
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 30720e648d..04e98fe494 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -57,6 +57,10 @@ VERSIONS_COMBINATIONS = (
 )
 # fmt: on
 
+# If the environment variable USE_LFC is set and its value is "false", then LFC is disabled for tests.
+# If it is not set or set to a value not equal to "false", LFC is enabled by default.
+USE_LFC = os.environ.get("USE_LFC") != "false"
+
 
 def subprocess_capture(
     capture_dir: Path,
@@ -653,6 +657,23 @@ def allpairs_versions():
     return {"argnames": "combination", "argvalues": tuple(argvalues), "ids": ids}
 
 
+def size_to_bytes(hr_size: str) -> int:
+    """
+    Gets human-readable size from postgresql.conf (e.g. 512kB, 10MB)
+    returns size in bytes
+    """
+    units = {"B": 1, "kB": 1024, "MB": 1024**2, "GB": 1024**3, "TB": 1024**4, "PB": 1024**5}
+    match = re.search(r"^\'?(\d+)\s*([kMGTP]?B)?\'?$", hr_size)
+    assert match is not None, f'"{hr_size}" is not a well-formatted human-readable size'
+    number, unit = match.groups()
+
+    if unit:
+        amp = units[unit]
+    else:
+        amp = 8192
+    return int(number) * amp
+
+
 def skip_on_postgres(version: PgVersion, reason: str):
     return pytest.mark.skipif(
         PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version,
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index 4c6b2b6b3e..1b8c9fef44 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -193,7 +193,7 @@ class Workload:
 
     def validate(self, pageserver_id: int | None = None):
         endpoint = self.endpoint(pageserver_id)
-        endpoint.clear_shared_buffers()
+        endpoint.clear_buffers()
         result = endpoint.safe_psql(f"SELECT COUNT(*) FROM {self.table}")
 
         log.info(f"validate({self.expect_rows}): {result}")
diff --git a/test_runner/regress/test_combocid.py b/test_runner/regress/test_combocid.py
index 57d5b2d8b3..2db16d9f64 100644
--- a/test_runner/regress/test_combocid.py
+++ b/test_runner/regress/test_combocid.py
@@ -5,12 +5,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
 
 def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
     env = neon_env_builder.init_start()
-    endpoint = env.endpoints.create_start(
-        "main",
-        config_lines=[
-            "shared_buffers='1MB'",
-        ],
-    )
+    endpoint = env.endpoints.create_start("main")
 
     conn = endpoint.connect()
     cur = conn.cursor()
@@ -36,7 +31,7 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
 
     # Clear the cache, so that we exercise reconstructing the pages
     # from WAL
-    endpoint.clear_shared_buffers()
+    endpoint.clear_buffers()
 
     # Check that the cursor opened earlier still works. If the
     # combocids are not restored correctly, it won't.
@@ -65,12 +60,7 @@ def test_combocid_lock(neon_env_builder: NeonEnvBuilder):
 
 def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
-    endpoint = env.endpoints.create_start(
-        "main",
-        config_lines=[
-            "shared_buffers='1MB'",
-        ],
-    )
+    endpoint = env.endpoints.create_start("main")
 
     conn = endpoint.connect()
     cur = conn.cursor()
@@ -98,7 +88,7 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
     cur.execute("delete from t")
     # Clear the cache, so that we exercise reconstructing the pages
     # from WAL
-    endpoint.clear_shared_buffers()
+    endpoint.clear_buffers()
 
     # Check that the cursor opened earlier still works. If the
     # combocids are not restored correctly, it won't.
diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py
index 2128bd93dd..382556fd7e 100644
--- a/test_runner/regress/test_explain_with_lfc_stats.py
+++ b/test_runner/regress/test_explain_with_lfc_stats.py
@@ -2,10 +2,13 @@ from __future__ import annotations
 
 from pathlib import Path
 
+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
+from fixtures.utils import USE_LFC
 
 
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
 def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
     env = neon_simple_env
 
@@ -16,8 +19,6 @@ def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
     endpoint = env.endpoints.create_start(
         "main",
         config_lines=[
-            "shared_buffers='1MB'",
-            f"neon.file_cache_path='{cache_dir}/file.cache'",
             "neon.max_file_cache_size='128MB'",
             "neon.file_cache_size_limit='64MB'",
         ],
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index a906e7a243..0b1ac11c16 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -170,7 +170,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
             # re-execute the query, it will make GetPage
             # requests. This does not clear the last-written LSN cache
             # so we still remember the LSNs of the pages.
-            secondary.clear_shared_buffers(cursor=s_cur)
+            secondary.clear_buffers(cursor=s_cur)
 
             if pause_apply:
                 s_cur.execute("SELECT pg_wal_replay_pause()")
diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py
index 3083128d87..377b0fb4d4 100644
--- a/test_runner/regress/test_lfc_resize.py
+++ b/test_runner/regress/test_lfc_resize.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import os
 import random
 import re
 import subprocess
@@ -10,20 +9,24 @@ import time
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, PgBin
+from fixtures.utils import USE_LFC
 
 
 @pytest.mark.timeout(600)
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
 def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
     """
     Test resizing the Local File Cache
     """
     env = neon_simple_env
+    cache_dir = env.repo_dir / "file_cache"
+    cache_dir.mkdir(exist_ok=True)
+    env.create_branch("test_lfc_resize")
     endpoint = env.endpoints.create_start(
         "main",
         config_lines=[
-            "neon.file_cache_path='file.cache'",
-            "neon.max_file_cache_size=512MB",
-            "neon.file_cache_size_limit=512MB",
+            "neon.max_file_cache_size=1GB",
+            "neon.file_cache_size_limit=1GB",
         ],
     )
     n_resize = 10
@@ -63,8 +66,8 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
     cur.execute("select pg_reload_conf()")
     nretries = 10
     while True:
-        lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache"
-        lfc_file_size = os.path.getsize(lfc_file_path)
+        lfc_file_path = endpoint.lfc_path()
+        lfc_file_size = lfc_file_path.stat().st_size
         res = subprocess.run(
             ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True
         )
diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py
index 36dfec969f..17068849d4 100644
--- a/test_runner/regress/test_lfc_working_set_approximation.py
+++ b/test_runner/regress/test_lfc_working_set_approximation.py
@@ -3,11 +3,13 @@ from __future__ import annotations
 import time
 from pathlib import Path
 
+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
-from fixtures.utils import query_scalar
+from fixtures.utils import USE_LFC, query_scalar
 
 
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
 def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
     env = neon_simple_env
 
@@ -18,8 +20,6 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
     endpoint = env.endpoints.create_start(
         "main",
         config_lines=[
-            "shared_buffers='1MB'",
-            f"neon.file_cache_path='{cache_dir}/file.cache'",
             "neon.max_file_cache_size='128MB'",
             "neon.file_cache_size_limit='64MB'",
         ],
@@ -72,9 +72,10 @@ WITH (fillfactor='100');
     # verify working set size after some index access of a few select pages only
     blocks = query_scalar(cur, "select approximate_working_set_size(true)")
     log.info(f"working set size after some index access of a few select pages only {blocks}")
-    assert blocks < 10
+    assert blocks < 12
 
 
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
 def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):
     env = neon_simple_env
 
diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py
index fbf018a167..94c630ffcf 100644
--- a/test_runner/regress/test_local_file_cache.py
+++ b/test_runner/regress/test_local_file_cache.py
@@ -6,10 +6,12 @@ import random
 import threading
 import time
 
+import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder
-from fixtures.utils import query_scalar
+from fixtures.utils import USE_LFC, query_scalar
 
 
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
 def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
@@ -19,8 +21,6 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
     endpoint = env.endpoints.create_start(
         "main",
         config_lines=[
-            "shared_buffers='1MB'",
-            f"neon.file_cache_path='{cache_dir}/file.cache'",
             "neon.max_file_cache_size='64MB'",
             "neon.file_cache_size_limit='10MB'",
         ],
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index df83ca1c44..ba471b7147 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -12,7 +12,7 @@ from fixtures.neon_fixtures import (
     logical_replication_sync,
     wait_for_last_flush_lsn,
 )
-from fixtures.utils import wait_until
+from fixtures.utils import USE_LFC, wait_until
 
 if TYPE_CHECKING:
     from fixtures.neon_fixtures import (
@@ -576,7 +576,15 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van
     # We want all data to fit into shared_buffers because later we stop
     # safekeeper and insert more; this shouldn't cause page requests as they
     # will be stuck.
-    sub = env.endpoints.create("subscriber", config_lines=["shared_buffers=128MB"])
+    sub = env.endpoints.create(
+        "subscriber",
+        config_lines=[
+            "neon.max_file_cache_size = 32MB",
+            "neon.file_cache_size_limit = 32MB",
+        ]
+        if USE_LFC
+        else [],
+    )
     sub.start()
 
     with vanilla_pg.cursor() as pcur:
diff --git a/test_runner/regress/test_oid_overflow.py b/test_runner/regress/test_oid_overflow.py
index f69c1112c7..e2bde8be6f 100644
--- a/test_runner/regress/test_oid_overflow.py
+++ b/test_runner/regress/test_oid_overflow.py
@@ -39,7 +39,7 @@ def test_oid_overflow(neon_env_builder: NeonEnvBuilder):
     oid = cur.fetchall()[0][0]
     log.info(f"t2.relfilenode={oid}")
 
-    endpoint.clear_shared_buffers(cursor=cur)
+    endpoint.clear_buffers(cursor=cur)
 
     cur.execute("SELECT x from t1")
     assert cur.fetchone() == (1,)
diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py
index 471a3b406a..70a7a675df 100644
--- a/test_runner/regress/test_read_validation.py
+++ b/test_runner/regress/test_read_validation.py
@@ -54,7 +54,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
 
             log.info("Clear buffer cache to ensure no stale pages are brought into the cache")
 
-            endpoint.clear_shared_buffers(cursor=c)
+            endpoint.clear_buffers(cursor=c)
 
             cache_entries = query_scalar(
                 c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}"
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index fcebf8d23a..70d558ac5a 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -230,7 +230,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
         return offset
 
     # Insert some records on main branch
-    with env.endpoints.create_start("main") as ep_main:
+    with env.endpoints.create_start("main", config_lines=["shared_buffers=1MB"]) as ep_main:
         with ep_main.cursor() as cur:
             cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)")
         lsn = Lsn(0)
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index cd4e0a5f3b..9c7e851ba8 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -416,7 +416,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn
 
     assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None
 
-    ep.clear_shared_buffers()
+    ep.clear_buffers()
     assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
     assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0
     ep.stop()
diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index d4c2ca7e07..f93fc6bd8b 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -63,7 +63,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
 
     # Clear the buffer cache, to force the VM page to be re-fetched from
     # the page server
-    endpoint.clear_shared_buffers(cursor=cur)
+    endpoint.clear_buffers(cursor=cur)
 
     # Check that an index-only scan doesn't see the deleted row. If the
     # clearing of the VM bit was not replayed correctly, this would incorrectly
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 405f15e488..8fa33b81a9 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -2446,7 +2446,7 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder):
         # generate some data to commit WAL on safekeepers
         endpoint.safe_psql("insert into t select generate_series(1,100), 'action'")
         # clear the buffers
-        endpoint.clear_shared_buffers()
+        endpoint.clear_buffers()
         # read data to fetch pages from pageserver
         endpoint.safe_psql("select sum(i) from t")
 

From 0d1e82f0a7112478d3681bbba1035cbbe2f4b408 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Mon, 25 Nov 2024 11:59:49 +0100
Subject: [PATCH 42/51] Bump futures-* crates, drop unused license, hide
 duplicate crate warnings (#9858)

* The futures-util crate we use was yanked. Bump it and its siblings to
new patch release.
https://github.com/rust-lang/futures-rs/releases/tag/0.3.31
* cargo-deny: Drop an unused license.
* cargo-deny: Don't warn about duplicate crate. Duplicate crates are
unavoidable and the noise just hides real warnings.
---
 Cargo.lock | 28 ++++++++++++++--------------
 deny.toml  |  3 +--
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 665aa4aecc..5be8b97815 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2174,9 +2174,9 @@ dependencies = [
 
 [[package]]
 name = "futures-channel"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
+checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
 dependencies = [
  "futures-core",
  "futures-sink",
@@ -2184,9 +2184,9 @@ dependencies = [
 
 [[package]]
 name = "futures-core"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
+checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
 
 [[package]]
 name = "futures-executor"
@@ -2201,9 +2201,9 @@ dependencies = [
 
 [[package]]
 name = "futures-io"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
+checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
 
 [[package]]
 name = "futures-lite"
@@ -2222,9 +2222,9 @@ dependencies = [
 
 [[package]]
 name = "futures-macro"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
+checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -2233,15 +2233,15 @@ dependencies = [
 
 [[package]]
 name = "futures-sink"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5"
+checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
 
 [[package]]
 name = "futures-task"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004"
+checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
 
 [[package]]
 name = "futures-timer"
@@ -2251,9 +2251,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
 
 [[package]]
 name = "futures-util"
-version = "0.3.30"
+version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
+checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
 dependencies = [
  "futures-channel",
  "futures-core",
diff --git a/deny.toml b/deny.toml
index 8bf643f4ba..7a1eecac99 100644
--- a/deny.toml
+++ b/deny.toml
@@ -33,7 +33,6 @@ reason = "the marvin attack only affects private key decryption, not public key
 [licenses]
 allow = [
     "Apache-2.0",
-    "Artistic-2.0",
     "BSD-2-Clause",
     "BSD-3-Clause",
     "CC0-1.0",
@@ -67,7 +66,7 @@ registries = []
 # More documentation about the 'bans' section can be found here:
 # https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
 [bans]
-multiple-versions = "warn"
+multiple-versions = "allow"
 wildcards = "allow"
 highlight = "all"
 workspace-default-features = "allow"

From 6f6749c4a96673430fa6ecce26ca756ead7e8a46 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 25 Nov 2024 12:01:30 +0000
Subject: [PATCH 43/51] chore: update rustls (#9871)

---
 Cargo.lock | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5be8b97815..98d2e0864a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4182,7 +4182,7 @@ dependencies = [
  "bytes",
  "once_cell",
  "pq_proto",
- "rustls 0.23.16",
+ "rustls 0.23.18",
  "rustls-pemfile 2.1.1",
  "serde",
  "thiserror",
@@ -4518,7 +4518,7 @@ dependencies = [
  "rsa",
  "rstest",
  "rustc-hash",
- "rustls 0.23.16",
+ "rustls 0.23.18",
  "rustls-native-certs 0.8.0",
  "rustls-pemfile 2.1.1",
  "scopeguard",
@@ -5231,9 +5231,9 @@ dependencies = [
 
 [[package]]
 name = "rustls"
-version = "0.23.16"
+version = "0.23.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e"
+checksum = "9c9cc1d47e243d655ace55ed38201c19ae02c148ae56412ab8750e8f0166ab7f"
 dependencies = [
  "log",
  "once_cell",
@@ -5948,7 +5948,7 @@ dependencies = [
  "once_cell",
  "parking_lot 0.12.1",
  "prost",
- "rustls 0.23.16",
+ "rustls 0.23.18",
  "tokio",
  "tonic",
  "tonic-build",
@@ -6031,7 +6031,7 @@ dependencies = [
  "postgres_ffi",
  "remote_storage",
  "reqwest 0.12.4",
- "rustls 0.23.16",
+ "rustls 0.23.18",
  "rustls-native-certs 0.8.0",
  "serde",
  "serde_json",
@@ -6493,7 +6493,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
 dependencies = [
  "ring",
- "rustls 0.23.16",
+ "rustls 0.23.18",
  "tokio",
  "tokio-postgres",
  "tokio-rustls 0.26.0",
@@ -6527,7 +6527,7 @@ version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
 dependencies = [
- "rustls 0.23.16",
+ "rustls 0.23.18",
  "rustls-pki-types",
  "tokio",
 ]
@@ -6936,7 +6936,7 @@ dependencies = [
  "base64 0.22.1",
  "log",
  "once_cell",
- "rustls 0.23.16",
+ "rustls 0.23.18",
  "rustls-pki-types",
  "url",
  "webpki-roots 0.26.1",
@@ -7598,7 +7598,7 @@ dependencies = [
  "regex-automata 0.4.3",
  "regex-syntax 0.8.2",
  "reqwest 0.12.4",
- "rustls 0.23.16",
+ "rustls 0.23.18",
  "scopeguard",
  "serde",
  "serde_json",

From 4630b709627ffea01dd6863f0db84ec4c693bcd6 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Mon, 25 Nov 2024 09:25:18 -0500
Subject: [PATCH 44/51] fix(pageserver): ensure all layers are flushed before
 measuring RSS (#9861)

## Problem

close https://github.com/neondatabase/neon/issues/9761

The test assumed that no new L0 layers are flushed throughout the
process, which is not true.

## Summary of changes

Fix the test case `test_compaction_l0_memory` by flushing in-memory
layers before compaction.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/performance/test_compaction.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py
index 8868dddf39..0cd1080fa7 100644
--- a/test_runner/performance/test_compaction.py
+++ b/test_runner/performance/test_compaction.py
@@ -103,6 +103,9 @@ def test_compaction_l0_memory(neon_compare: NeonCompare):
                     cur.execute(f"update tbl{i} set j = {j};")
 
     wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+    pageserver_http.timeline_checkpoint(
+        tenant_id, timeline_id, compact=False
+    )  # ^1: flush all in-memory layers
     endpoint.stop()
 
     # Check we have generated the L0 stack we expected
@@ -118,7 +121,9 @@ def test_compaction_l0_memory(neon_compare: NeonCompare):
         return v * 1024
 
     before = rss_hwm()
-    pageserver_http.timeline_compact(tenant_id, timeline_id)
+    pageserver_http.timeline_compact(
+        tenant_id, timeline_id
+    )  # ^1: we must ensure during this process no new L0 layers are flushed
     after = rss_hwm()
 
     log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})")
@@ -137,7 +142,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare):
     # To be fixed in https://github.com/neondatabase/neon/issues/8184, after which
     # this memory estimate can be revised far downwards to something that doesn't scale
     # linearly with the layer sizes.
-    MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.5
+    MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25
 
     # If we find that compaction is using more memory, this may indicate a regression
     assert compaction_mapped_rss < MEMORY_ESTIMATE

From 3d380acbd1dd878b908df860a541ce77bd4506f3 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 25 Nov 2024 14:43:32 +0000
Subject: [PATCH 45/51] Bump default Debian version to Bookworm everywhere
 (#9863)

## Problem

We have a couple of CI workflows that still run on Debian Bullseye, and
the default Debian version in images is Bullseye as well (we explicitly
set building on Bookworm)

## Summary of changes
- Run `pgbench-pgvector` on Bookworm (fix a couple of packages)
- Run `trigger_bench_on_ec2_machine_in_eu_central_1` on Bookworm
- Change default `DEBIAN_VERSION` in Dockerfiles to Bookworm
- Make `pinned` docker tag an alias to `pinned-bookworm`
---
 .github/workflows/benchmarking.yml            | 14 +++++++-------
 .github/workflows/build-build-tools-image.yml |  2 +-
 .github/workflows/periodic_pagebench.yml      |  2 +-
 .github/workflows/pin-build-tools-image.yml   |  2 +-
 Dockerfile                                    |  2 +-
 build-tools.Dockerfile                        |  2 +-
 compute/compute-node.Dockerfile               |  2 +-
 7 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 2ad1ee0a42..ea8fee80c2 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -541,7 +541,7 @@ jobs:
 
     runs-on: ${{ matrix.RUNNER }}
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -558,12 +558,12 @@ jobs:
         arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')
 
         cd /home/nonroot
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg110+1_${arch}.deb"
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg110+1_${arch}.deb"
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg110+1_${arch}.deb"
-        dpkg -x libpq5_17.2-1.pgdg110+1_${arch}.deb pg
-        dpkg -x postgresql-16_16.6-1.pgdg110+1_${arch}.deb pg
-        dpkg -x postgresql-client-16_16.6-1.pgdg110+1_${arch}.deb pg
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg120+1_${arch}.deb"
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb"
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg120+1_${arch}.deb"
+        dpkg -x libpq5_17.2-1.pgdg120+1_${arch}.deb pg
+        dpkg -x postgresql-16_16.6-1.pgdg120+1_${arch}.deb pg
+        dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg
 
         mkdir -p /tmp/neon/pg_install/v16/bin
         ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml
index 9e7be76901..93da86a353 100644
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -117,7 +117,7 @@ jobs:
 
       - name: Create multi-arch image
         env:
-          DEFAULT_DEBIAN_VERSION: bullseye
+          DEFAULT_DEBIAN_VERSION: bookworm
           IMAGE_TAG: ${{ needs.check-image.outputs.tag }}
         run: |
           for debian_version in bullseye bookworm; do
diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml
index 1cce348ae2..6b98bc873f 100644
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -29,7 +29,7 @@ jobs:
   trigger_bench_on_ec2_machine_in_eu_central_1:
     runs-on: [ self-hosted, small ]
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml
index c196d07d3e..5b43d97de6 100644
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -94,7 +94,7 @@ jobs:
 
       - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
         env:
-          DEFAULT_DEBIAN_VERSION: bullseye
+          DEFAULT_DEBIAN_VERSION: bookworm
         run: |
           for debian_version in bullseye bookworm; do
             tags=()
diff --git a/Dockerfile b/Dockerfile
index 785dd4598e..e888efbae2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,7 +7,7 @@ ARG IMAGE=build-tools
 ARG TAG=pinned
 ARG DEFAULT_PG_VERSION=17
 ARG STABLE_PG_VERSION=16
-ARG DEBIAN_VERSION=bullseye
+ARG DEBIAN_VERSION=bookworm
 ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
 
 # Build Postgres
diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile
index 24e5bbf46f..4f491afec5 100644
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -1,4 +1,4 @@
-ARG DEBIAN_VERSION=bullseye
+ARG DEBIAN_VERSION=bookworm
 
 FROM debian:bookworm-slim AS pgcopydb_builder
 ARG DEBIAN_VERSION
diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 7c21c67a0a..2fcd9985bc 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -3,7 +3,7 @@ ARG REPOSITORY=neondatabase
 ARG IMAGE=build-tools
 ARG TAG=pinned
 ARG BUILD_TAG
-ARG DEBIAN_VERSION=bullseye
+ARG DEBIAN_VERSION=bookworm
 ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
 
 #########################################################################################

From 77630e5408c0771b0e1db020f8d81a7be9728391 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 25 Nov 2024 15:59:12 +0100
Subject: [PATCH 46/51] Address beta clippy lint needless_lifetimes (#9877)

The 1.82.0 version of Rust will be stable soon, let's get the clippy
lint fixes in before the compiler version upgrade.
---
 libs/vm_monitor/src/cgroup.rs                               | 4 ++--
 .../virtual_file/owned_buffers_io/aligned_buffer/slice.rs   | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs
index 3223765016..1d70cedcf9 100644
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -218,7 +218,7 @@ impl MemoryStatus {
     fn debug_slice(slice: &[Self]) -> impl '_ + Debug {
         struct DS<'a>(&'a [MemoryStatus]);
 
-        impl<'a> Debug for DS<'a> {
+        impl Debug for DS<'_> {
             fn fmt(&self, f: &mut Formatter) -> fmt::Result {
                 f.debug_struct("[MemoryStatus]")
                     .field(
@@ -233,7 +233,7 @@ impl MemoryStatus {
 
         struct Fields<'a, F>(&'a [MemoryStatus], F);
 
-        impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> {
+        impl<F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'_, F> {
             fn fmt(&self, f: &mut Formatter) -> fmt::Result {
                 f.debug_list().entries(self.0.iter().map(&self.1)).finish()
             }
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs
index 6cecf34c1c..1952b82578 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs
@@ -19,7 +19,7 @@ impl<'a, const N: usize, const A: usize> AlignedSlice<'a, N, ConstAlign<A>> {
     }
 }
 
-impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> {
+impl<const N: usize, A: Alignment> Deref for AlignedSlice<'_, N, A> {
     type Target = [u8; N];
 
     fn deref(&self) -> &Self::Target {
@@ -27,13 +27,13 @@ impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> {
     }
 }
 
-impl<'a, const N: usize, A: Alignment> DerefMut for AlignedSlice<'a, N, A> {
+impl<const N: usize, A: Alignment> DerefMut for AlignedSlice<'_, N, A> {
     fn deref_mut(&mut self) -> &mut Self::Target {
         self.buf
     }
 }
 
-impl<'a, const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'a, N, A> {
+impl<const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'_, N, A> {
     fn as_ref(&self) -> &[u8; N] {
         self.buf
     }

From 441612c1ce11f92d0fad1226be0e42f5500c2c46 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Mon, 25 Nov 2024 17:21:52 +0200
Subject: [PATCH 47/51] Prefetch on macos (#9875)

## Problem

Prefetch is disabled at MacODS because `posix_fadvise` is not available.
But Neon prefetch is not using this function and for testing at MacOS is
it very convenient that prefetch is available.

## Summary of changes

Define `USE_PREFETCH` in Makefile.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index 8e3b755112..dc67b87239 100644
--- a/Makefile
+++ b/Makefile
@@ -38,6 +38,7 @@ ifeq ($(UNAME_S),Linux)
 	# Seccomp BPF is only available for Linux
 	PG_CONFIGURE_OPTS += --with-libseccomp
 else ifeq ($(UNAME_S),Darwin)
+	PG_CFLAGS += -DUSE_PREFETCH
 	ifndef DISABLE_HOMEBREW
 		# macOS with brew-installed openssl requires explicit paths
 		# It can be configured with OPENSSL_PREFIX variable

From 5c2356988e8beea0418e1292da2adf4b988340ad Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 25 Nov 2024 16:52:39 +0100
Subject: [PATCH 48/51] page_service: add benchmark for batching (#9820)

This PR adds two benchmark to demonstrate the effect of server-side
getpage request batching added in
https://github.com/neondatabase/neon/pull/9321.

For the CPU usage, I found the the `prometheus` crate's built-in CPU
usage accounts the seconds at integer granularity. That's not enough you
reduce the target benchmark runtime for local iteration. So, add a new
`libmetrics` metric and report that.

The benchmarks are disabled because [on our benchmark nodes, timer
resolution isn't high
enough](https://neondb.slack.com/archives/C059ZC138NR/p1732264223207449).
They work (no statement about quality) on my bare-metal devbox.

They will be refined and enabled once we find a fix. Candidates at time
of writing are:
- https://github.com/neondatabase/neon/pull/9822
- https://github.com/neondatabase/neon/pull/9851


Refs:

- Epic: https://github.com/neondatabase/neon/issues/9376
- Extracted from https://github.com/neondatabase/neon/pull/9792
---
 libs/metrics/src/more_process_metrics.rs      |  40 ++-
 test_runner/performance/README.md             |   3 +-
 .../test_pageserver_getpage_merge.py          | 307 ++++++++++++++++++
 3 files changed, 347 insertions(+), 3 deletions(-)
 create mode 100644 test_runner/performance/pageserver/test_pageserver_getpage_merge.py

diff --git a/libs/metrics/src/more_process_metrics.rs b/libs/metrics/src/more_process_metrics.rs
index 920724fdec..13a745e031 100644
--- a/libs/metrics/src/more_process_metrics.rs
+++ b/libs/metrics/src/more_process_metrics.rs
@@ -2,14 +2,28 @@
 
 // This module has heavy inspiration from the prometheus crate's `process_collector.rs`.
 
+use once_cell::sync::Lazy;
+use prometheus::Gauge;
+
 use crate::UIntGauge;
 
 pub struct Collector {
     descs: Vec<prometheus::core::Desc>,
     vmlck: crate::UIntGauge,
+    cpu_seconds_highres: Gauge,
 }
 
-const NMETRICS: usize = 1;
+const NMETRICS: usize = 2;
+
+static CLK_TCK_F64: Lazy<f64> = Lazy::new(|| {
+    let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) };
+    if long == -1 {
+        panic!("sysconf(_SC_CLK_TCK) failed");
+    }
+    let convertible_to_f64: i32 =
+        i32::try_from(long).expect("sysconf(_SC_CLK_TCK) is larger than i32");
+    convertible_to_f64 as f64
+});
 
 impl prometheus::core::Collector for Collector {
     fn desc(&self) -> Vec<&prometheus::core::Desc> {
@@ -27,6 +41,12 @@ impl prometheus::core::Collector for Collector {
                 mfs.extend(self.vmlck.collect())
             }
         }
+        if let Ok(stat) = myself.stat() {
+            let cpu_seconds = stat.utime + stat.stime;
+            self.cpu_seconds_highres
+                .set(cpu_seconds as f64 / *CLK_TCK_F64);
+            mfs.extend(self.cpu_seconds_highres.collect());
+        }
         mfs
     }
 }
@@ -43,7 +63,23 @@ impl Collector {
                 .cloned(),
         );
 
-        Self { descs, vmlck }
+        let cpu_seconds_highres = Gauge::new(
+            "libmetrics_process_cpu_seconds_highres",
+            "Total user and system CPU time spent in seconds.\
+             Sub-second resolution, hence better than `process_cpu_seconds_total`.",
+        )
+        .unwrap();
+        descs.extend(
+            prometheus::core::Collector::desc(&cpu_seconds_highres)
+                .into_iter()
+                .cloned(),
+        );
+
+        Self {
+            descs,
+            vmlck,
+            cpu_seconds_highres,
+        }
     }
 }
 
diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md
index 70d75a6dcf..85096d3770 100644
--- a/test_runner/performance/README.md
+++ b/test_runner/performance/README.md
@@ -15,6 +15,7 @@ Some handy pytest flags for local development:
 - `-k` selects a test to run
 - `--timeout=0` disables our default timeout of 300s (see `setup.cfg`)
 - `--preserve-database-files` to skip cleanup
+- `--out-dir` to produce a JSON with the recorded test metrics
 
 # What performance tests do we have and how we run them
 
@@ -36,6 +37,6 @@ All tests run only once. Usually to obtain more consistent performance numbers,
 
 ## Results collection
 
-Local test results for main branch, and results of daily performance tests, are stored in a neon project deployed in production environment. There is a Grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commit, though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks.
+Local test results for main branch, and results of daily performance tests, are stored in a [neon project](https://console.neon.tech/app/projects/withered-sky-69117821) deployed in production environment. There is a Grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commit, though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks.
 
 There is also an inconsistency in test naming. Test name should be the same across platforms, and results can be differentiated by the platform field. But currently, platform is sometimes included in test name because of the way how parametrization works in pytest. I.e. there is a platform switch in the dashboard with neon-local-ci and neon-staging variants. I.e. some tests under neon-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]` which is highly confusing.
diff --git a/test_runner/performance/pageserver/test_pageserver_getpage_merge.py b/test_runner/performance/pageserver/test_pageserver_getpage_merge.py
new file mode 100644
index 0000000000..34cce9900b
--- /dev/null
+++ b/test_runner/performance/pageserver/test_pageserver_getpage_merge.py
@@ -0,0 +1,307 @@
+import dataclasses
+import json
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import pytest
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
+from fixtures.utils import humantime_to_ms
+
+TARGET_RUNTIME = 60
+
+
+@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095")
+@pytest.mark.parametrize(
+    "tablesize_mib, batch_timeout, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
+    [
+        # the next 4 cases demonstrate how not-batchable workloads suffer from batching timeout
+        (50, None, TARGET_RUNTIME, 1, 128, "not batchable no batching"),
+        (50, "10us", TARGET_RUNTIME, 1, 128, "not batchable 10us timeout"),
+        (50, "1ms", TARGET_RUNTIME, 1, 128, "not batchable 1ms timeout"),
+        # the next 4 cases demonstrate how batchable workloads benefit from batching
+        (50, None, TARGET_RUNTIME, 100, 128, "batchable no batching"),
+        (50, "10us", TARGET_RUNTIME, 100, 128, "batchable 10us timeout"),
+        (50, "100us", TARGET_RUNTIME, 100, 128, "batchable 100us timeout"),
+        (50, "1ms", TARGET_RUNTIME, 100, 128, "batchable 1ms timeout"),
+    ],
+)
+def test_getpage_merge_smoke(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    tablesize_mib: int,
+    batch_timeout: str | None,
+    target_runtime: int,
+    effective_io_concurrency: int,
+    readhead_buffer_size: int,
+    name: str,
+):
+    """
+    Do a bunch of sequential scans and ensure that the pageserver does some merging.
+    """
+
+    #
+    # record perf-related parameters as metrics to simplify processing of results
+    #
+    params: dict[str, tuple[float | int, dict[str, Any]]] = {}
+
+    params.update(
+        {
+            "tablesize_mib": (tablesize_mib, {"unit": "MiB"}),
+            "batch_timeout": (
+                -1 if batch_timeout is None else 1e3 * humantime_to_ms(batch_timeout),
+                {"unit": "us"},
+            ),
+            # target_runtime is just a polite ask to the workload to run for this long
+            "effective_io_concurrency": (effective_io_concurrency, {}),
+            "readhead_buffer_size": (readhead_buffer_size, {}),
+            # name is not a metric
+        }
+    )
+
+    log.info("params: %s", params)
+
+    for param, (value, kwargs) in params.items():
+        zenbenchmark.record(
+            param,
+            metric_value=value,
+            unit=kwargs.pop("unit", ""),
+            report=MetricReport.TEST_PARAM,
+            **kwargs,
+        )
+
+    #
+    # Setup
+    #
+
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+    endpoint = env.endpoints.create_start("main")
+    conn = endpoint.connect()
+    cur = conn.cursor()
+
+    cur.execute("SET max_parallel_workers_per_gather=0")  # disable parallel backends
+    cur.execute(f"SET effective_io_concurrency={effective_io_concurrency}")
+    cur.execute(
+        f"SET neon.readahead_buffer_size={readhead_buffer_size}"
+    )  # this is the current default value, but let's hard-code that
+
+    cur.execute("CREATE EXTENSION IF NOT EXISTS neon;")
+    cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
+
+    log.info("Filling the table")
+    cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)")
+    tablesize = tablesize_mib * 1024 * 1024
+    npages = tablesize // (8 * 1024)
+    cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
+    # TODO: can we force postgres to do sequential scans?
+
+    #
+    # Run the workload, collect `Metrics` before and after, calculate difference, normalize.
+    #
+
+    @dataclass
+    class Metrics:
+        time: float
+        pageserver_getpage_count: float
+        pageserver_vectored_get_count: float
+        compute_getpage_count: float
+        pageserver_cpu_seconds_total: float
+
+        def __sub__(self, other: "Metrics") -> "Metrics":
+            return Metrics(
+                time=self.time - other.time,
+                pageserver_getpage_count=self.pageserver_getpage_count
+                - other.pageserver_getpage_count,
+                pageserver_vectored_get_count=self.pageserver_vectored_get_count
+                - other.pageserver_vectored_get_count,
+                compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
+                pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
+                - other.pageserver_cpu_seconds_total,
+            )
+
+        def normalize(self, by) -> "Metrics":
+            return Metrics(
+                time=self.time / by,
+                pageserver_getpage_count=self.pageserver_getpage_count / by,
+                pageserver_vectored_get_count=self.pageserver_vectored_get_count / by,
+                compute_getpage_count=self.compute_getpage_count / by,
+                pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
+            )
+
+    def get_metrics() -> Metrics:
+        with conn.cursor() as cur:
+            cur.execute(
+                "select value from neon_perf_counters where metric='getpage_wait_seconds_count';"
+            )
+            compute_getpage_count = cur.fetchall()[0][0]
+            pageserver_metrics = ps_http.get_metrics()
+            return Metrics(
+                time=time.time(),
+                pageserver_getpage_count=pageserver_metrics.query_one(
+                    "pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"}
+                ).value,
+                pageserver_vectored_get_count=pageserver_metrics.query_one(
+                    "pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"}
+                ).value,
+                compute_getpage_count=compute_getpage_count,
+                pageserver_cpu_seconds_total=pageserver_metrics.query_one(
+                    "libmetrics_process_cpu_seconds_highres"
+                ).value,
+            )
+
+    def workload() -> Metrics:
+        start = time.time()
+        iters = 0
+        while time.time() - start < target_runtime or iters < 2:
+            log.info("Seqscan %d", iters)
+            if iters == 1:
+                # round zero for warming up
+                before = get_metrics()
+            cur.execute(
+                "select clear_buffer_cache()"
+            )  # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests
+            cur.execute("select sum(data::bigint) from t")
+            assert cur.fetchall()[0][0] == npages * (npages + 1) // 2
+            iters += 1
+        after = get_metrics()
+        return (after - before).normalize(iters - 1)
+
+    env.pageserver.patch_config_toml_nonrecursive({"server_side_batch_timeout": batch_timeout})
+    env.pageserver.restart()
+    metrics = workload()
+
+    log.info("Results: %s", metrics)
+
+    #
+    # Sanity-checks on the collected data
+    #
+    # assert that getpage counts roughly match between compute and ps
+    assert metrics.pageserver_getpage_count == pytest.approx(
+        metrics.compute_getpage_count, rel=0.01
+    )
+
+    #
+    # Record the results
+    #
+
+    for metric, value in dataclasses.asdict(metrics).items():
+        zenbenchmark.record(f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM)
+
+    zenbenchmark.record(
+        "perfmetric.batching_factor",
+        metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count,
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+
+@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095")
+@pytest.mark.parametrize(
+    "batch_timeout", [None, "10us", "20us", "50us", "100us", "200us", "500us", "1ms"]
+)
+def test_timer_precision(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    batch_timeout: str | None,
+):
+    """
+    Determine the batching timeout precision (mean latency) and tail latency impact.
+
+    The baseline is `None`; an ideal batching timeout implementation would increase
+    the mean latency by exactly `batch_timeout`.
+
+    That is not the case with the current implementation, will be addressed in future changes.
+    """
+
+    #
+    # Setup
+    #
+
+    def patch_ps_config(ps_config):
+        ps_config["server_side_batch_timeout"] = batch_timeout
+
+    neon_env_builder.pageserver_config_override = patch_ps_config
+
+    env = neon_env_builder.init_start()
+    endpoint = env.endpoints.create_start("main")
+    conn = endpoint.connect()
+    cur = conn.cursor()
+
+    cur.execute("SET max_parallel_workers_per_gather=0")  # disable parallel backends
+    cur.execute("SET effective_io_concurrency=1")
+
+    cur.execute("CREATE EXTENSION IF NOT EXISTS neon;")
+    cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
+
+    log.info("Filling the table")
+    cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)")
+    tablesize = 50 * 1024 * 1024
+    npages = tablesize // (8 * 1024)
+    cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
+    # TODO: can we force postgres to do sequential scans?
+
+    cur.close()
+    conn.close()
+
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
+
+    endpoint.stop()
+
+    for sk in env.safekeepers:
+        sk.stop()
+
+    #
+    # Run single-threaded pagebench (TODO: dedup with other benchmark code)
+    #
+
+    env.pageserver.allowed_errors.append(
+        # https://github.com/neondatabase/neon/issues/6925
+        r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
+    )
+
+    ps_http = env.pageserver.http_client()
+
+    cmd = [
+        str(env.neon_binpath / "pagebench"),
+        "get-page-latest-lsn",
+        "--mgmt-api-endpoint",
+        ps_http.base_url,
+        "--page-service-connstring",
+        env.pageserver.connstr(password=None),
+        "--num-clients",
+        "1",
+        "--runtime",
+        "10s",
+    ]
+    log.info(f"command: {' '.join(cmd)}")
+    basepath = pg_bin.run_capture(cmd, with_command_header=False)
+    results_path = Path(basepath + ".stdout")
+    log.info(f"Benchmark results at: {results_path}")
+
+    with open(results_path) as f:
+        results = json.load(f)
+    log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
+
+    total = results["total"]
+
+    metric = "latency_mean"
+    zenbenchmark.record(
+        metric,
+        metric_value=humantime_to_ms(total[metric]),
+        unit="ms",
+        report=MetricReport.LOWER_IS_BETTER,
+    )
+
+    metric = "latency_percentiles"
+    for k, v in total[metric].items():
+        zenbenchmark.record(
+            f"{metric}.{k}",
+            metric_value=humantime_to_ms(v),
+            unit="ms",
+            report=MetricReport.LOWER_IS_BETTER,
+        )

From 7a2f0ed8d452cd05bb1b9a85f1cda6e00ead1f85 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 25 Nov 2024 17:29:28 +0000
Subject: [PATCH 49/51] safekeeper: lift decoding and interpretation of WAL to
 the safekeeper (#9746)

## Problem

For any given tenant shard, pageservers receive all of the tenant's WAL
from the safekeeper.
This soft-blocks us from using larger shard counts due to bandwidth
concerns and CPU overhead of filtering
out the records.

## Summary of changes

This PR lifts the decoding and interpretation of WAL from the pageserver
into the safekeeper.

A customised PG replication protocol is used where instead of sending
raw WAL, the safekeeper sends
filtered, interpreted records. The receiver drives the protocol
selection, so, on the pageserver side, usage
of the new protocol is gated by a new pageserver config:
`wal_receiver_protocol`.

 More granularly the changes are:
1. Optionally inject the protocol and shard identity into the arguments
used for starting replication
2. On the safekeeper side, implement a new wal sending primitive which
decodes and interprets records
 before sending them over
3. On the pageserver side, implement the ingestion of this new
replication message type. It's very similar
 to what we already have for raw wal (minus decoding and interpreting).

 ## Notes

* This PR currently uses my [branch of
rust-postgres](https://github.com/neondatabase/rust-postgres/tree/vlad/interpreted-wal-record-replication-support)
which includes the deserialization logic for the new replication message
type. PR for that is open
[here](https://github.com/neondatabase/rust-postgres/pull/32).
* This PR contains changes for both pageservers and safekeepers. It's
safe to merge because the new protocol is disabled by default on the
pageserver side. We can gradually start enabling it in subsequent
releases.
* CI tests are running on https://github.com/neondatabase/neon/pull/9747

 ## Links

 Related: https://github.com/neondatabase/neon/issues/9336
 Epic: https://github.com/neondatabase/neon/issues/9329
---
 Cargo.lock                                    |  11 +-
 libs/pageserver_api/src/config.rs             |   7 +-
 libs/pq_proto/src/lib.rs                      |  36 +++++
 libs/utils/Cargo.toml                         |   1 +
 libs/utils/src/postgres_client.rs             |  95 +++++++++--
 libs/wal_decoder/src/models.rs                |  12 ++
 libs/wal_decoder/src/serialized_batch.rs      |  15 +-
 pageserver/src/bin/pageserver.rs              |   1 +
 pageserver/src/config.rs                      |   5 +
 pageserver/src/pgdatadir_mapping.rs           |   7 +-
 pageserver/src/tenant/timeline.rs             |   3 +-
 pageserver/src/tenant/timeline/walreceiver.rs |   2 +
 .../walreceiver/connection_manager.rs         |  40 +++--
 .../walreceiver/walreceiver_connection.rs     | 149 ++++++++++++++++--
 safekeeper/Cargo.toml                         |   2 +
 safekeeper/src/handler.rs                     |  79 ++++++++++
 safekeeper/src/lib.rs                         |   2 +
 safekeeper/src/recovery.rs                    |  13 +-
 safekeeper/src/send_interpreted_wal.rs        | 121 ++++++++++++++
 safekeeper/src/send_wal.rs                    | 122 +++++++++++---
 safekeeper/src/wal_reader_stream.rs           | 149 ++++++++++++++++++
 .../performance/test_sharded_ingest.py        |  50 +++++-
 test_runner/regress/test_compaction.py        |   7 +-
 test_runner/regress/test_crafted_wal_end.py   |   9 +-
 test_runner/regress/test_subxacts.py          |  12 +-
 .../regress/test_wal_acceptor_async.py        |   6 +-
 26 files changed, 870 insertions(+), 86 deletions(-)
 create mode 100644 safekeeper/src/send_interpreted_wal.rs
 create mode 100644 safekeeper/src/wal_reader_stream.rs

diff --git a/Cargo.lock b/Cargo.lock
index 98d2e0864a..c1a14210de 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4133,7 +4133,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -4146,7 +4146,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5"
 dependencies = [
  "base64 0.20.0",
  "byteorder",
@@ -4165,7 +4165,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -5364,6 +5364,7 @@ dependencies = [
  "itertools 0.10.5",
  "metrics",
  "once_cell",
+ "pageserver_api",
  "parking_lot 0.12.1",
  "postgres",
  "postgres-protocol",
@@ -5395,6 +5396,7 @@ dependencies = [
  "tracing-subscriber",
  "url",
  "utils",
+ "wal_decoder",
  "walproposer",
  "workspace_hack",
 ]
@@ -6466,7 +6468,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5"
 dependencies = [
  "async-trait",
  "byteorder",
@@ -7021,6 +7023,7 @@ dependencies = [
  "serde_assert",
  "serde_json",
  "serde_path_to_error",
+ "serde_with",
  "signal-hook",
  "strum",
  "strum_macros",
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 7666728427..0abca5cdc2 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -18,7 +18,7 @@ use std::{
     str::FromStr,
     time::Duration,
 };
-use utils::logging::LogFormat;
+use utils::{logging::LogFormat, postgres_client::PostgresClientProtocol};
 
 use crate::models::ImageCompressionAlgorithm;
 use crate::models::LsnLease;
@@ -120,6 +120,7 @@ pub struct ConfigToml {
     pub no_sync: Option<bool>,
     #[serde(with = "humantime_serde")]
     pub server_side_batch_timeout: Option<Duration>,
+    pub wal_receiver_protocol: PostgresClientProtocol,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -330,6 +331,9 @@ pub mod defaults {
     pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
 
     pub const DEFAULT_SERVER_SIDE_BATCH_TIMEOUT: Option<&str> = None;
+
+    pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
+        utils::postgres_client::PostgresClientProtocol::Vanilla;
 }
 
 impl Default for ConfigToml {
@@ -418,6 +422,7 @@ impl Default for ConfigToml {
                 .map(|duration| humantime::parse_duration(duration).unwrap()),
             tenant_config: TenantConfigToml::default(),
             no_sync: None,
+            wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
         }
     }
 }
diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index 6c40968496..b7871ab01f 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -562,6 +562,9 @@ pub enum BeMessage<'a> {
         options: &'a [&'a str],
     },
     KeepAlive(WalSndKeepAlive),
+    /// Batch of interpreted, shard filtered WAL records,
+    /// ready for the pageserver to ingest
+    InterpretedWalRecords(InterpretedWalRecordsBody<'a>),
 }
 
 /// Common shorthands.
@@ -672,6 +675,25 @@ pub struct WalSndKeepAlive {
     pub request_reply: bool,
 }
 
+/// Batch of interpreted WAL records used in the interpreted
+/// safekeeper to pageserver protocol.
+///
+/// Note that the pageserver uses the RawInterpretedWalRecordsBody
+/// counterpart of this from the neondatabase/rust-postgres repo.
+/// If you're changing this struct, you likely need to change its
+/// twin as well.
+#[derive(Debug)]
+pub struct InterpretedWalRecordsBody<'a> {
+    /// End of raw WAL in [`Self::data`]
+    pub streaming_lsn: u64,
+    /// Current end of WAL on the server
+    pub commit_lsn: u64,
+    /// Start LSN of the next record in PG WAL.
+    /// Is 0 if the portion of PG WAL did not contain any records.
+    pub next_record_lsn: u64,
+    pub data: &'a [u8],
+}
+
 pub static HELLO_WORLD_ROW: BeMessage = BeMessage::DataRow(&[Some(b"hello world")]);
 
 // single text column
@@ -996,6 +1018,20 @@ impl BeMessage<'_> {
                     Ok(())
                 })?
             }
+
+            BeMessage::InterpretedWalRecords(rec) => {
+                // We use the COPY_DATA_TAG for our custom message
+                // since this tag is interpreted as raw bytes.
+                buf.put_u8(b'd');
+                write_body(buf, |buf| {
+                    buf.put_u8(b'0'); // matches INTERPRETED_WAL_RECORD_TAG in postgres-protocol
+                                      // dependency
+                    buf.put_u64(rec.streaming_lsn);
+                    buf.put_u64(rec.commit_lsn);
+                    buf.put_u64(rec.next_record_lsn);
+                    buf.put_slice(rec.data);
+                });
+            }
         }
         Ok(())
     }
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 4aad0aee2c..f440b81d8f 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -33,6 +33,7 @@ pprof.workspace = true
 regex.workspace = true
 routerify.workspace = true
 serde.workspace = true
+serde_with.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
diff --git a/libs/utils/src/postgres_client.rs b/libs/utils/src/postgres_client.rs
index dba74f5b0b..3073bbde4c 100644
--- a/libs/utils/src/postgres_client.rs
+++ b/libs/utils/src/postgres_client.rs
@@ -7,29 +7,94 @@ use postgres_connection::{parse_host_port, PgConnectionConfig};
 
 use crate::id::TenantTimelineId;
 
+/// Postgres client protocol types
+#[derive(
+    Copy,
+    Clone,
+    PartialEq,
+    Eq,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+    Debug,
+)]
+#[strum(serialize_all = "kebab-case")]
+#[repr(u8)]
+pub enum PostgresClientProtocol {
+    /// Usual Postgres replication protocol
+    Vanilla,
+    /// Custom shard-aware protocol that replicates interpreted records.
+    /// Used to send wal from safekeeper to pageserver.
+    Interpreted,
+}
+
+impl TryFrom<u8> for PostgresClientProtocol {
+    type Error = u8;
+
+    fn try_from(value: u8) -> Result<Self, Self::Error> {
+        Ok(match value {
+            v if v == (PostgresClientProtocol::Vanilla as u8) => PostgresClientProtocol::Vanilla,
+            v if v == (PostgresClientProtocol::Interpreted as u8) => {
+                PostgresClientProtocol::Interpreted
+            }
+            x => return Err(x),
+        })
+    }
+}
+
+pub struct ConnectionConfigArgs<'a> {
+    pub protocol: PostgresClientProtocol,
+
+    pub ttid: TenantTimelineId,
+    pub shard_number: Option<u8>,
+    pub shard_count: Option<u8>,
+    pub shard_stripe_size: Option<u32>,
+
+    pub listen_pg_addr_str: &'a str,
+
+    pub auth_token: Option<&'a str>,
+    pub availability_zone: Option<&'a str>,
+}
+
+impl<'a> ConnectionConfigArgs<'a> {
+    fn options(&'a self) -> Vec<String> {
+        let mut options = vec![
+            "-c".to_owned(),
+            format!("timeline_id={}", self.ttid.timeline_id),
+            format!("tenant_id={}", self.ttid.tenant_id),
+            format!("protocol={}", self.protocol as u8),
+        ];
+
+        if self.shard_number.is_some() {
+            assert!(self.shard_count.is_some());
+            assert!(self.shard_stripe_size.is_some());
+
+            options.push(format!("shard_count={}", self.shard_count.unwrap()));
+            options.push(format!("shard_number={}", self.shard_number.unwrap()));
+            options.push(format!(
+                "shard_stripe_size={}",
+                self.shard_stripe_size.unwrap()
+            ));
+        }
+
+        options
+    }
+}
+
 /// Create client config for fetching WAL from safekeeper on particular timeline.
 /// listen_pg_addr_str is in form host:\[port\].
 pub fn wal_stream_connection_config(
-    TenantTimelineId {
-        tenant_id,
-        timeline_id,
-    }: TenantTimelineId,
-    listen_pg_addr_str: &str,
-    auth_token: Option<&str>,
-    availability_zone: Option<&str>,
+    args: ConnectionConfigArgs,
 ) -> anyhow::Result<PgConnectionConfig> {
     let (host, port) =
-        parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
+        parse_host_port(args.listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
     let port = port.unwrap_or(5432);
     let mut connstr = PgConnectionConfig::new_host_port(host, port)
-        .extend_options([
-            "-c".to_owned(),
-            format!("timeline_id={}", timeline_id),
-            format!("tenant_id={}", tenant_id),
-        ])
-        .set_password(auth_token.map(|s| s.to_owned()));
+        .extend_options(args.options())
+        .set_password(args.auth_token.map(|s| s.to_owned()));
 
-    if let Some(availability_zone) = availability_zone {
+    if let Some(availability_zone) = args.availability_zone {
         connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
     }
 
diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs
index c69f8c869a..7ac425cb5f 100644
--- a/libs/wal_decoder/src/models.rs
+++ b/libs/wal_decoder/src/models.rs
@@ -65,6 +65,18 @@ pub struct InterpretedWalRecord {
     pub xid: TransactionId,
 }
 
+impl InterpretedWalRecord {
+    /// Checks if the WAL record is empty
+    ///
+    /// An empty interpreted WAL record has no data or metadata and does not have to be sent to the
+    /// pageserver.
+    pub fn is_empty(&self) -> bool {
+        self.batch.is_empty()
+            && self.metadata_record.is_none()
+            && matches!(self.flush_uncommitted, FlushUncommittedRecords::No)
+    }
+}
+
 /// The interpreted part of the Postgres WAL record which requires metadata
 /// writes to the underlying storage engine.
 #[derive(Serialize, Deserialize)]
diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs
index 9c0708ebbe..41294da7a0 100644
--- a/libs/wal_decoder/src/serialized_batch.rs
+++ b/libs/wal_decoder/src/serialized_batch.rs
@@ -496,11 +496,16 @@ impl SerializedValueBatch {
         }
     }
 
-    /// Checks if the batch is empty
-    ///
-    /// A batch is empty when it contains no serialized values.
-    /// Note that it may still contain observed values.
+    /// Checks if the batch contains any serialized or observed values
     pub fn is_empty(&self) -> bool {
+        !self.has_data() && self.metadata.is_empty()
+    }
+
+    /// Checks if the batch contains data
+    ///
+    /// Note that if this returns false, it may still contain observed values or
+    /// a metadata record.
+    pub fn has_data(&self) -> bool {
         let empty = self.raw.is_empty();
 
         if cfg!(debug_assertions) && empty {
@@ -510,7 +515,7 @@ impl SerializedValueBatch {
                 .all(|meta| matches!(meta, ValueMeta::Observed(_))));
         }
 
-        empty
+        !empty
     }
 
     /// Returns the number of values serialized in the batch
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 033a9a4619..a8c2c2e992 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -126,6 +126,7 @@ fn main() -> anyhow::Result<()> {
     // after setting up logging, log the effective IO engine choice and read path implementations
     info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
     info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
+    info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
 
     // The tenants directory contains all the pageserver local disk state.
     // Create if not exists and make sure all the contents are durable before proceeding.
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 59ea6fb941..2cf237e72b 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -14,6 +14,7 @@ use remote_storage::{RemotePath, RemoteStorageConfig};
 use std::env;
 use storage_broker::Uri;
 use utils::logging::SecretString;
+use utils::postgres_client::PostgresClientProtocol;
 
 use once_cell::sync::OnceCell;
 use reqwest::Url;
@@ -190,6 +191,8 @@ pub struct PageServerConf {
     /// Maximum amount of time for which a get page request request
     /// might be held up for request merging.
     pub server_side_batch_timeout: Option<Duration>,
+
+    pub wal_receiver_protocol: PostgresClientProtocol,
 }
 
 /// Token for authentication to safekeepers
@@ -350,6 +353,7 @@ impl PageServerConf {
             server_side_batch_timeout,
             tenant_config,
             no_sync,
+            wal_receiver_protocol,
         } = config_toml;
 
         let mut conf = PageServerConf {
@@ -393,6 +397,7 @@ impl PageServerConf {
             import_pgdata_upcall_api,
             import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from),
             import_pgdata_aws_endpoint_url,
+            wal_receiver_protocol,
 
             // ------------------------------------------------------------
             // fields that require additional validation or custom handling
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index f4f184be5a..c491bfe650 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1229,10 +1229,9 @@ impl<'a> DatadirModification<'a> {
     }
 
     pub(crate) fn has_dirty_data(&self) -> bool {
-        !self
-            .pending_data_batch
+        self.pending_data_batch
             .as_ref()
-            .map_or(true, |b| b.is_empty())
+            .map_or(false, |b| b.has_data())
     }
 
     /// Set the current lsn
@@ -1408,7 +1407,7 @@ impl<'a> DatadirModification<'a> {
             Some(pending_batch) => {
                 pending_batch.extend(batch);
             }
-            None if !batch.is_empty() => {
+            None if batch.has_data() => {
                 self.pending_data_batch = Some(batch);
             }
             None => {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 4881be33a6..f6a06e73a7 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2470,6 +2470,7 @@ impl Timeline {
         *guard = Some(WalReceiver::start(
             Arc::clone(self),
             WalReceiverConf {
+                protocol: self.conf.wal_receiver_protocol,
                 wal_connect_timeout,
                 lagging_wal_timeout,
                 max_lsn_wal_lag,
@@ -5896,7 +5897,7 @@ impl<'a> TimelineWriter<'a> {
         batch: SerializedValueBatch,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        if batch.is_empty() {
+        if !batch.has_data() {
             return Ok(());
         }
 
diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs
index 4a3a5c621b..f831f5e48a 100644
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -38,6 +38,7 @@ use storage_broker::BrokerClientChannel;
 use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::postgres_client::PostgresClientProtocol;
 
 use self::connection_manager::ConnectionManagerStatus;
 
@@ -45,6 +46,7 @@ use super::Timeline;
 
 #[derive(Clone)]
 pub struct WalReceiverConf {
+    pub protocol: PostgresClientProtocol,
     /// The timeout on the connection to safekeeper for WAL streaming.
     pub wal_connect_timeout: Duration,
     /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one.
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index de50f217d8..7a64703a30 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -36,7 +36,9 @@ use postgres_connection::PgConnectionConfig;
 use utils::backoff::{
     exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
-use utils::postgres_client::wal_stream_connection_config;
+use utils::postgres_client::{
+    wal_stream_connection_config, ConnectionConfigArgs, PostgresClientProtocol,
+};
 use utils::{
     id::{NodeId, TenantTimelineId},
     lsn::Lsn,
@@ -984,15 +986,33 @@ impl ConnectionManagerState {
                 if info.safekeeper_connstr.is_empty() {
                     return None; // no connection string, ignore sk
                 }
-                match wal_stream_connection_config(
-                    self.id,
-                    info.safekeeper_connstr.as_ref(),
-                    match &self.conf.auth_token {
-                        None => None,
-                        Some(x) => Some(x),
+
+                let (shard_number, shard_count, shard_stripe_size) = match self.conf.protocol {
+                    PostgresClientProtocol::Vanilla => {
+                        (None, None, None)
                     },
-                    self.conf.availability_zone.as_deref(),
-                ) {
+                    PostgresClientProtocol::Interpreted => {
+                        let shard_identity = self.timeline.get_shard_identity();
+                        (
+                            Some(shard_identity.number.0),
+                            Some(shard_identity.count.0),
+                            Some(shard_identity.stripe_size.0),
+                        )
+                    }
+                };
+
+                let connection_conf_args = ConnectionConfigArgs {
+                    protocol: self.conf.protocol,
+                    ttid: self.id,
+                    shard_number,
+                    shard_count,
+                    shard_stripe_size,
+                    listen_pg_addr_str: info.safekeeper_connstr.as_ref(),
+                    auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()),
+                    availability_zone: self.conf.availability_zone.as_deref()
+                };
+
+                match wal_stream_connection_config(connection_conf_args) {
                     Ok(connstr) => Some((*sk_id, info, connstr)),
                     Err(e) => {
                         error!("Failed to create wal receiver connection string from broker data of safekeeper node {}: {e:#}", sk_id);
@@ -1096,6 +1116,7 @@ impl ReconnectReason {
 mod tests {
     use super::*;
     use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
+    use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;
     use url::Host;
 
     fn dummy_broker_sk_timeline(
@@ -1532,6 +1553,7 @@ mod tests {
             timeline,
             cancel: CancellationToken::new(),
             conf: WalReceiverConf {
+                protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
                 wal_connect_timeout: Duration::from_secs(1),
                 lagging_wal_timeout: Duration::from_secs(1),
                 max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 6ac6920d47..1a0e66ceb3 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -36,7 +36,7 @@ use crate::{
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use utils::{id::NodeId, lsn::Lsn};
+use utils::{bin_ser::BeSer, id::NodeId, lsn::Lsn};
 use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
 
 /// Status of the connection.
@@ -291,6 +291,15 @@ pub(super) async fn handle_walreceiver_connection(
                 connection_status.latest_connection_update = now;
                 connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end()));
             }
+            ReplicationMessage::RawInterpretedWalRecords(raw) => {
+                connection_status.latest_connection_update = now;
+                if !raw.data().is_empty() {
+                    connection_status.latest_wal_update = now;
+                }
+
+                connection_status.commit_lsn = Some(Lsn::from(raw.commit_lsn()));
+                connection_status.streaming_lsn = Some(Lsn::from(raw.streaming_lsn()));
+            }
             &_ => {}
         };
         if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
@@ -298,7 +307,130 @@ pub(super) async fn handle_walreceiver_connection(
             return Ok(());
         }
 
+        async fn commit(
+            modification: &mut DatadirModification<'_>,
+            uncommitted: &mut u64,
+            filtered: &mut u64,
+            ctx: &RequestContext,
+        ) -> anyhow::Result<()> {
+            WAL_INGEST
+                .records_committed
+                .inc_by(*uncommitted - *filtered);
+            modification.commit(ctx).await?;
+            *uncommitted = 0;
+            *filtered = 0;
+            Ok(())
+        }
+
         let status_update = match replication_message {
+            ReplicationMessage::RawInterpretedWalRecords(raw) => {
+                WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64);
+
+                let mut uncommitted_records = 0;
+                let mut filtered_records = 0;
+
+                // This is the end LSN of the raw WAL from which the records
+                // were interpreted.
+                let streaming_lsn = Lsn::from(raw.streaming_lsn());
+                tracing::debug!(
+                    "Received WAL up to {streaming_lsn} with next_record_lsn={}",
+                    Lsn(raw.next_record_lsn().unwrap_or(0))
+                );
+
+                let records = Vec::<InterpretedWalRecord>::des(raw.data()).with_context(|| {
+                    anyhow::anyhow!(
+                        "Failed to deserialize interpreted records ending at LSN {streaming_lsn}"
+                    )
+                })?;
+
+                // We start the modification at 0 because each interpreted record
+                // advances it to its end LSN. 0 is just an initialization placeholder.
+                let mut modification = timeline.begin_modification(Lsn(0));
+
+                for interpreted in records {
+                    if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
+                        && uncommitted_records > 0
+                    {
+                        commit(
+                            &mut modification,
+                            &mut uncommitted_records,
+                            &mut filtered_records,
+                            &ctx,
+                        )
+                        .await?;
+                    }
+
+                    let next_record_lsn = interpreted.next_record_lsn;
+                    let ingested = walingest
+                        .ingest_record(interpreted, &mut modification, &ctx)
+                        .await
+                        .with_context(|| format!("could not ingest record at {next_record_lsn}"))?;
+
+                    if !ingested {
+                        tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
+                        WAL_INGEST.records_filtered.inc();
+                        filtered_records += 1;
+                    }
+
+                    uncommitted_records += 1;
+
+                    // FIXME: this cannot be made pausable_failpoint without fixing the
+                    // failpoint library; in tests, the added amount of debugging will cause us
+                    // to timeout the tests.
+                    fail_point!("walreceiver-after-ingest");
+
+                    // Commit every ingest_batch_size records. Even if we filtered out
+                    // all records, we still need to call commit to advance the LSN.
+                    if uncommitted_records >= ingest_batch_size
+                        || modification.approx_pending_bytes()
+                            > DatadirModification::MAX_PENDING_BYTES
+                    {
+                        commit(
+                            &mut modification,
+                            &mut uncommitted_records,
+                            &mut filtered_records,
+                            &ctx,
+                        )
+                        .await?;
+                    }
+                }
+
+                // Records might have been filtered out on the safekeeper side, but we still
+                // need to advance last record LSN on all shards. If we've not ingested the latest
+                // record, then set the LSN of the modification past it. This way all shards
+                // advance their last record LSN at the same time.
+                let needs_last_record_lsn_advance = match raw.next_record_lsn().map(Lsn::from) {
+                    Some(lsn) if lsn > modification.get_lsn() => {
+                        modification.set_lsn(lsn).unwrap();
+                        true
+                    }
+                    _ => false,
+                };
+
+                if uncommitted_records > 0 || needs_last_record_lsn_advance {
+                    // Commit any uncommitted records
+                    commit(
+                        &mut modification,
+                        &mut uncommitted_records,
+                        &mut filtered_records,
+                        &ctx,
+                    )
+                    .await?;
+                }
+
+                if !caught_up && streaming_lsn >= end_of_wal {
+                    info!("caught up at LSN {streaming_lsn}");
+                    caught_up = true;
+                }
+
+                tracing::debug!(
+                    "Ingested WAL up to {streaming_lsn}. Last record LSN is {}",
+                    timeline.get_last_record_lsn()
+                );
+
+                Some(streaming_lsn)
+            }
+
             ReplicationMessage::XLogData(xlog_data) => {
                 // Pass the WAL data to the decoder, and see if we can decode
                 // more records as a result.
@@ -316,21 +448,6 @@ pub(super) async fn handle_walreceiver_connection(
                     let mut uncommitted_records = 0;
                     let mut filtered_records = 0;
 
-                    async fn commit(
-                        modification: &mut DatadirModification<'_>,
-                        uncommitted: &mut u64,
-                        filtered: &mut u64,
-                        ctx: &RequestContext,
-                    ) -> anyhow::Result<()> {
-                        WAL_INGEST
-                            .records_committed
-                            .inc_by(*uncommitted - *filtered);
-                        modification.commit(ctx).await?;
-                        *uncommitted = 0;
-                        *filtered = 0;
-                        Ok(())
-                    }
-
                     while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? {
                         // It is important to deal with the aligned records as lsn in getPage@LSN is
                         // aligned and can be several bytes bigger. Without this alignment we are
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index ab77b63d54..635a9222e1 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -28,6 +28,7 @@ hyper0.workspace = true
 futures.workspace = true
 once_cell.workspace = true
 parking_lot.workspace = true
+pageserver_api.workspace = true
 postgres.workspace = true
 postgres-protocol.workspace = true
 pprof.workspace = true
@@ -58,6 +59,7 @@ sd-notify.workspace = true
 storage_broker.workspace = true
 tokio-stream.workspace = true
 utils.workspace = true
+wal_decoder.workspace = true
 
 workspace_hack.workspace = true
 
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index 3f00b69cde..cec7c3c7ee 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -2,11 +2,15 @@
 //! protocol commands.
 
 use anyhow::Context;
+use pageserver_api::models::ShardParameters;
+use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
 use std::future::Future;
 use std::str::{self, FromStr};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{debug, info, info_span, Instrument};
+use utils::postgres_client::PostgresClientProtocol;
+use utils::shard::{ShardCount, ShardNumber};
 
 use crate::auth::check_permission;
 use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
@@ -35,6 +39,8 @@ pub struct SafekeeperPostgresHandler {
     pub tenant_id: Option<TenantId>,
     pub timeline_id: Option<TimelineId>,
     pub ttid: TenantTimelineId,
+    pub shard: Option<ShardIdentity>,
+    pub protocol: Option<PostgresClientProtocol>,
     /// Unique connection id is logged in spans for observability.
     pub conn_id: ConnectionId,
     /// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
@@ -107,11 +113,28 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
     ) -> Result<(), QueryError> {
         if let FeStartupPacket::StartupMessage { params, .. } = sm {
             if let Some(options) = params.options_raw() {
+                let mut shard_count: Option<u8> = None;
+                let mut shard_number: Option<u8> = None;
+                let mut shard_stripe_size: Option<u32> = None;
+
                 for opt in options {
                     // FIXME `ztenantid` and `ztimelineid` left for compatibility during deploy,
                     // remove these after the PR gets deployed:
                     // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064
                     match opt.split_once('=') {
+                        Some(("protocol", value)) => {
+                            let raw_value = value
+                                .parse::<u8>()
+                                .with_context(|| format!("Failed to parse {value} as protocol"))?;
+
+                            self.protocol = Some(
+                                PostgresClientProtocol::try_from(raw_value).map_err(|_| {
+                                    QueryError::Other(anyhow::anyhow!(
+                                        "Unexpected client protocol type: {raw_value}"
+                                    ))
+                                })?,
+                            );
+                        }
                         Some(("ztenantid", value)) | Some(("tenant_id", value)) => {
                             self.tenant_id = Some(value.parse().with_context(|| {
                                 format!("Failed to parse {value} as tenant id")
@@ -127,9 +150,54 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
                                 metrics.set_client_az(client_az)
                             }
                         }
+                        Some(("shard_count", value)) => {
+                            shard_count = Some(value.parse::<u8>().with_context(|| {
+                                format!("Failed to parse {value} as shard count")
+                            })?);
+                        }
+                        Some(("shard_number", value)) => {
+                            shard_number = Some(value.parse::<u8>().with_context(|| {
+                                format!("Failed to parse {value} as shard number")
+                            })?);
+                        }
+                        Some(("shard_stripe_size", value)) => {
+                            shard_stripe_size = Some(value.parse::<u32>().with_context(|| {
+                                format!("Failed to parse {value} as shard stripe size")
+                            })?);
+                        }
                         _ => continue,
                     }
                 }
+
+                match self.protocol() {
+                    PostgresClientProtocol::Vanilla => {
+                        if shard_count.is_some()
+                            || shard_number.is_some()
+                            || shard_stripe_size.is_some()
+                        {
+                            return Err(QueryError::Other(anyhow::anyhow!(
+                                "Shard params specified for vanilla protocol"
+                            )));
+                        }
+                    }
+                    PostgresClientProtocol::Interpreted => {
+                        match (shard_count, shard_number, shard_stripe_size) {
+                            (Some(count), Some(number), Some(stripe_size)) => {
+                                let params = ShardParameters {
+                                    count: ShardCount(count),
+                                    stripe_size: ShardStripeSize(stripe_size),
+                                };
+                                self.shard =
+                                    Some(ShardIdentity::from_params(ShardNumber(number), &params));
+                            }
+                            _ => {
+                                return Err(QueryError::Other(anyhow::anyhow!(
+                                    "Shard params were not specified"
+                                )));
+                            }
+                        }
+                    }
+                }
             }
 
             if let Some(app_name) = params.get("application_name") {
@@ -150,6 +218,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
                     tracing::field::debug(self.appname.clone()),
                 );
 
+            if let Some(shard) = self.shard.as_ref() {
+                tracing::Span::current()
+                    .record("shard", tracing::field::display(shard.shard_slug()));
+            }
+
             Ok(())
         } else {
             Err(QueryError::Other(anyhow::anyhow!(
@@ -258,6 +331,8 @@ impl SafekeeperPostgresHandler {
             tenant_id: None,
             timeline_id: None,
             ttid: TenantTimelineId::empty(),
+            shard: None,
+            protocol: None,
             conn_id,
             claims: None,
             auth,
@@ -265,6 +340,10 @@ impl SafekeeperPostgresHandler {
         }
     }
 
+    pub fn protocol(&self) -> PostgresClientProtocol {
+        self.protocol.unwrap_or(PostgresClientProtocol::Vanilla)
+    }
+
     // when accessing management api supply None as an argument
     // when using to authorize tenant pass corresponding tenant id
     fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 6d68b6b59b..abe6e00a66 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -29,6 +29,7 @@ pub mod receive_wal;
 pub mod recovery;
 pub mod remove_wal;
 pub mod safekeeper;
+pub mod send_interpreted_wal;
 pub mod send_wal;
 pub mod state;
 pub mod timeline;
@@ -38,6 +39,7 @@ pub mod timeline_manager;
 pub mod timelines_set;
 pub mod wal_backup;
 pub mod wal_backup_partial;
+pub mod wal_reader_stream;
 pub mod wal_service;
 pub mod wal_storage;
 
diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs
index 9c4149d8f1..7b87166aa0 100644
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -17,6 +17,7 @@ use tokio::{
 use tokio_postgres::replication::ReplicationStream;
 use tokio_postgres::types::PgLsn;
 use tracing::*;
+use utils::postgres_client::{ConnectionConfigArgs, PostgresClientProtocol};
 use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config};
 
 use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
@@ -325,7 +326,17 @@ async fn recovery_stream(
     conf: &SafeKeeperConf,
 ) -> anyhow::Result<String> {
     // TODO: pass auth token
-    let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?;
+    let connection_conf_args = ConnectionConfigArgs {
+        protocol: PostgresClientProtocol::Vanilla,
+        ttid: tli.ttid,
+        shard_number: None,
+        shard_count: None,
+        shard_stripe_size: None,
+        listen_pg_addr_str: &donor.pg_connstr,
+        auth_token: None,
+        availability_zone: None,
+    };
+    let cfg = wal_stream_connection_config(connection_conf_args)?;
     let mut cfg = cfg.to_tokio_postgres_config();
     // It will make safekeeper give out not committed WAL (up to flush_lsn).
     cfg.application_name(&format!("safekeeper_{}", conf.my_id));
diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs
new file mode 100644
index 0000000000..cf0ee276e9
--- /dev/null
+++ b/safekeeper/src/send_interpreted_wal.rs
@@ -0,0 +1,121 @@
+use std::time::Duration;
+
+use anyhow::Context;
+use futures::StreamExt;
+use pageserver_api::shard::ShardIdentity;
+use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend};
+use postgres_ffi::MAX_SEND_SIZE;
+use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder};
+use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio::time::MissedTickBehavior;
+use utils::bin_ser::BeSer;
+use utils::lsn::Lsn;
+use wal_decoder::models::InterpretedWalRecord;
+
+use crate::send_wal::EndWatchView;
+use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder};
+
+/// Shard-aware interpreted record sender.
+/// This is used for sending WAL to the pageserver. Said WAL
+/// is pre-interpreted and filtered for the shard.
+pub(crate) struct InterpretedWalSender<'a, IO> {
+    pub(crate) pgb: &'a mut PostgresBackend<IO>,
+    pub(crate) wal_stream_builder: WalReaderStreamBuilder,
+    pub(crate) end_watch_view: EndWatchView,
+    pub(crate) shard: ShardIdentity,
+    pub(crate) pg_version: u32,
+    pub(crate) appname: Option<String>,
+}
+
+impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
+    /// Send interpreted WAL to a receiver.
+    /// Stops when an error occurs or the receiver is caught up and there's no active compute.
+    ///
+    /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
+    /// convenience.
+    pub(crate) async fn run(self) -> Result<(), CopyStreamHandlerEnd> {
+        let mut wal_position = self.wal_stream_builder.start_pos();
+        let mut wal_decoder =
+            WalStreamDecoder::new(self.wal_stream_builder.start_pos(), self.pg_version);
+
+        let stream = self.wal_stream_builder.build(MAX_SEND_SIZE).await?;
+        let mut stream = std::pin::pin!(stream);
+
+        let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1));
+        keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
+        keepalive_ticker.reset();
+
+        loop {
+            tokio::select! {
+                // Get some WAL from the stream and then: decode, interpret and send it
+                wal = stream.next() => {
+                    let WalBytes { wal, wal_start_lsn: _, wal_end_lsn, available_wal_end_lsn } = match wal {
+                        Some(some) => some?,
+                        None => { break; }
+                    };
+
+                    wal_position = wal_end_lsn;
+                    wal_decoder.feed_bytes(&wal);
+
+                    let mut records = Vec::new();
+                    let mut max_next_record_lsn = None;
+                    while let Some((next_record_lsn, recdata)) = wal_decoder
+                        .poll_decode()
+                        .with_context(|| "Failed to decode WAL")?
+                    {
+                        assert!(next_record_lsn.is_aligned());
+                        max_next_record_lsn = Some(next_record_lsn);
+
+                        // Deserialize and interpret WAL record
+                        let interpreted = InterpretedWalRecord::from_bytes_filtered(
+                            recdata,
+                            &self.shard,
+                            next_record_lsn,
+                            self.pg_version,
+                        )
+                        .with_context(|| "Failed to interpret WAL")?;
+
+                        if !interpreted.is_empty() {
+                            records.push(interpreted);
+                        }
+                    }
+
+                    let mut buf = Vec::new();
+                    records
+                        .ser_into(&mut buf)
+                        .with_context(|| "Failed to serialize interpreted WAL")?;
+
+                    // Reset the keep alive ticker since we are sending something
+                    // over the wire now.
+                    keepalive_ticker.reset();
+
+                    self.pgb
+                        .write_message(&BeMessage::InterpretedWalRecords(InterpretedWalRecordsBody {
+                            streaming_lsn: wal_end_lsn.0,
+                            commit_lsn: available_wal_end_lsn.0,
+                            next_record_lsn: max_next_record_lsn.unwrap_or(Lsn::INVALID).0,
+                            data: buf.as_slice(),
+                        })).await?;
+                }
+
+                // Send a periodic keep alive when the connection has been idle for a while.
+                _ = keepalive_ticker.tick() => {
+                    self.pgb
+                        .write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
+                            wal_end: self.end_watch_view.get().0,
+                            timestamp: get_current_timestamp(),
+                            request_reply: true,
+                        }))
+                        .await?;
+                }
+            }
+        }
+
+        // The loop above ends when the receiver is caught up and there's no more WAL to send.
+        Err(CopyStreamHandlerEnd::ServerInitiated(format!(
+            "ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
+            self.appname, wal_position,
+        )))
+    }
+}
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index aa65ec851b..1acfcad418 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -5,12 +5,15 @@ use crate::handler::SafekeeperPostgresHandler;
 use crate::metrics::RECEIVED_PS_FEEDBACKS;
 use crate::receive_wal::WalReceivers;
 use crate::safekeeper::{Term, TermLsn};
+use crate::send_interpreted_wal::InterpretedWalSender;
 use crate::timeline::WalResidentTimeline;
+use crate::wal_reader_stream::WalReaderStreamBuilder;
 use crate::wal_service::ConnectionId;
 use crate::wal_storage::WalReader;
 use crate::GlobalTimelines;
 use anyhow::{bail, Context as AnyhowContext};
 use bytes::Bytes;
+use futures::future::Either;
 use parking_lot::Mutex;
 use postgres_backend::PostgresBackend;
 use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
@@ -22,6 +25,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use utils::failpoint_support;
 use utils::id::TenantTimelineId;
 use utils::pageserver_feedback::PageserverFeedback;
+use utils::postgres_client::PostgresClientProtocol;
 
 use std::cmp::{max, min};
 use std::net::SocketAddr;
@@ -226,7 +230,7 @@ impl WalSenders {
 
     /// Get remote_consistent_lsn reported by the pageserver. Returns None if
     /// client is not pageserver.
-    fn get_ws_remote_consistent_lsn(self: &Arc<WalSenders>, id: WalSenderId) -> Option<Lsn> {
+    pub fn get_ws_remote_consistent_lsn(self: &Arc<WalSenders>, id: WalSenderId) -> Option<Lsn> {
         let shared = self.mutex.lock();
         let slot = shared.get_slot(id);
         match slot.feedback {
@@ -370,6 +374,16 @@ pub struct WalSenderGuard {
     walsenders: Arc<WalSenders>,
 }
 
+impl WalSenderGuard {
+    pub fn id(&self) -> WalSenderId {
+        self.id
+    }
+
+    pub fn walsenders(&self) -> &Arc<WalSenders> {
+        &self.walsenders
+    }
+}
+
 impl Drop for WalSenderGuard {
     fn drop(&mut self) {
         self.walsenders.unregister(self.id);
@@ -440,11 +454,12 @@ impl SafekeeperPostgresHandler {
         }
 
         info!(
-            "starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}",
+            "starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}, protocol={}",
             start_pos,
             end_pos,
             matches!(end_watch, EndWatch::Flush(_)),
-            appname
+            appname,
+            self.protocol(),
         );
 
         // switch to copy
@@ -456,21 +471,51 @@ impl SafekeeperPostgresHandler {
         // not synchronized with sends, so this avoids deadlocks.
         let reader = pgb.split().context("START_REPLICATION split")?;
 
+        let send_fut = match self.protocol() {
+            PostgresClientProtocol::Vanilla => {
+                let sender = WalSender {
+                    pgb,
+                    // should succeed since we're already holding another guard
+                    tli: tli.wal_residence_guard().await?,
+                    appname,
+                    start_pos,
+                    end_pos,
+                    term,
+                    end_watch,
+                    ws_guard: ws_guard.clone(),
+                    wal_reader,
+                    send_buf: vec![0u8; MAX_SEND_SIZE],
+                };
+
+                Either::Left(sender.run())
+            }
+            PostgresClientProtocol::Interpreted => {
+                let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000;
+                let end_watch_view = end_watch.view();
+                let wal_stream_builder = WalReaderStreamBuilder {
+                    tli: tli.wal_residence_guard().await?,
+                    start_pos,
+                    end_pos,
+                    term,
+                    end_watch,
+                    wal_sender_guard: ws_guard.clone(),
+                };
+
+                let sender = InterpretedWalSender {
+                    pgb,
+                    wal_stream_builder,
+                    end_watch_view,
+                    shard: self.shard.unwrap(),
+                    pg_version,
+                    appname,
+                };
+
+                Either::Right(sender.run())
+            }
+        };
+
         let tli_cancel = tli.cancel.clone();
 
-        let mut sender = WalSender {
-            pgb,
-            // should succeed since we're already holding another guard
-            tli: tli.wal_residence_guard().await?,
-            appname,
-            start_pos,
-            end_pos,
-            term,
-            end_watch,
-            ws_guard: ws_guard.clone(),
-            wal_reader,
-            send_buf: vec![0u8; MAX_SEND_SIZE],
-        };
         let mut reply_reader = ReplyReader {
             reader,
             ws_guard: ws_guard.clone(),
@@ -479,7 +524,7 @@ impl SafekeeperPostgresHandler {
 
         let res = tokio::select! {
             // todo: add read|write .context to these errors
-            r = sender.run() => r,
+            r = send_fut => r,
             r = reply_reader.run() => r,
             _ = tli_cancel.cancelled() => {
                 return Err(CopyStreamHandlerEnd::Cancelled);
@@ -504,16 +549,22 @@ impl SafekeeperPostgresHandler {
     }
 }
 
+/// TODO(vlad): maybe lift this instead
 /// Walsender streams either up to commit_lsn (normally) or flush_lsn in the
 /// given term (recovery by walproposer or peer safekeeper).
-enum EndWatch {
+#[derive(Clone)]
+pub(crate) enum EndWatch {
     Commit(Receiver<Lsn>),
     Flush(Receiver<TermLsn>),
 }
 
 impl EndWatch {
+    pub(crate) fn view(&self) -> EndWatchView {
+        EndWatchView(self.clone())
+    }
+
     /// Get current end of WAL.
-    fn get(&self) -> Lsn {
+    pub(crate) fn get(&self) -> Lsn {
         match self {
             EndWatch::Commit(r) => *r.borrow(),
             EndWatch::Flush(r) => r.borrow().lsn,
@@ -521,15 +572,44 @@ impl EndWatch {
     }
 
     /// Wait for the update.
-    async fn changed(&mut self) -> anyhow::Result<()> {
+    pub(crate) async fn changed(&mut self) -> anyhow::Result<()> {
         match self {
             EndWatch::Commit(r) => r.changed().await?,
             EndWatch::Flush(r) => r.changed().await?,
         }
         Ok(())
     }
+
+    pub(crate) async fn wait_for_lsn(
+        &mut self,
+        lsn: Lsn,
+        client_term: Option<Term>,
+    ) -> anyhow::Result<Lsn> {
+        loop {
+            let end_pos = self.get();
+            if end_pos > lsn {
+                return Ok(end_pos);
+            }
+            if let EndWatch::Flush(rx) = &self {
+                let curr_term = rx.borrow().term;
+                if let Some(client_term) = client_term {
+                    if curr_term != client_term {
+                        bail!("term changed: requested {}, now {}", client_term, curr_term);
+                    }
+                }
+            }
+            self.changed().await?;
+        }
+    }
 }
 
+pub(crate) struct EndWatchView(EndWatch);
+
+impl EndWatchView {
+    pub(crate) fn get(&self) -> Lsn {
+        self.0.get()
+    }
+}
 /// A half driving sending WAL.
 struct WalSender<'a, IO> {
     pgb: &'a mut PostgresBackend<IO>,
@@ -566,7 +646,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
     ///
     /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
     /// convenience.
-    async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> {
+    async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> {
         loop {
             // Wait for the next portion if it is not there yet, or just
             // update our end of WAL available for sending value, we
diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs
new file mode 100644
index 0000000000..f8c0c502cd
--- /dev/null
+++ b/safekeeper/src/wal_reader_stream.rs
@@ -0,0 +1,149 @@
+use std::sync::Arc;
+
+use async_stream::try_stream;
+use bytes::Bytes;
+use futures::Stream;
+use postgres_backend::CopyStreamHandlerEnd;
+use std::time::Duration;
+use tokio::time::timeout;
+use utils::lsn::Lsn;
+
+use crate::{
+    safekeeper::Term,
+    send_wal::{EndWatch, WalSenderGuard},
+    timeline::WalResidentTimeline,
+};
+
+pub(crate) struct WalReaderStreamBuilder {
+    pub(crate) tli: WalResidentTimeline,
+    pub(crate) start_pos: Lsn,
+    pub(crate) end_pos: Lsn,
+    pub(crate) term: Option<Term>,
+    pub(crate) end_watch: EndWatch,
+    pub(crate) wal_sender_guard: Arc<WalSenderGuard>,
+}
+
+impl WalReaderStreamBuilder {
+    pub(crate) fn start_pos(&self) -> Lsn {
+        self.start_pos
+    }
+}
+
+pub(crate) struct WalBytes {
+    /// Raw PG WAL
+    pub(crate) wal: Bytes,
+    /// Start LSN of [`Self::wal`]
+    #[allow(dead_code)]
+    pub(crate) wal_start_lsn: Lsn,
+    /// End LSN of [`Self::wal`]
+    pub(crate) wal_end_lsn: Lsn,
+    /// End LSN of WAL available on the safekeeper.
+    ///
+    /// For pagservers this will be commit LSN,
+    /// while for the compute it will be the flush LSN.
+    pub(crate) available_wal_end_lsn: Lsn,
+}
+
+impl WalReaderStreamBuilder {
+    /// Builds a stream of Postgres WAL starting from [`Self::start_pos`].
+    /// The stream terminates when the receiver (pageserver) is fully caught up
+    /// and there's no active computes.
+    pub(crate) async fn build(
+        self,
+        buffer_size: usize,
+    ) -> anyhow::Result<impl Stream<Item = Result<WalBytes, CopyStreamHandlerEnd>>> {
+        // TODO(vlad): The code below duplicates functionality from [`crate::send_wal`].
+        // We can make the raw WAL sender use this stream too and remove the duplication.
+        let Self {
+            tli,
+            mut start_pos,
+            mut end_pos,
+            term,
+            mut end_watch,
+            wal_sender_guard,
+        } = self;
+        let mut wal_reader = tli.get_walreader(start_pos).await?;
+        let mut buffer = vec![0; buffer_size];
+
+        const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
+
+        Ok(try_stream! {
+            loop {
+                let have_something_to_send = end_pos > start_pos;
+
+                if !have_something_to_send {
+                    // wait for lsn
+                    let res = timeout(POLL_STATE_TIMEOUT, end_watch.wait_for_lsn(start_pos, term)).await;
+                    match res {
+                        Ok(ok) => {
+                            end_pos = ok?;
+                        },
+                        Err(_) => {
+                            if let EndWatch::Commit(_) = end_watch {
+                                if let Some(remote_consistent_lsn) = wal_sender_guard
+                                    .walsenders()
+                                    .get_ws_remote_consistent_lsn(wal_sender_guard.id())
+                                {
+                                    if tli.should_walsender_stop(remote_consistent_lsn).await {
+                                        // Stop streaming if the receivers are caught up and
+                                        // there's no active compute. This causes the loop in
+                                        // [`crate::send_interpreted_wal::InterpretedWalSender::run`]
+                                        // to exit and terminate the WAL stream.
+                                        return;
+                                    }
+                                }
+                            }
+
+                            continue;
+                        }
+                    }
+                }
+
+
+                assert!(
+                    end_pos > start_pos,
+                    "nothing to send after waiting for WAL"
+                );
+
+                // try to send as much as available, capped by the buffer size
+                let mut chunk_end_pos = start_pos + buffer_size as u64;
+                // if we went behind available WAL, back off
+                if chunk_end_pos >= end_pos {
+                    chunk_end_pos = end_pos;
+                } else {
+                    // If sending not up to end pos, round down to page boundary to
+                    // avoid breaking WAL record not at page boundary, as protocol
+                    // demands. See walsender.c (XLogSendPhysical).
+                    chunk_end_pos = chunk_end_pos
+                        .checked_sub(chunk_end_pos.block_offset())
+                        .unwrap();
+                }
+                let send_size = (chunk_end_pos.0 - start_pos.0) as usize;
+                let buffer = &mut buffer[..send_size];
+                let send_size: usize;
+                {
+                    // If uncommitted part is being pulled, check that the term is
+                    // still the expected one.
+                    let _term_guard = if let Some(t) = term {
+                        Some(tli.acquire_term(t).await?)
+                    } else {
+                        None
+                    };
+                    // Read WAL into buffer. send_size can be additionally capped to
+                    // segment boundary here.
+                    send_size = wal_reader.read(buffer).await?
+                };
+                let wal = Bytes::copy_from_slice(&buffer[..send_size]);
+
+                yield WalBytes {
+                    wal,
+                    wal_start_lsn: start_pos,
+                    wal_end_lsn: start_pos + send_size as u64,
+                    available_wal_end_lsn: end_pos
+                };
+
+                start_pos += send_size as u64;
+            }
+        })
+    }
+}
diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py
index 77e8f2cf17..e965aae5a0 100644
--- a/test_runner/performance/test_sharded_ingest.py
+++ b/test_runner/performance/test_sharded_ingest.py
@@ -15,16 +15,21 @@ from fixtures.neon_fixtures import (
 
 @pytest.mark.timeout(600)
 @pytest.mark.parametrize("shard_count", [1, 8, 32])
+@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"])
 def test_sharded_ingest(
     neon_env_builder: NeonEnvBuilder,
     zenbenchmark: NeonBenchmarker,
     shard_count: int,
+    wal_receiver_protocol: str,
 ):
     """
     Benchmarks sharded ingestion throughput, by ingesting a large amount of WAL into a Safekeeper
     and fanning out to a large number of shards on dedicated Pageservers. Comparing the base case
     (shard_count=1) to the sharded case indicates the overhead of sharding.
     """
+    neon_env_builder.pageserver_config_override = (
+        f"wal_receiver_protocol = '{wal_receiver_protocol}'"
+    )
 
     ROW_COUNT = 100_000_000  # about 7 GB of WAL
 
@@ -50,7 +55,6 @@ def test_sharded_ingest(
     # Start the endpoint.
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
-
     # Ingest data and measure WAL volume and duration.
     with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
@@ -68,4 +72,48 @@ def test_sharded_ingest(
     wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024))
     zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
 
+    total_ingested = 0
+    total_records_received = 0
+    ingested_by_ps = []
+    for pageserver in env.pageservers:
+        ingested = pageserver.http_client().get_metric_value(
+            "pageserver_wal_ingest_bytes_received_total"
+        )
+        records_received = pageserver.http_client().get_metric_value(
+            "pageserver_wal_ingest_records_received_total"
+        )
+
+        if ingested is None:
+            ingested = 0
+
+        if records_received is None:
+            records_received = 0
+
+        ingested_by_ps.append(
+            (
+                pageserver.id,
+                {
+                    "ingested": ingested,
+                    "records_received": records_received,
+                },
+            )
+        )
+
+        total_ingested += int(ingested)
+        total_records_received += int(records_received)
+
+    total_ingested_mb = total_ingested / (1024 * 1024)
+    zenbenchmark.record("wal_ingested", total_ingested_mb, "MB", MetricReport.LOWER_IS_BETTER)
+    zenbenchmark.record(
+        "records_received", total_records_received, "records", MetricReport.LOWER_IS_BETTER
+    )
+
+    ingested_by_ps.sort(key=lambda x: x[0])
+    for _, stats in ingested_by_ps:
+        for k in stats:
+            if k != "records_received":
+                stats[k] /= 1024**2
+
+    log.info(f"WAL ingested by each pageserver {ingested_by_ps}")
+
     assert tenant_get_shards(env, tenant_id) == shards, "shards moved"
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index f71e05924a..79fd256304 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -27,7 +27,8 @@ AGGRESIVE_COMPACTION_TENANT_CONF = {
 
 
 @skip_in_debug_build("only run with release build")
-def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"])
+def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: str):
     """
     This is a smoke test that compaction kicks in. The workload repeatedly churns
     a small number of rows and manually instructs the pageserver to run compaction
@@ -38,8 +39,8 @@ def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder):
 
     # Effectively disable the page cache to rely only on image layers
     # to shorten reads.
-    neon_env_builder.pageserver_config_override = """
-page_cache_size=10
+    neon_env_builder.pageserver_config_override = f"""
+page_cache_size=10; wal_receiver_protocol='{wal_receiver_protocol}'
 """
 
     env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)
diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py
index 23c6fa3a5a..70e71d99cd 100644
--- a/test_runner/regress/test_crafted_wal_end.py
+++ b/test_runner/regress/test_crafted_wal_end.py
@@ -19,7 +19,14 @@ from fixtures.neon_fixtures import NeonEnvBuilder
         "wal_record_crossing_segment_followed_by_small_one",
     ],
 )
-def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str):
+@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"])
+def test_crafted_wal_end(
+    neon_env_builder: NeonEnvBuilder, wal_type: str, wal_receiver_protocol: str
+):
+    neon_env_builder.pageserver_config_override = (
+        f"wal_receiver_protocol = '{wal_receiver_protocol}'"
+    )
+
     env = neon_env_builder.init_start()
     env.create_branch("test_crafted_wal_end")
     env.pageserver.allowed_errors.extend(
diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py
index 7a46f0140c..1d86c353be 100644
--- a/test_runner/regress/test_subxacts.py
+++ b/test_runner/regress/test_subxacts.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
-from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
+import pytest
+from fixtures.neon_fixtures import NeonEnvBuilder, check_restored_datadir_content
 
 
 # Test subtransactions
@@ -9,8 +10,13 @@ from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
 # maintained in the pageserver, so subtransactions are not very exciting for
 # Neon. They are included in the commit record though and updated in the
 # CLOG.
-def test_subxacts(neon_simple_env: NeonEnv, test_output_dir):
-    env = neon_simple_env
+@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"])
+def test_subxacts(neon_env_builder: NeonEnvBuilder, test_output_dir, wal_receiver_protocol):
+    neon_env_builder.pageserver_config_override = (
+        f"wal_receiver_protocol = '{wal_receiver_protocol}'"
+    )
+
+    env = neon_env_builder.init_start()
     endpoint = env.endpoints.create_start("main")
 
     pg_conn = endpoint.connect()
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index 18408b0619..094b10b576 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -622,8 +622,12 @@ async def run_segment_init_failure(env: NeonEnv):
 # Test (injected) failure during WAL segment init.
 # https://github.com/neondatabase/neon/issues/6401
 # https://github.com/neondatabase/neon/issues/6402
-def test_segment_init_failure(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"])
+def test_segment_init_failure(neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: str):
     neon_env_builder.num_safekeepers = 1
+    neon_env_builder.pageserver_config_override = (
+        f"wal_receiver_protocol = '{wal_receiver_protocol}'"
+    )
     env = neon_env_builder.init_start()
 
     asyncio.run(run_segment_init_failure(env))

From 87e4dd23a1d92ad99665b109ba58b7050a934b4d Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Mon, 25 Nov 2024 18:53:26 +0100
Subject: [PATCH 50/51] proxy: Demote all cplane error replies to info log
 level (#9880)

## Problem

The vast majority of the error/warn logs from cplane are about time or
data transfer quotas exceeded or endpoint-not-found errors and not
operational errors in proxy or cplane.

## Summary of changes

* Demote cplane error replies to info level.
* Raise other errors from warn back to error.
---
 proxy/src/proxy/wake_compute.rs | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs
index 8a672d48dc..4e9206feff 100644
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -1,9 +1,9 @@
-use tracing::{error, info, warn};
+use tracing::{error, info};
 
 use super::connect_compute::ComputeConnectBackend;
 use crate::config::RetryConfig;
 use crate::context::RequestContext;
-use crate::control_plane::errors::WakeComputeError;
+use crate::control_plane::errors::{ControlPlaneError, WakeComputeError};
 use crate::control_plane::CachedNodeInfo;
 use crate::error::ReportableError;
 use crate::metrics::{
@@ -11,6 +11,18 @@ use crate::metrics::{
 };
 use crate::proxy::retry::{retry_after, should_retry};
 
+// Use macro to retain original callsite.
+macro_rules! log_wake_compute_error {
+    (error = ?$error:expr, $num_retries:expr, retriable = $retriable:literal) => {
+        match $error {
+            WakeComputeError::ControlPlane(ControlPlaneError::Message(_)) => {
+                info!(error = ?$error, num_retries = $num_retries, retriable = $retriable, "couldn't wake compute node")
+            }
+            _ => error!(error = ?$error, num_retries = $num_retries, retriable = $retriable, "couldn't wake compute node"),
+        }
+    };
+}
+
 pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
     num_retries: &mut u32,
     ctx: &RequestContext,
@@ -20,7 +32,7 @@ pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
     loop {
         match api.wake_compute(ctx).await {
             Err(e) if !should_retry(&e, *num_retries, config) => {
-                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                log_wake_compute_error!(error = ?e, num_retries, retriable = false);
                 report_error(&e, false);
                 Metrics::get().proxy.retries_metric.observe(
                     RetriesMetricGroup {
@@ -32,7 +44,7 @@ pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
                 return Err(e);
             }
             Err(e) => {
-                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+                log_wake_compute_error!(error = ?e, num_retries, retriable = true);
                 report_error(&e, true);
             }
             Ok(n) => {

From 7404887b810403768a950006335877aa6a966c8d Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Mon, 25 Nov 2024 20:35:32 +0100
Subject: [PATCH 51/51] proxy: Demote errors from cplane request routines to
 debug (#9886)

## Problem

Any errors from these async blocks are unconditionally logged at error
level
even though we already handle such errors based on context.

## Summary of changes

* Log raw errors from creating and executing cplane requests at debug
level.
* Inline macro calls to retain the correct callsite.
---
 proxy/src/control_plane/client/mock.rs |  2 +-
 proxy/src/control_plane/client/neon.rs | 13 ++++++-------
 proxy/src/error.rs                     |  6 ------
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs
index 500acad50f..9537d717a1 100644
--- a/proxy/src/control_plane/client/mock.rs
+++ b/proxy/src/control_plane/client/mock.rs
@@ -114,7 +114,7 @@ impl MockControlPlane {
 
             Ok((secret, allowed_ips))
         }
-        .map_err(crate::error::log_error::<GetAuthInfoError>)
+        .inspect_err(|e: &GetAuthInfoError| tracing::error!("{e}"))
         .instrument(info_span!("postgres", url = self.endpoint.as_str()))
         .await?;
         Ok(AuthInfo {
diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs
index 757ea6720a..2cad981d01 100644
--- a/proxy/src/control_plane/client/neon.rs
+++ b/proxy/src/control_plane/client/neon.rs
@@ -134,8 +134,8 @@ impl NeonControlPlaneClient {
                 project_id: body.project_id,
             })
         }
-        .map_err(crate::error::log_error)
-        .instrument(info_span!("http", id = request_id))
+        .inspect_err(|e| tracing::debug!(error = ?e))
+        .instrument(info_span!("do_get_auth_info"))
         .await
     }
 
@@ -193,8 +193,8 @@ impl NeonControlPlaneClient {
 
             Ok(rules)
         }
-        .map_err(crate::error::log_error)
-        .instrument(info_span!("http", id = request_id))
+        .inspect_err(|e| tracing::debug!(error = ?e))
+        .instrument(info_span!("do_get_endpoint_jwks"))
         .await
     }
 
@@ -252,9 +252,8 @@ impl NeonControlPlaneClient {
 
             Ok(node)
         }
-        .map_err(crate::error::log_error)
-        // TODO: redo this span stuff
-        .instrument(info_span!("http", id = request_id))
+        .inspect_err(|e| tracing::debug!(error = ?e))
+        .instrument(info_span!("do_wake_compute"))
         .await
     }
 }
diff --git a/proxy/src/error.rs b/proxy/src/error.rs
index 7b693a7418..2221aac407 100644
--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -10,12 +10,6 @@ pub(crate) fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Err
     io::Error::new(io::ErrorKind::Other, e)
 }
 
-/// A small combinator for pluggable error logging.
-pub(crate) fn log_error<E: fmt::Display>(e: E) -> E {
-    tracing::error!("{e}");
-    e
-}
-
 /// Marks errors that may be safely shown to a client.
 /// This trait can be seen as a specialized version of [`ToString`].
 ///