safekeeper: add term_bump endpoint.

When walproposer observes a higher term, it now restarts instead of crashing the whole compute with PANIC; this avoids a compute crash after a term_bump call. After a successful election we still check last_log_term of the highest given vote to ensure the basebackup is good, and PANIC otherwise. The term_bump endpoint will be used for migration per 035-safekeeper-dynamic-membership-change.md and https://github.com/neondatabase/docs/pull/21. Ref: https://github.com/neondatabase/neon/issues/8700
Reorder sk routes a bit.
2026-05-25 00:50:36 +00:00 · 2024-08-19 14:47:39 +03:00 · 2024-08-19 14:47:32 +03:00
141 changed files with 1695 additions and 2356 deletions
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -23,30 +23,10 @@ platforms = [
 ]

 [final-excludes]
-workspace-members = [
-    # vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
-    # it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
-    # from depending on workspace-hack because most of the dependencies are not used.
-    "vm_monitor",
-    # All of these exist in libs and are not usually built independently.
-    # Putting workspace hack there adds a bottleneck for cargo builds.
-    "compute_api",
-    "consumption_metrics",
-    "desim",
-    "metrics",
-    "pageserver_api",
-    "postgres_backend",
-    "postgres_connection",
-    "postgres_ffi",
-    "pq_proto",
-    "remote_storage",
-    "safekeeper_api",
-    "tenant_size_model",
-    "tracing-utils",
-    "utils",
-    "wal_craft",
-    "walproposer",
-]
+# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
+# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
+# from depending on workspace-hack because most of the dependencies are not used.
+workspace-members = ["vm_monitor"]

 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -43,7 +43,7 @@ inputs:
  pg_version:
    description: 'Postgres version to use for tests'
    required: false
-    default: 'v16'
+    default: 'v14'
  benchmark_durations:
    description: 'benchmark durations JSON'
    required: false
@@ -169,8 +169,10 @@ runs:
          EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
        fi

-        if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
+        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
+          cov_prefix=()
        else
          cov_prefix=()
        fi
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -48,8 +48,6 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT  

-    - uses: actions/checkout@v4
-
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -94,16 +94,11 @@ jobs:
      # We run tests with addtional features, that are turned off by default (e.g. in release builds), see
      # corresponding Cargo.toml files for their descriptions.
      - name: Set env variables
-        env:
-          ARCH: ${{ inputs.arch }}
        run: |
          CARGO_FEATURES="--features testing"
-          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
+          if [[ $BUILD_TYPE == "debug" ]]; then
            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
            CARGO_FLAGS="--locked"
-          elif [[ $BUILD_TYPE == "debug" ]]; then
-            cov_prefix=""
-            CARGO_FLAGS="--locked"
          elif [[ $BUILD_TYPE == "release" ]]; then
            cov_prefix=""
            CARGO_FLAGS="--locked --release"
@@ -163,8 +158,6 @@ jobs:
      # Do install *before* running rust tests because they might recompile the
      # binaries with different features/flags.
      - name: Install rust binaries
-        env:
-          ARCH: ${{ inputs.arch }}
        run: |
          # Install target binaries
          mkdir -p /tmp/neon/bin/
@@ -179,7 +172,7 @@ jobs:
          done

          # Install test executables and write list of all binaries (for code coverage)
-          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
+          if [[ $BUILD_TYPE == "debug" ]]; then
            # Keep bloated coverage data files away from the rest of the artifact
            mkdir -p /tmp/coverage/

@@ -250,8 +243,8 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    # Don't run regression tests on debug arm64 builds
-    if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
+    # Run test on x64 only
+    if: inputs.arch == 'x64'
    needs: [ build-neon ]
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
    container:
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -198,7 +198,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        arch: [ x64, arm64 ]
+        arch: [ x64 ]
        # Do not build or run tests in debug for release branches
        build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
        include:
@@ -280,7 +280,6 @@ jobs:
          save_perf_report: ${{ github.ref_name == 'main' }}
          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
          benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
-          pg_version: v16
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -986,10 +985,10 @@ jobs:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
            gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
              -f deployPgSniRouter=false \
              -f deployProxy=false \
              -f deployStorage=true \
@@ -999,14 +998,14 @@ jobs:
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true

-            gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
+            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
              -f deployStorage=true \
              -f deployStorageBroker=true \
              -f deployStorageController=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
-            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
              -f deployPgSniRouter=true \
              -f deployProxy=true \
              -f deployStorage=false \
@@ -1016,7 +1015,7 @@ jobs:
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true

-            gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
+            gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
              -f deployPgSniRouter=true \
              -f deployProxy=true \
              -f branch=main \
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1208,6 +1208,7 @@ dependencies = [
 "serde_json",
 "serde_with",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -1320,6 +1321,7 @@ dependencies = [
 "serde",
 "serde_with",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -1668,13 +1670,14 @@ dependencies = [
 "smallvec",
 "tracing",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
 name = "diesel"
-version = "2.2.3"
+version = "2.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
+checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
 dependencies = [
 "bitflags 2.4.1",
 "byteorder",
@@ -3144,6 +3147,7 @@ dependencies = [
 "rand 0.8.5",
 "rand_distr",
 "twox-hash",
+ "workspace_hack",
 ]

 [[package]]
@@ -3787,6 +3791,7 @@ dependencies = [
 "strum_macros",
 "thiserror",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -4188,6 +4193,7 @@ dependencies = [
 "tokio-rustls 0.25.0",
 "tokio-util",
 "tracing",
+ "workspace_hack",
 ]

 [[package]]
@@ -4200,6 +4206,7 @@ dependencies = [
 "postgres",
 "tokio-postgres",
 "url",
+ "workspace_hack",
 ]

 [[package]]
@@ -4222,6 +4229,7 @@ dependencies = [
 "serde",
 "thiserror",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -4259,6 +4267,7 @@ dependencies = [
 "thiserror",
 "tokio",
 "tracing",
+ "workspace_hack",
 ]

 [[package]]
@@ -4823,6 +4832,7 @@ dependencies = [
 "toml_edit 0.19.10",
 "tracing",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -5347,6 +5357,7 @@ dependencies = [
 "serde",
 "serde_with",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -5590,12 +5601,11 @@ dependencies = [

 [[package]]
 name = "serde_json"
-version = "1.0.125"
+version = "1.0.96"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
+checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
 dependencies = [
 "itoa",
- "memchr",
 "ryu",
 "serde",
 ]
@@ -6183,6 +6193,7 @@ dependencies = [
 "anyhow",
 "serde",
 "serde_json",
+ "workspace_hack",
 ]

 [[package]]
@@ -6783,6 +6794,7 @@ dependencies = [
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
+ "workspace_hack",
 ]

 [[package]]
@@ -7000,6 +7012,7 @@ dependencies = [
 "url",
 "uuid",
 "walkdir",
+ "workspace_hack",
 ]

 [[package]]
@@ -7078,6 +7091,7 @@ dependencies = [
 "postgres_ffi",
 "regex",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -7098,6 +7112,7 @@ dependencies = [
 "bindgen",
 "postgres_ffi",
 "utils",
+ "workspace_hack",
 ]

 [[package]]
@@ -7654,6 +7669,8 @@ dependencies = [
 "tokio",
 "tokio-rustls 0.24.0",
 "tokio-util",
+ "toml_datetime",
+ "toml_edit 0.19.10",
 "tonic",
 "tower",
 "tracing",
--- a/README.md
+++ b/README.md
@@ -126,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s
 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.

 To run the integration tests or Python scripts (not required to use the code), install
-Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
+Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.


 #### Running neon database
@@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v
 testing locally, it is convenient to run just one set of permutations, like this:

 ```sh
-DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
+DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
 ```

 ## Flamegraphs
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -54,7 +54,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);

-const DEFAULT_PG_VERSION: &str = "16";
+const DEFAULT_PG_VERSION: &str = "15";

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode;
 use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
 use crate::safekeeper::SafekeeperNode;

-pub const DEFAULT_PG_VERSION: u32 = 16;
+pub const DEFAULT_PG_VERSION: u32 = 15;

 //
 // This data structures represents neon_local CLI config
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -217,7 +217,7 @@ impl StorageController {
        Ok(exitcode.success())
    }

-    /// Create our database if it doesn't exist
+    /// Create our database if it doesn't exist, and run migrations.
    ///
    /// This function is equivalent to the `diesel setup` command in the diesel CLI.  We implement
    /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
@@ -382,6 +382,7 @@ impl StorageController {
            )
            .await?;

+            // Run migrations on every startup, in case something changed.
            self.setup_database(postgres_port).await?;
        }

@@ -453,11 +454,6 @@ impl StorageController {
            let jwt_token =
                encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
            args.push(format!("--jwt-token={jwt_token}"));
-
-            let peer_claims = Claims::new(None, Scope::Admin);
-            let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
-                .expect("failed to generate jwt token");
-            args.push(format!("--peer-jwt-token={peer_jwt_token}"));
        }

        if let Some(public_key) = &self.public_key {
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -147,9 +147,9 @@ enum Command {
        #[arg(long)]
        threshold: humantime::Duration,
    },
-    // Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
+    // Drain a set of specified pageservers by moving the primary attachments to pageservers
    // outside of the specified set.
-    BulkMigrate {
+    Drain {
        // Set of pageserver node ids to drain.
        #[arg(long)]
        nodes: Vec<NodeId>,
@@ -163,34 +163,6 @@ enum Command {
        #[arg(long)]
        dry_run: Option<bool>,
    },
-    /// Start draining the specified pageserver.
-    /// The drain is complete when the schedulling policy returns to active.
-    StartDrain {
-        #[arg(long)]
-        node_id: NodeId,
-    },
-    /// Cancel draining the specified pageserver and wait for `timeout`
-    /// for the operation to be canceled. May be retried.
-    CancelDrain {
-        #[arg(long)]
-        node_id: NodeId,
-        #[arg(long)]
-        timeout: humantime::Duration,
-    },
-    /// Start filling the specified pageserver.
-    /// The drain is complete when the schedulling policy returns to active.
-    StartFill {
-        #[arg(long)]
-        node_id: NodeId,
-    },
-    /// Cancel filling the specified pageserver and wait for `timeout`
-    /// for the operation to be canceled. May be retried.
-    CancelFill {
-        #[arg(long)]
-        node_id: NodeId,
-        #[arg(long)]
-        timeout: humantime::Duration,
-    },
 }

 #[derive(Parser)]
@@ -277,34 +249,6 @@ impl FromStr for NodeAvailabilityArg {
    }
 }

-async fn wait_for_scheduling_policy<F>(
-    client: Client,
-    node_id: NodeId,
-    timeout: Duration,
-    f: F,
-) -> anyhow::Result<NodeSchedulingPolicy>
-where
-    F: Fn(NodeSchedulingPolicy) -> bool,
-{
-    let waiter = tokio::time::timeout(timeout, async move {
-        loop {
-            let node = client
-                .dispatch::<(), NodeDescribeResponse>(
-                    Method::GET,
-                    format!("control/v1/node/{node_id}"),
-                    None,
-                )
-                .await?;
-
-            if f(node.scheduling) {
-                return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
-            }
-        }
-    });
-
-    Ok(waiter.await??)
-}
-
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();
@@ -684,7 +628,7 @@ async fn main() -> anyhow::Result<()> {
                })
                .await?;
        }
-        Command::BulkMigrate {
+        Command::Drain {
            nodes,
            concurrency,
            max_shards,
@@ -713,7 +657,7 @@ async fn main() -> anyhow::Result<()> {
            }

            if nodes.len() != node_to_drain_descs.len() {
-                anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
+                anyhow::bail!("Drain requested for node which doesn't exist.")
            }

            node_to_fill_descs.retain(|desc| {
@@ -725,7 +669,7 @@ async fn main() -> anyhow::Result<()> {
            });

            if node_to_fill_descs.is_empty() {
-                anyhow::bail!("There are no nodes to migrate to")
+                anyhow::bail!("There are no nodes to drain to")
            }

            // Set the node scheduling policy to draining for the nodes which
@@ -746,7 +690,7 @@ async fn main() -> anyhow::Result<()> {
                    .await?;
            }

-            // Perform the migration: move each tenant shard scheduled on a node to
+            // Perform the drain: move each tenant shard scheduled on a node to
            // be drained to a node which is being filled. A simple round robin
            // strategy is used to pick the new node.
            let tenants = storcon_client
@@ -759,13 +703,13 @@ async fn main() -> anyhow::Result<()> {

            let mut selected_node_idx = 0;

-            struct MigrationMove {
+            struct DrainMove {
                tenant_shard_id: TenantShardId,
                from: NodeId,
                to: NodeId,
            }

-            let mut moves: Vec<MigrationMove> = Vec::new();
+            let mut moves: Vec<DrainMove> = Vec::new();

            let shards = tenants
                .into_iter()
@@ -795,7 +739,7 @@ async fn main() -> anyhow::Result<()> {
                    continue;
                }

-                moves.push(MigrationMove {
+                moves.push(DrainMove {
                    tenant_shard_id: shard.tenant_shard_id,
                    from: shard
                        .node_attached
@@ -872,67 +816,6 @@ async fn main() -> anyhow::Result<()> {
                failure
            );
        }
-        Command::StartDrain { node_id } => {
-            storcon_client
-                .dispatch::<(), ()>(
-                    Method::PUT,
-                    format!("control/v1/node/{node_id}/drain"),
-                    None,
-                )
-                .await?;
-            println!("Drain started for {node_id}");
-        }
-        Command::CancelDrain { node_id, timeout } => {
-            storcon_client
-                .dispatch::<(), ()>(
-                    Method::DELETE,
-                    format!("control/v1/node/{node_id}/drain"),
-                    None,
-                )
-                .await?;
-
-            println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
-
-            let final_policy =
-                wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
-                    use NodeSchedulingPolicy::*;
-                    matches!(sched, Active | PauseForRestart)
-                })
-                .await?;
-
-            println!(
-                "Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
-            );
-        }
-        Command::StartFill { node_id } => {
-            storcon_client
-                .dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
-                .await?;
-
-            println!("Fill started for {node_id}");
-        }
-        Command::CancelFill { node_id, timeout } => {
-            storcon_client
-                .dispatch::<(), ()>(
-                    Method::DELETE,
-                    format!("control/v1/node/{node_id}/fill"),
-                    None,
-                )
-                .await?;
-
-            println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
-
-            let final_policy =
-                wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
-                    use NodeSchedulingPolicy::*;
-                    matches!(sched, Active)
-                })
-                .await?;
-
-            println!(
-                "Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
-            );
-        }
    }

    Ok(())
--- a/docs/rfcs/033-storage-controller-drain-and-fill.md
+++ b/docs/rfcs/033-storage-controller-drain-and-fill.md
@@ -14,7 +14,7 @@ picked tenant (which requested on-demand activation) for around 30 seconds
 during the restart at 2024-04-03 16:37 UTC.

 Note that lots of shutdowns on loaded pageservers do not finish within the
-[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
+[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
 and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.

 This problem is not yet very acutely felt in storage controller managed pageservers since
--- a/docs/updating-postgres.md
+++ b/docs/updating-postgres.md
@@ -21,21 +21,30 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
 1. Create a new branch based on the stable branch you are updating.

    ```shell
-    git checkout -b my-branch-15 REL_15_STABLE_neon
+    git checkout -b my-branch REL_15_STABLE_neon
    ```

-1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
+1. Tag the last commit on the stable branch you are updating.

-1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts.
+    ```shell
+    git tag REL_15_3_neon
+    ```
+
+1. Push the new tag to the Neon Postgres repository.
+
+    ```shell
+    git push origin REL_15_3_neon
+    ```
+
+1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
+
+1. Rebase the branch you created on the tag and resolve any conflicts.

    ```shell
    git fetch upstream REL_15_4
-    git merge REL_15_4
+    git rebase REL_15_4
    ```

-    In the commit message of the merge commit, mention if there were
-    any non-trivial conflicts or other issues.
-
 1. Run the Postgres test suite to make sure our commits have not affected
 Postgres in a negative way.

@@ -48,7 +57,7 @@ Postgres in a negative way.
 1. Push your branch to the Neon Postgres repository.

    ```shell
-    git push origin my-branch-15
+    git push origin my-branch
    ```

 1. Clone the Neon repository if you have not done so already.
@@ -65,7 +74,7 @@ branch.
 1. Update the Git submodule.

    ```shell
-    git submodule set-branch --branch my-branch-15 vendor/postgres-v15
+    git submodule set-branch --branch my-branch vendor/postgres-v15
    git submodule update --remote vendor/postgres-v15
    ```

@@ -80,12 +89,14 @@ minor Postgres release.

 1. Create a pull request, and wait for CI to go green.

-1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
+1. Force push the rebased Postgres branches into the Neon Postgres repository.

    ```shell
-    git push origin my-branch-15:REL_15_STABLE_neon
+    git push --force origin my-branch:REL_15_STABLE_neon
    ```

+    It may require disabling various branch protections.
+
 1. Update your Neon PR to point at the branches.

    ```shell
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -14,3 +14,5 @@ regex.workspace = true

 utils = { path = "../utils" }
 remote_storage = { version = "0.1", path = "../remote_storage/" }
+
+workspace_hack.workspace = true
--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -6,8 +6,10 @@ license = "Apache-2.0"

 [dependencies]
 anyhow.workspace = true
-chrono = { workspace = true, features = ["serde"] }
+chrono.workspace = true
 rand.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 utils.workspace = true
+
+workspace_hack.workspace = true
--- a/libs/desim/Cargo.toml
+++ b/libs/desim/Cargo.toml
@@ -14,3 +14,5 @@ parking_lot.workspace = true
 hex.workspace = true
 scopeguard.workspace = true
 smallvec = { workspace = true, features = ["write"] }
+
+workspace_hack.workspace = true
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -12,6 +12,8 @@ chrono.workspace = true
 twox-hash.workspace = true
 measured.workspace = true

+workspace_hack.workspace = true
+
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
 measured-process.workspace = true
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -21,9 +21,11 @@ hex.workspace = true
 humantime.workspace = true
 thiserror.workspace = true
 humantime-serde.workspace = true
-chrono = { workspace = true, features = ["serde"] }
+chrono.workspace = true
 itertools.workspace = true

+workspace_hack.workspace = true
+
 [dev-dependencies]
 bincode.workspace = true
 rand.workspace = true
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -8,7 +8,6 @@ use std::time::{Duration, Instant};
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId};

-use crate::models::PageserverUtilization;
 use crate::{
    models::{ShardParameters, TenantConfig},
    shard::{ShardStripeSize, TenantShardId},
@@ -141,11 +140,23 @@ pub struct TenantShardMigrateRequest {
    pub node_id: NodeId,
 }

-#[derive(Serialize, Clone, Debug)]
+/// Utilisation score indicating how good a candidate a pageserver
+/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
+/// Lower values are better.
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
+pub struct UtilizationScore(pub u64);
+
+impl UtilizationScore {
+    pub fn worst() -> Self {
+        UtilizationScore(u64::MAX)
+    }
+}
+
+#[derive(Serialize, Clone, Copy, Debug)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
    // Normal, happy state
-    Active(PageserverUtilization),
+    Active(UtilizationScore),
    // Node is warming up, but we expect it to become available soon. Covers
    // the time span between the re-attach response being composed on the storage controller
    // and the first successful heartbeat after the processing of the re-attach response
@@ -184,9 +195,7 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
        match val {
            // Assume the worst utilisation score to begin with. It will later be updated by
            // the heartbeats.
-            NodeAvailabilityWrapper::Active => {
-                NodeAvailability::Active(PageserverUtilization::full())
-            }
+            NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
            NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
            NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
        }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -348,7 +348,7 @@ impl AuxFilePolicy {

    /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
    pub fn default_tenant_config() -> Self {
-        Self::V2
+        Self::V1
    }
 }

--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -38,7 +38,7 @@ pub struct PageserverUtilization {
    pub max_shard_count: u32,

    /// Cached result of [`Self::score`]
-    pub utilization_score: Option<u64>,
+    pub utilization_score: u64,

    /// When was this snapshot captured, pageserver local time.
    ///
@@ -50,8 +50,6 @@ fn unity_percent() -> Percent {
    Percent::new(0).unwrap()
 }

-pub type RawScore = u64;
-
 impl PageserverUtilization {
    const UTILIZATION_FULL: u64 = 1000000;

@@ -64,7 +62,7 @@ impl PageserverUtilization {
    /// - Negative values are forbidden
    /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
    ///   layer eviction.
-    pub fn score(&self) -> RawScore {
+    pub fn score(&self) -> u64 {
        let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
            * self.disk_usable_pct.get() as u64)
            / 100;
@@ -76,30 +74,8 @@ impl PageserverUtilization {
        std::cmp::max(disk_utilization_score, shard_utilization_score)
    }

-    pub fn cached_score(&mut self) -> RawScore {
-        match self.utilization_score {
-            None => {
-                let s = self.score();
-                self.utilization_score = Some(s);
-                s
-            }
-            Some(s) => s,
-        }
-    }
-
-    /// If a node is currently hosting more work than it can comfortably handle.  This does not indicate that
-    /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
-    pub fn is_overloaded(score: RawScore) -> bool {
-        score >= Self::UTILIZATION_FULL
-    }
-
-    pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
-        if self.shard_count < shard_count {
-            self.shard_count = shard_count;
-
-            // Dirty cache: this will be calculated next time someone retrives the score
-            self.utilization_score = None;
-        }
+    pub fn refresh_score(&mut self) {
+        self.utilization_score = self.score();
    }

    /// A utilization structure that has a full utilization score: use this as a placeholder when
@@ -112,38 +88,7 @@ impl PageserverUtilization {
            disk_usable_pct: Percent::new(100).unwrap(),
            shard_count: 1,
            max_shard_count: 1,
-            utilization_score: Some(Self::UTILIZATION_FULL),
-            captured_at: serde_system_time::SystemTime(SystemTime::now()),
-        }
-    }
-}
-
-/// Test helper
-pub mod test_utilization {
-    use super::PageserverUtilization;
-    use std::time::SystemTime;
-    use utils::{
-        serde_percent::Percent,
-        serde_system_time::{self},
-    };
-
-    // Parameters of the imaginary node used for test utilization instances
-    const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
-    const TEST_SHARDS_MAX: u32 = 1000;
-
-    /// Unit test helper.  Unconditionally compiled because cfg(test) doesn't carry across crates.  Do
-    /// not abuse this function from non-test code.
-    ///
-    /// Emulates a node with a 1000 shard limit and a 1TB disk.
-    pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
-        PageserverUtilization {
-            disk_usage_bytes: disk_wanted_bytes,
-            free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
-            disk_wanted_bytes,
-            disk_usable_pct: Percent::new(100).unwrap(),
-            shard_count,
-            max_shard_count: TEST_SHARDS_MAX,
-            utilization_score: None,
+            utilization_score: Self::UTILIZATION_FULL,
            captured_at: serde_system_time::SystemTime(SystemTime::now()),
        }
    }
@@ -175,7 +120,7 @@ mod tests {
            disk_usage_bytes: u64::MAX,
            free_space_bytes: 0,
            disk_wanted_bytes: u64::MAX,
-            utilization_score: Some(13),
+            utilization_score: 13,
            disk_usable_pct: Percent::new(90).unwrap(),
            shard_count: 100,
            max_shard_count: 200,
--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -18,6 +18,7 @@ tokio-rustls.workspace = true
 tracing.workspace = true

 pq_proto.workspace = true
+workspace_hack.workspace = true

 [dev-dependencies]
 once_cell.workspace = true
--- a/libs/postgres_connection/Cargo.toml
+++ b/libs/postgres_connection/Cargo.toml
@@ -11,5 +11,7 @@ postgres.workspace = true
 tokio-postgres.workspace = true
 url.workspace = true

+workspace_hack.workspace = true
+
 [dev-dependencies]
 once_cell.workspace = true
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -19,6 +19,8 @@ thiserror.workspace = true
 serde.workspace = true
 utils.workspace = true

+workspace_hack.workspace = true
+
 [dev-dependencies]
 env_logger.workspace = true
 postgres.workspace = true
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -14,6 +14,8 @@ postgres.workspace = true
 postgres_ffi.workspace = true
 camino-tempfile.workspace = true

+workspace_hack.workspace = true
+
 [dev-dependencies]
 regex.workspace = true
 utils.workspace = true
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -11,7 +11,9 @@ itertools.workspace = true
 pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
-tokio = { workspace = true, features = ["io-util"] }
+tokio.workspace = true
 tracing.workspace = true
 thiserror.workspace = true
 serde.workspace = true
+
+workspace_hack.workspace = true
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -32,7 +32,7 @@ scopeguard.workspace = true
 metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
-
+workspace_hack.workspace = true
 azure_core.workspace = true
 azure_identity.workspace = true
 azure_storage.workspace = true
@@ -46,4 +46,3 @@ sync_wrapper = { workspace = true, features = ["futures"] }
 camino-tempfile.workspace = true
 test-context.workspace = true
 rand.workspace = true
-tokio = { workspace = true, features = ["test-util"] }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -383,48 +383,6 @@ impl RemoteStorage for AzureBlobStorage {
        }
    }

-    async fn head_object(
-        &self,
-        key: &RemotePath,
-        cancel: &CancellationToken,
-    ) -> Result<ListingObject, DownloadError> {
-        let kind = RequestKind::Head;
-        let _permit = self.permit(kind, cancel).await?;
-
-        let started_at = start_measuring_requests(kind);
-
-        let blob_client = self.client.blob_client(self.relative_path_to_name(key));
-        let properties_future = blob_client.get_properties().into_future();
-
-        let properties_future = tokio::time::timeout(self.timeout, properties_future);
-
-        let res = tokio::select! {
-            res = properties_future => res,
-            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
-        };
-
-        if let Ok(inner) = &res {
-            // do not incl. timeouts as errors in metrics but cancellations
-            let started_at = ScopeGuard::into_inner(started_at);
-            crate::metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, inner, started_at);
-        }
-
-        let data = match res {
-            Ok(Ok(data)) => Ok(data),
-            Ok(Err(sdk)) => Err(to_download_error(sdk)),
-            Err(_timeout) => Err(DownloadError::Timeout),
-        }?;
-
-        let properties = data.blob.properties;
-        Ok(ListingObject {
-            key: key.to_owned(),
-            last_modified: SystemTime::from(properties.last_modified),
-            size: properties.content_length,
-        })
-    }
-
    async fn upload(
        &self,
        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -150,7 +150,7 @@ pub enum ListingMode {
    NoDelimiter,
 }

-#[derive(PartialEq, Eq, Debug, Clone)]
+#[derive(PartialEq, Eq, Debug)]
 pub struct ListingObject {
    pub key: RemotePath,
    pub last_modified: SystemTime,
@@ -215,13 +215,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
        Ok(combined)
    }

-    /// Obtain metadata information about an object.
-    async fn head_object(
-        &self,
-        key: &RemotePath,
-        cancel: &CancellationToken,
-    ) -> Result<ListingObject, DownloadError>;
-
    /// Streams the local file contents into remote into the remote storage entry.
    ///
    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
@@ -370,20 +363,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

-    // See [`RemoteStorage::head_object`].
-    pub async fn head_object(
-        &self,
-        key: &RemotePath,
-        cancel: &CancellationToken,
-    ) -> Result<ListingObject, DownloadError> {
-        match self {
-            Self::LocalFs(s) => s.head_object(key, cancel).await,
-            Self::AwsS3(s) => s.head_object(key, cancel).await,
-            Self::AzureBlob(s) => s.head_object(key, cancel).await,
-            Self::Unreliable(s) => s.head_object(key, cancel).await,
-        }
-    }
-
    /// See [`RemoteStorage::upload`]
    pub async fn upload(
        &self,
@@ -619,7 +598,6 @@ impl ConcurrencyLimiter {
            RequestKind::Delete => &self.write,
            RequestKind::Copy => &self.write,
            RequestKind::TimeTravel => &self.write,
-            RequestKind::Head => &self.read,
        }
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -445,20 +445,6 @@ impl RemoteStorage for LocalFs {
        }
    }

-    async fn head_object(
-        &self,
-        key: &RemotePath,
-        _cancel: &CancellationToken,
-    ) -> Result<ListingObject, DownloadError> {
-        let target_file_path = key.with_base(&self.storage_root);
-        let metadata = file_metadata(&target_file_path).await?;
-        Ok(ListingObject {
-            key: key.clone(),
-            last_modified: metadata.modified()?,
-            size: metadata.len(),
-        })
-    }
-
    async fn upload(
        &self,
        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
--- a/libs/remote_storage/src/metrics.rs
+++ b/libs/remote_storage/src/metrics.rs
@@ -13,7 +13,6 @@ pub(crate) enum RequestKind {
    List = 3,
    Copy = 4,
    TimeTravel = 5,
-    Head = 6,
 }

 use scopeguard::ScopeGuard;
@@ -28,7 +27,6 @@ impl RequestKind {
            List => "list_objects",
            Copy => "copy_object",
            TimeTravel => "time_travel_recover",
-            Head => "head_object",
        }
    }
    const fn as_index(&self) -> usize {
@@ -36,8 +34,7 @@ impl RequestKind {
    }
 }

-const REQUEST_KIND_COUNT: usize = 7;
-pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
+pub(crate) struct RequestTyped<C>([C; 6]);

 impl<C> RequestTyped<C> {
    pub(crate) fn get(&self, kind: RequestKind) -> &C {
@@ -46,8 +43,8 @@ impl<C> RequestTyped<C> {

    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
        use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
-        let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
+        let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
+        let arr = std::array::from_fn::<C, 6, _>(|index| {
            let next = it.next().unwrap();
            assert_eq!(index, next.as_index());
            f(next)
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -23,7 +23,7 @@ use aws_config::{
 use aws_sdk_s3::{
    config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
-    operation::{get_object::GetObjectError, head_object::HeadObjectError},
+    operation::get_object::GetObjectError,
    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
    Client,
 };
@@ -604,78 +604,6 @@ impl RemoteStorage for S3Bucket {
        }
    }

-    async fn head_object(
-        &self,
-        key: &RemotePath,
-        cancel: &CancellationToken,
-    ) -> Result<ListingObject, DownloadError> {
-        let kind = RequestKind::Head;
-        let _permit = self.permit(kind, cancel).await?;
-
-        let started_at = start_measuring_requests(kind);
-
-        let head_future = self
-            .client
-            .head_object()
-            .bucket(self.bucket_name())
-            .key(self.relative_path_to_s3_object(key))
-            .send();
-
-        let head_future = tokio::time::timeout(self.timeout, head_future);
-
-        let res = tokio::select! {
-            res = head_future => res,
-            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
-        };
-
-        let res = res.map_err(|_e| DownloadError::Timeout)?;
-
-        // do not incl. timeouts as errors in metrics but cancellations
-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-
-        let data = match res {
-            Ok(object_output) => object_output,
-            Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
-                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
-                // an error: we expect to sometimes fetch an object and find it missing,
-                // e.g. when probing for timeline indices.
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                    kind,
-                    AttemptOutcome::Ok,
-                    started_at,
-                );
-                return Err(DownloadError::NotFound);
-            }
-            Err(e) => {
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                    kind,
-                    AttemptOutcome::Err,
-                    started_at,
-                );
-
-                return Err(DownloadError::Other(
-                    anyhow::Error::new(e).context("s3 head object"),
-                ));
-            }
-        };
-
-        let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else {
-            return Err(DownloadError::Other(anyhow!(
-                "head_object doesn't contain last_modified or content_length"
-            )))?;
-        };
-        Ok(ListingObject {
-            key: key.to_owned(),
-            last_modified: SystemTime::try_from(last_modified).map_err(|e| {
-                DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}"))
-            })?,
-            size: size as u64,
-        })
-    }
-
    async fn upload(
        &self,
        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -30,7 +30,6 @@ pub struct UnreliableWrapper {
 #[derive(Debug, Hash, Eq, PartialEq)]
 enum RemoteOp {
    ListPrefixes(Option<RemotePath>),
-    HeadObject(RemotePath),
    Upload(RemotePath),
    Download(RemotePath),
    Delete(RemotePath),
@@ -138,16 +137,6 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list(prefix, mode, max_keys, cancel).await
    }

-    async fn head_object(
-        &self,
-        key: &RemotePath,
-        cancel: &CancellationToken,
-    ) -> Result<crate::ListingObject, DownloadError> {
-        self.attempt(RemoteOp::HeadObject(key.clone()))
-            .map_err(DownloadError::Other)?;
-        self.inner.head_object(key, cancel).await
-    }
-
    async fn upload(
        &self,
        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -9,3 +9,5 @@ serde.workspace = true
 serde_with.workspace = true
 const_format.workspace = true
 utils.workspace = true
+
+workspace_hack.workspace = true
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -60,3 +60,16 @@ pub struct TimelineCopyRequest {
    pub target_timeline_id: TimelineId,
    pub until_lsn: Lsn,
 }
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TimelineTermBumpRequest {
+    /// Term to bump to.
+    pub term: Option<u64>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TimelineTermBumpResponse {
+    /// Term before the request.
+    pub previous_term: u64,
+    pub current_term: u64,
+}
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -9,3 +9,5 @@ license.workspace = true
 anyhow.workspace = true
 serde.workspace = true
 serde_json.workspace = true
+
+workspace_hack.workspace = true
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -14,3 +14,5 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
+
+workspace_hack.workspace = true
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -39,7 +39,7 @@ thiserror.workspace = true
 tokio.workspace = true
 tokio-tar.workspace = true
 tokio-util.workspace = true
-toml_edit = { workspace = true, features = ["serde"] }
+toml_edit.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
@@ -54,6 +54,7 @@ walkdir.workspace = true
 pq_proto.workspace = true
 postgres_connection.workspace = true
 metrics.workspace = true
+workspace_hack.workspace = true

 const_format.workspace = true

@@ -70,7 +71,6 @@ criterion.workspace = true
 hex-literal.workspace = true
 camino-tempfile.workspace = true
 serde_assert.workspace = true
-tokio = { workspace = true, features = ["test-util"] }

 [[bench]]
 name = "benchmarks"
--- a/libs/walproposer/Cargo.toml
+++ b/libs/walproposer/Cargo.toml
@@ -9,6 +9,8 @@ anyhow.workspace = true
 utils.workspace = true
 postgres_ffi.workspace = true

+workspace_hack.workspace = true
+
 [build-dependencies]
 anyhow.workspace = true
 bindgen.workspace = true
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -95,7 +95,6 @@ fn main() -> anyhow::Result<()> {
        .allowlist_var("ERROR")
        .allowlist_var("FATAL")
        .allowlist_var("PANIC")
-        .allowlist_var("PG_VERSION_NUM")
        .allowlist_var("WPEVENT")
        .allowlist_var("WL_LATCH_SET")
        .allowlist_var("WL_SOCKET_READABLE")
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -282,11 +282,7 @@ mod tests {
    use std::cell::UnsafeCell;
    use utils::id::TenantTimelineId;

-    use crate::{
-        api_bindings::Level,
-        bindings::{NeonWALReadResult, PG_VERSION_NUM},
-        walproposer::Wrapper,
-    };
+    use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};

    use super::ApiImpl;

@@ -493,79 +489,41 @@ mod tests {

        let (sender, receiver) = sync_channel(1);

-        // Messages definitions are at walproposer.h
-        // xxx: it would be better to extract them from safekeeper crate and
-        // use serialization/deserialization here.
-        let greeting_tag = (b'g' as u64).to_ne_bytes();
-        let proto_version = 2_u32.to_ne_bytes();
-        let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
-        let proposer_id = [0; 16];
-        let system_id = 0_u64.to_ne_bytes();
-        let tenant_id = ttid.tenant_id.as_arr();
-        let timeline_id = ttid.timeline_id.as_arr();
-        let pg_tli = 1_u32.to_ne_bytes();
-        let wal_seg_size = 16777216_u32.to_ne_bytes();
-        let proposer_greeting = [
-            greeting_tag.as_slice(),
-            proto_version.as_slice(),
-            pg_version.as_slice(),
-            proposer_id.as_slice(),
-            system_id.as_slice(),
-            tenant_id.as_slice(),
-            timeline_id.as_slice(),
-            pg_tli.as_slice(),
-            wal_seg_size.as_slice(),
-        ]
-        .concat();
-
-        let voting_tag = (b'v' as u64).to_ne_bytes();
-        let vote_request_term = 3_u64.to_ne_bytes();
-        let proposer_id = [0; 16];
-        let vote_request = [
-            voting_tag.as_slice(),
-            vote_request_term.as_slice(),
-            proposer_id.as_slice(),
-        ]
-        .concat();
-
-        let acceptor_greeting_term = 2_u64.to_ne_bytes();
-        let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
-        let acceptor_greeting = [
-            greeting_tag.as_slice(),
-            acceptor_greeting_term.as_slice(),
-            acceptor_greeting_node_id.as_slice(),
-        ]
-        .concat();
-
-        let vote_response_term = 3_u64.to_ne_bytes();
-        let vote_given = 1_u64.to_ne_bytes();
-        let flush_lsn = 0x539_u64.to_ne_bytes();
-        let truncate_lsn = 0x539_u64.to_ne_bytes();
-        let th_len = 1_u32.to_ne_bytes();
-        let th_term = 2_u64.to_ne_bytes();
-        let th_lsn = 0x539_u64.to_ne_bytes();
-        let timeline_start_lsn = 0x539_u64.to_ne_bytes();
-        let vote_response = [
-            voting_tag.as_slice(),
-            vote_response_term.as_slice(),
-            vote_given.as_slice(),
-            flush_lsn.as_slice(),
-            truncate_lsn.as_slice(),
-            th_len.as_slice(),
-            th_term.as_slice(),
-            th_lsn.as_slice(),
-            timeline_start_lsn.as_slice(),
-        ]
-        .concat();
-
        let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
            wait_events: Cell::new(WaitEventsData {
                sk: std::ptr::null_mut(),
                event_mask: 0,
            }),
-            expected_messages: vec![proposer_greeting, vote_request],
+            expected_messages: vec![
+                // TODO: the greeting bytes below hard-code the Postgres version (160003),
+                // so this test must be updated whenever the Postgres version is bumped.
+                //
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                vec![
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
+                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
+                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
+                ],
+                // VoteRequest(VoteRequest { term: 3 })
+                vec![
+                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0,
+                ],
+            ],
            expected_ptr: AtomicUsize::new(0),
-            safekeeper_replies: vec![acceptor_greeting, vote_response],
+            safekeeper_replies: vec![
+                // Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
+                vec![
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+                ],
+                // VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
+                vec![
+                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
+                    5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
+                    0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
+                ],
+            ],
            replies_ptr: AtomicUsize::new(0),
            sync_channel: sender,
            shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -10,7 +10,6 @@ use pageserver::{
    page_cache,
    repository::Value,
    task_mgr::TaskKind,
-    tenant::storage_layer::inmemory_layer::SerializedBatch,
    tenant::storage_layer::InMemoryLayer,
    virtual_file,
 };
@@ -68,16 +67,12 @@ async fn ingest(
    let layer =
        InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;

-    let data = Value::Image(Bytes::from(vec![0u8; put_size]));
-    let data_ser_size = data.serialized_size().unwrap() as usize;
+    let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
    let ctx = RequestContext::new(
        pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
        pageserver::context::DownloadBehavior::Download,
    );

-    const BATCH_SIZE: usize = 16;
-    let mut batch = Vec::new();
-
    for i in 0..put_count {
        lsn += put_size as u64;

@@ -100,17 +95,7 @@ async fn ingest(
            }
        }

-        batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
-        if batch.len() >= BATCH_SIZE {
-            let this_batch = std::mem::take(&mut batch);
-            let serialized = SerializedBatch::from_values(this_batch);
-            layer.put_batch(serialized, &ctx).await?;
-        }
-    }
-    if !batch.is_empty() {
-        let this_batch = std::mem::take(&mut batch);
-        let serialized = SerializedBatch::from_values(this_batch);
-        layer.put_batch(serialized, &ctx).await?;
+        layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
    }
    layer.freeze(lsn + 1).await;

--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1706,6 +1706,11 @@ async fn timeline_compact_handler(
        flags |= CompactFlags::ForceImageLayerCreation;
    }
    if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
+        if !cfg!(feature = "testing") {
+            return Err(ApiError::InternalServerError(anyhow!(
+                "enhanced_gc_bottom_most_compaction is only available in testing mode"
+            )));
+        }
        flags |= CompactFlags::EnhancedGcBottomMostCompaction;
    }
    let wait_until_uploaded =
@@ -2937,7 +2942,7 @@ pub fn make_router(
        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
-            |r| api_handler(r, timeline_compact_handler),
+            |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -1,10 +1,15 @@
 use std::{num::NonZeroUsize, sync::Arc};

+use crate::tenant::ephemeral_file;
+
 #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
+    PageCached,
    #[serde(rename_all = "snake_case")]
-    Direct { max_concurrency: NonZeroUsize },
+    Direct {
+        max_concurrency: NonZeroUsize,
+    },
 }

 impl Default for L0FlushConfig {
@@ -20,12 +25,14 @@ impl Default for L0FlushConfig {
 pub struct L0FlushGlobalState(Arc<Inner>);

 pub enum Inner {
+    PageCached,
    Direct { semaphore: tokio::sync::Semaphore },
 }

 impl L0FlushGlobalState {
    pub fn new(config: L0FlushConfig) -> Self {
        match config {
+            L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
            L0FlushConfig::Direct { max_concurrency } => {
                let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
                Self(Arc::new(Inner::Direct { semaphore }))
@@ -37,3 +44,13 @@ impl L0FlushGlobalState {
        &self.0
    }
 }
+
+impl L0FlushConfig {
+    pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
+        use L0FlushConfig::*;
+        match self {
+            PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
+            Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
+        }
+    }
+}
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -49,7 +49,7 @@ use tracing::{info, info_span};
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;

-pub const DEFAULT_PG_VERSION: u32 = 16;
+pub const DEFAULT_PG_VERSION: u32 = 15;

 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
@@ -88,8 +88,6 @@ pub async fn shutdown_pageserver(
 ) {
    use std::time::Duration;

-    let started_at = std::time::Instant::now();
-
    // If the orderly shutdown below takes too long, we still want to make
    // sure that all walredo processes are killed and wait()ed on by us, not systemd.
    //
@@ -243,10 +241,7 @@ pub async fn shutdown_pageserver(
    walredo_extraordinary_shutdown_thread.join().unwrap();
    info!("walredo_extraordinary_shutdown_thread done");

-    info!(
-        elapsed_ms = started_at.elapsed().as_millis(),
-        "Shut down successfully completed"
-    );
+    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1803,23 +1803,6 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
    .expect("failed to define a metric")
 });

-pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
-    register_uint_gauge!(
-        "pageserver_utilization_score",
-        "The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_secondary_heatmap_total_size",
-        "The total size in bytes of all layers in the most recently downloaded heatmap.",
-        &["tenant_id", "shard_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
    Upload,
@@ -1870,64 +1853,16 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub struct BackgroundLoopSemaphoreMetrics {
-    counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
-    durations: EnumMap<BackgroundLoopKind, Counter>,
-}
-
-pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
-    || {
-        let counters = register_int_counter_pair_vec!(
-            "pageserver_background_loop_semaphore_wait_start_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls started",
-            "pageserver_background_loop_semaphore_wait_finish_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
-            &["task"],
-        )
-        .unwrap();
-
-        let durations = register_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_duration_seconds",
-            "Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
-            &["task"],
-        )
-        .unwrap();
-
-        BackgroundLoopSemaphoreMetrics {
-            counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
-                counters.with_label_values(&[kind.into()])
-            })),
-            durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
-                durations.with_label_values(&[kind.into()])
-            })),
-        }
-    },
-);
-
-impl BackgroundLoopSemaphoreMetrics {
-    pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
-        struct Record<'a> {
-            metrics: &'a BackgroundLoopSemaphoreMetrics,
-            task: BackgroundLoopKind,
-            _counter_guard: metrics::IntCounterPairGuard,
-            start: Instant,
-        }
-        impl Drop for Record<'_> {
-            fn drop(&mut self) {
-                let elapsed = self.start.elapsed().as_secs_f64();
-                self.metrics.durations[self.task].inc_by(elapsed);
-            }
-        }
-        Record {
-            metrics: self,
-            task,
-            _counter_guard: self.counters[task].guard(),
-            start: Instant::now(),
-        }
-    }
-}
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "pageserver_background_loop_semaphore_wait_start_count",
+        "Counter for background loop concurrency-limiting semaphore acquire calls started",
+        "pageserver_background_loop_semaphore_wait_finish_count",
+        "Counter for background loop concurrency-limiting semaphore acquire calls finished",
+        &["task"],
+    )
+    .unwrap()
+});

 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
@@ -2609,7 +2544,6 @@ use std::time::{Duration, Instant};
 use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;
 use crate::tenant::mgr::TenantSlot;
-use crate::tenant::tasks::BackgroundLoopKind;

 /// Maintain a per timeline gauge in addition to the global gauge.
 pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -15,11 +15,12 @@ use crate::{aux_file, repository::*};
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
+use itertools::Itertools;
 use pageserver_api::key::{
    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
-    CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
+    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
 use pageserver_api::keyspace::SparseKeySpace;
 use pageserver_api::models::AuxFilePolicy;
@@ -36,6 +37,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::pausable_failpoint;
+use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};

 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
@@ -172,7 +174,6 @@ impl Timeline {
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
            pending_directory_entries: Vec::new(),
-            pending_bytes: 0,
            lsn,
        }
    }
@@ -726,17 +727,7 @@ impl Timeline {
    ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
        let current_policy = self.last_aux_file_policy.load();
        match current_policy {
-            Some(AuxFilePolicy::V1) => {
-                warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
-                self.list_aux_files_v1(lsn, ctx).await
-            }
-            None => {
-                let res = self.list_aux_files_v1(lsn, ctx).await?;
-                if !res.is_empty() {
-                    warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
-                }
-                Ok(res)
-            }
+            Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
            Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
            Some(AuxFilePolicy::CrossValidation) => {
                let v1_result = self.list_aux_files_v1(lsn, ctx).await;
@@ -1031,33 +1022,21 @@ pub struct DatadirModification<'a> {
    // The put-functions add the modifications here, and they are flushed to the
    // underlying key-value store by the 'finish' function.
    pending_lsns: Vec<Lsn>,
-    pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
+    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
    pending_deletions: Vec<(Range<Key>, Lsn)>,
    pending_nblocks: i64,

    /// For special "directory" keys that store key-value maps, track the size of the map
    /// if it was updated in this modification.
    pending_directory_entries: Vec<(DirectoryKind, usize)>,
-
-    /// An **approximation** of how large our EphemeralFile write will be when committed.
-    pending_bytes: usize,
 }

 impl<'a> DatadirModification<'a> {
-    // When a DatadirModification is committed, we do a monolithic serialization of all its contents.  WAL records can
-    // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
-    // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
-    pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
-
    /// Get the current lsn
    pub(crate) fn get_lsn(&self) -> Lsn {
        self.lsn
    }

-    pub(crate) fn approx_pending_bytes(&self) -> usize {
-        self.pending_bytes
-    }
-
    /// Set the current lsn
    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
        ensure!(
@@ -1597,7 +1576,6 @@ impl<'a> DatadirModification<'a> {
                if aux_files_key_v1.is_empty() {
                    None
                } else {
-                    warn!("this timeline is using deprecated aux file policy V1");
                    self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
                    Some(AuxFilePolicy::V1)
                }
@@ -1791,25 +1769,21 @@ impl<'a> DatadirModification<'a> {
        // Flush relation and  SLRU data blocks, keep metadata.
        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
        for (key, values) in self.pending_updates.drain() {
-            let mut write_batch = Vec::new();
-            for (lsn, value_ser_size, value) in values {
+            for (lsn, value) in values {
                if key.is_rel_block_key() || key.is_slru_block_key() {
                    // This bails out on first error without modifying pending_updates.
                    // That's Ok, cf this function's doc comment.
-                    write_batch.push((key.to_compact(), lsn, value_ser_size, value));
+                    writer.put(key, lsn, &value, ctx).await?;
                } else {
-                    retained_pending_updates.entry(key).or_default().push((
-                        lsn,
-                        value_ser_size,
-                        value,
-                    ));
+                    retained_pending_updates
+                        .entry(key)
+                        .or_default()
+                        .push((lsn, value));
                }
            }
-            writer.put_batch(write_batch, ctx).await?;
        }

        self.pending_updates = retained_pending_updates;
-        self.pending_bytes = 0;

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1835,20 +1809,17 @@ impl<'a> DatadirModification<'a> {
        self.pending_nblocks = 0;

        if !self.pending_updates.is_empty() {
-            // Ordering: the items in this batch do not need to be in any global order, but values for
-            // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
-            // this to do efficient updates to its index.
-            let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
-                .pending_updates
-                .drain()
-                .flat_map(|(key, values)| {
-                    values.into_iter().map(move |(lsn, val_ser_size, value)| {
-                        (key.to_compact(), lsn, val_ser_size, value)
-                    })
-                })
-                .collect::<Vec<_>>();
+            // The put_batch call below expects the inputs to be sorted by Lsn,
+            // so we do that first.
+            let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
+                self.pending_updates
+                    .drain()
+                    .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
+                    .kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
+                VecMapOrdering::GreaterOrEqual,
+            );

-            writer.put_batch(batch, ctx).await?;
+            writer.put_batch(lsn_ordered_batch, ctx).await?;
        }

        if !self.pending_deletions.is_empty() {
@@ -1873,8 +1844,6 @@ impl<'a> DatadirModification<'a> {
            writer.update_directory_entries_count(kind, count as u64);
        }

-        self.pending_bytes = 0;
-
        Ok(())
    }

@@ -1891,7 +1860,7 @@ impl<'a> DatadirModification<'a> {
        // Note: we don't check pending_deletions. It is an error to request a
        // value that has been removed, deletion only avoids leaking storage.
        if let Some(values) = self.pending_updates.get(&key) {
-            if let Some((_, _, value)) = values.last() {
+            if let Some((_, value)) = values.last() {
                return if let Value::Image(img) = value {
                    Ok(img.clone())
                } else {
@@ -1919,17 +1888,13 @@ impl<'a> DatadirModification<'a> {
    fn put(&mut self, key: Key, val: Value) {
        let values = self.pending_updates.entry(key).or_default();
        // Replace the previous value if it exists at the same lsn
-        if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
+        if let Some((last_lsn, last_value)) = values.last_mut() {
            if *last_lsn == self.lsn {
-                *last_value_ser_size = val.serialized_size().unwrap() as usize;
                *last_value = val;
                return;
            }
        }
-
-        let val_serialized_size = val.serialized_size().unwrap() as usize;
-        self.pending_bytes += val_serialized_size;
-        values.push((self.lsn, val_serialized_size, val));
+        values.push((self.lsn, val));
    }

    fn delete(&mut self, key_range: Range<Key>) {
@@ -2059,7 +2024,7 @@ mod tests {

        let (tenant, ctx) = harness.load().await;
        let tline = tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let tline = tline.raw_timeline().unwrap();

--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -393,7 +393,7 @@ struct PageServerTask {

    /// Tasks may optionally be launched for a particular tenant/timeline, enabling
    /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
-    tenant_shard_id: TenantShardId,
+    tenant_shard_id: Option<TenantShardId>,
    timeline_id: Option<TimelineId>,

    mutable: Mutex<MutableTaskState>,
@@ -405,7 +405,7 @@ struct PageServerTask {
 pub fn spawn<F>(
    runtime: &tokio::runtime::Handle,
    kind: TaskKind,
-    tenant_shard_id: TenantShardId,
+    tenant_shard_id: Option<TenantShardId>,
    timeline_id: Option<TimelineId>,
    name: &str,
    future: F,
@@ -550,7 +550,7 @@ pub async fn shutdown_tasks(
        let tasks = TASKS.lock().unwrap();
        for task in tasks.values() {
            if (kind.is_none() || Some(task.kind) == kind)
-                && (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id)
+                && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
                && (timeline_id.is_none() || task.timeline_id == timeline_id)
            {
                task.cancel.cancel();
@@ -573,8 +573,13 @@ pub async fn shutdown_tasks(
        };
        if let Some(mut join_handle) = join_handle {
            if log_all {
-                // warn to catch these in tests; there shouldn't be any
-                warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
+                if tenant_shard_id.is_none() {
+                    // there are quite a few of these
+                    info!(name = task.name, kind = ?task_kind, "stopping global task");
+                } else {
+                    // warn to catch these in tests; there shouldn't be any
+                    warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
+                }
            }
            if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
                .await
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -798,7 +798,7 @@ impl Tenant {
        task_mgr::spawn(
            &tokio::runtime::Handle::current(),
            TaskKind::Attach,
-            tenant_shard_id,
+            Some(tenant_shard_id),
            None,
            "attach tenant",
            async move {
@@ -3741,21 +3741,13 @@ impl Tenant {
    /// less than this (via eviction and on-demand downloads), but this function enables
    /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
    /// by keeping important things on local disk.
-    ///
-    /// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
-    /// than they report here, due to layer eviction.  Tenants with many active branches may
-    /// actually use more than they report here.
    pub(crate) fn local_storage_wanted(&self) -> u64 {
+        let mut wanted = 0;
        let timelines = self.timelines.lock().unwrap();
-
-        // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum.  This
-        // reflects the observation that on tenants with multiple large branches, typically only one
-        // of them is used actively enough to occupy space on disk.
-        timelines
-            .values()
-            .map(|t| t.metrics.visible_physical_size_gauge.get())
-            .max()
-            .unwrap_or(0)
+        for timeline in timelines.values() {
+            wanted += timeline.metrics.visible_physical_size_gauge.get();
+        }
+        wanted
    }
 }

@@ -5940,10 +5932,10 @@ mod tests {
            .await
            .unwrap();

-        // the default aux file policy to switch is v2 if not set by the admins
+        // the default aux file policy to switch is v1 if not set by the admins
        assert_eq!(
            harness.tenant_conf.switch_aux_file_policy,
-            AuxFilePolicy::default_tenant_config()
+            AuxFilePolicy::V1
        );
        let (tenant, ctx) = harness.load().await;

@@ -5987,8 +5979,8 @@ mod tests {
        );
        assert_eq!(
            tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V2),
-            "aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
+            Some(AuxFilePolicy::V1),
+            "aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
        );

        // we can read everything from the storage
@@ -6010,8 +6002,8 @@ mod tests {

        assert_eq!(
            tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V2),
-            "keep v2 storage format when new files are written"
+            Some(AuxFilePolicy::V1),
+            "keep v1 storage format when new files are written"
        );

        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
@@ -6027,7 +6019,7 @@ mod tests {

        // child copies the last flag even if that is not on remote storage yet
        assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
-        assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
+        assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));

        let files = child.list_aux_files(lsn, &ctx).await.unwrap();
        assert_eq!(files.get("pg_logical/mappings/test1"), None);
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -21,6 +21,7 @@ pub struct EphemeralFile {
 }

 mod page_caching;
+pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
 mod zero_padded_read_write;

 impl EphemeralFile {
@@ -51,10 +52,12 @@ impl EphemeralFile {
        )
        .await?;

+        let prewarm = conf.l0_flush.prewarm_on_write();
+
        Ok(EphemeralFile {
            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
-            rw: page_caching::RW::new(file, gate_guard),
+            rw: page_caching::RW::new(file, prewarm, gate_guard),
        })
    }

@@ -79,8 +82,6 @@ impl EphemeralFile {
        self.rw.read_blk(blknum, ctx).await
    }

-    #[cfg(test)]
-    // This is a test helper: outside of tests, we are always written to via a pre-serialized batch.
    pub(crate) async fn write_blob(
        &mut self,
        srcbuf: &[u8],
@@ -88,30 +89,17 @@ impl EphemeralFile {
    ) -> Result<u64, io::Error> {
        let pos = self.rw.bytes_written();

-        let mut len_bytes = std::io::Cursor::new(Vec::new());
-        crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length(
-            srcbuf.len(),
-            &mut len_bytes,
-        );
-        let len_bytes = len_bytes.into_inner();
-
        // Write the length field
-        self.rw.write_all_borrowed(&len_bytes, ctx).await?;
+        if srcbuf.len() < 0x80 {
+            // short one-byte length header
+            let len_buf = [srcbuf.len() as u8];

-        // Write the payload
-        self.rw.write_all_borrowed(srcbuf, ctx).await?;
-
-        Ok(pos)
-    }
-
-    /// Returns the offset at which the first byte of the input was written, for use
-    /// in constructing indices over the written value.
-    pub(crate) async fn write_raw(
-        &mut self,
-        srcbuf: &[u8],
-        ctx: &RequestContext,
-    ) -> Result<u64, io::Error> {
-        let pos = self.rw.bytes_written();
+            self.rw.write_all_borrowed(&len_buf, ctx).await?;
+        } else {
+            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
+            len_buf[0] |= 0x80;
+            self.rw.write_all_borrowed(&len_buf, ctx).await?;
+        }

        // Write the payload
        self.rw.write_all_borrowed(srcbuf, ctx).await?;
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -1,15 +1,15 @@
 //! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
 //! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
-//!
-//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>

 use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::BlockLease;
-use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
+use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
 use crate::virtual_file::VirtualFile;

-use std::io::{self};
+use once_cell::sync::Lazy;
+use std::io::{self, ErrorKind};
+use std::ops::{Deref, Range};
 use tokio_epoll_uring::BoundedBuf;
 use tracing::*;

@@ -18,17 +18,33 @@ use super::zero_padded_read_write;
 /// See module-level comment.
 pub struct RW {
    page_cache_file_id: page_cache::FileId,
-    rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
+    rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
    /// Gate guard is held on as long as we need to do operations in the path (delete on drop).
    _gate_guard: utils::sync::gate::GateGuard,
 }

+/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
+/// should we pre-warm the [`crate::page_cache`] with the contents?
+#[derive(Clone, Copy)]
+pub enum PrewarmOnWrite {
+    Yes,
+    No,
+}
+
 impl RW {
-    pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
+    pub fn new(
+        file: VirtualFile,
+        prewarm_on_write: PrewarmOnWrite,
+        _gate_guard: utils::sync::gate::GateGuard,
+    ) -> Self {
        let page_cache_file_id = page_cache::next_file_id();
        Self {
            page_cache_file_id,
-            rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
+            rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
+                page_cache_file_id,
+                file,
+                prewarm_on_write,
+            )),
            _gate_guard,
        }
    }
@@ -68,10 +84,10 @@ impl RW {
        let vec = Vec::with_capacity(size);

        // read from disk what we've already flushed
-        let file_size_tracking_writer = self.rw.as_writer();
-        let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
-        let mut vec = file_size_tracking_writer
-            .as_inner()
+        let writer = self.rw.as_writer();
+        let flushed_range = writer.written_range();
+        let mut vec = writer
+            .file
            .read_exact_at(
                vec.slice(0..(flushed_range.end - flushed_range.start)),
                u64::try_from(flushed_range.start).unwrap(),
@@ -106,7 +122,7 @@ impl RW {
                            format!(
                                "ephemeral file: read immutable page #{}: {}: {:#}",
                                blknum,
-                                self.rw.as_writer().as_inner().path,
+                                self.rw.as_writer().file.path,
                                e,
                            ),
                        )
@@ -116,7 +132,7 @@ impl RW {
                    }
                    page_cache::ReadBufResult::NotFound(write_guard) => {
                        let write_guard = writer
-                            .as_inner()
+                            .file
                            .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
                            .await?;
                        let read_guard = write_guard.mark_valid();
@@ -138,16 +154,137 @@ impl Drop for RW {

        // unlink the file
        // we are clear to do this, because we have entered a gate
-        let path = &self.rw.as_writer().as_inner().path;
-        let res = std::fs::remove_file(path);
+        let res = std::fs::remove_file(&self.rw.as_writer().file.path);
        if let Err(e) = res {
            if e.kind() != std::io::ErrorKind::NotFound {
                // just never log the not found errors, we cannot do anything for them; on detach
                // the tenant directory is already gone.
                //
                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!("could not remove ephemeral file '{path}': {e}");
+                error!(
+                    "could not remove ephemeral file '{}': {}",
+                    self.rw.as_writer().file.path,
+                    e
+                );
            }
        }
    }
 }
+
+struct PreWarmingWriter {
+    prewarm_on_write: PrewarmOnWrite,
+    nwritten_blocks: u32,
+    page_cache_file_id: page_cache::FileId,
+    file: VirtualFile,
+}
+
+impl PreWarmingWriter {
+    fn new(
+        page_cache_file_id: page_cache::FileId,
+        file: VirtualFile,
+        prewarm_on_write: PrewarmOnWrite,
+    ) -> Self {
+        Self {
+            prewarm_on_write,
+            nwritten_blocks: 0,
+            page_cache_file_id,
+            file,
+        }
+    }
+
+    /// Return the byte range within `file` that has been written through `write_all`.
+    ///
+    /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
+    fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
+        let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
+        struct Wrapper(Range<usize>);
+        impl Deref for Wrapper {
+            type Target = Range<usize>;
+            fn deref(&self) -> &Range<usize> {
+                &self.0
+            }
+        }
+        Wrapper(0..nwritten_blocks * PAGE_SZ)
+    }
+}
+
+impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
+    async fn write_all<Buf: tokio_epoll_uring::IoBuf + Send>(
+        &mut self,
+        buf: FullSlice<Buf>,
+        ctx: &RequestContext,
+    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
+        let buflen = buf.len();
+        assert_eq!(
+            buflen % PAGE_SZ,
+            0,
+            "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
+        );
+
+        // Do the IO.
+        let buf = match self.file.write_all(buf, ctx).await {
+            (buf, Ok(nwritten)) => {
+                assert_eq!(nwritten, buflen);
+                buf
+            }
+            (_, Err(e)) => {
+                return Err(std::io::Error::new(
+                    ErrorKind::Other,
+                    // order error before path because path is long and error is short
+                    format!(
+                        "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
+                        self.nwritten_blocks, buflen, e, self.file.path,
+                    ),
+                ));
+            }
+        };
+
+        let nblocks = buflen / PAGE_SZ;
+        let nblocks32 = u32::try_from(nblocks).unwrap();
+
+        if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
+            // Pre-warm page cache with the contents.
+            // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
+            // benefits the code that writes InMemoryLayer=>L0 layers.
+
+            let cache = page_cache::get();
+            static CTX: Lazy<RequestContext> = Lazy::new(|| {
+                RequestContext::new(
+                    crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
+                    crate::context::DownloadBehavior::Error,
+                )
+            });
+            for blknum_in_buffer in 0..nblocks {
+                let blk_in_buffer =
+                    &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
+                let blknum = self
+                    .nwritten_blocks
+                    .checked_add(blknum_in_buffer as u32)
+                    .unwrap();
+                match cache
+                    .read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
+                    .await
+                {
+                    Err(e) => {
+                        error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
+                        // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
+                    }
+                    Ok(v) => match v {
+                        page_cache::ReadBufResult::Found(_guard) => {
+                            // This function takes &mut self, so, it shouldn't be possible to reach this point.
+                            unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
+                                      and this function takes &mut self, so, no concurrent read_blk is possible");
+                        }
+                        page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                            write_guard.copy_from_slice(blk_in_buffer);
+                            let _ = write_guard.mark_valid();
+                        }
+                    },
+                }
+            }
+        }
+
+        self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
+        Ok((buflen, buf))
+    }
+}
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -464,7 +464,7 @@ impl LayerMap {
    pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
        // TODO: See #3869, resulting #4088, attempted fix and repro #4094

-        if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
+        if Self::is_l0(&layer_desc.key_range) {
            self.l0_delta_layers.push(layer_desc.clone().into());
        }

@@ -483,7 +483,7 @@ impl LayerMap {
        self.historic
            .remove(historic_layer_coverage::LayerKey::from(layer_desc));
        let layer_key = layer_desc.key();
-        if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
+        if Self::is_l0(&layer_desc.key_range) {
            let len_before = self.l0_delta_layers.len();
            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
            l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -600,8 +600,8 @@ impl LayerMap {
    }

    /// Check if the key range resembles that of an L0 layer.
-    pub fn is_l0(key_range: &Range<Key>, is_delta_layer: bool) -> bool {
-        is_delta_layer && key_range == &(Key::MIN..Key::MAX)
+    pub fn is_l0(key_range: &Range<Key>) -> bool {
+        key_range == &(Key::MIN..Key::MAX)
    }

    /// This function determines which layers are counted in `count_deltas`:
@@ -628,7 +628,7 @@ impl LayerMap {
    ///      than just the current partition_range.
    pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
        // Case 1
-        if !Self::is_l0(&layer.key_range, layer.is_delta) {
+        if !Self::is_l0(&layer.key_range) {
            return true;
        }

--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -565,7 +565,7 @@ mod tests {
        );
        let expected_bytes = vec![
            /* TimelineMetadataHeader */
-            74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
+            4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
            /* TimelineMetadataBodyV2 */
            0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
            1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
@@ -574,7 +574,7 @@ mod tests {
            0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
            0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
            0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
-            0, 0, 0, 16, // pg_version (4 bytes)
+            0, 0, 0, 15, // pg_version (4 bytes)
            /* padding bytes */
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1728,7 +1728,7 @@ impl RemoteTimelineClient {
            task_mgr::spawn(
                &self.runtime,
                TaskKind::RemoteUploadTask,
-                self.tenant_shard_id,
+                Some(self.tenant_shard_id),
                Some(self.timeline_id),
                "remote upload",
                async move {
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -8,7 +8,6 @@ use std::{sync::Arc, time::SystemTime};
 use crate::{
    context::RequestContext,
    disk_usage_eviction_task::DiskUsageEvictionInfo,
-    metrics::SECONDARY_HEATMAP_TOTAL_SIZE,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
 };

@@ -106,9 +105,6 @@ pub(crate) struct SecondaryTenant {

    // Sum of layer sizes on local disk
    pub(super) resident_size_metric: UIntGauge,
-
-    // Sum of layer sizes in the most recently downloaded heatmap
-    pub(super) heatmap_total_size_metric: UIntGauge,
 }

 impl Drop for SecondaryTenant {
@@ -116,7 +112,6 @@ impl Drop for SecondaryTenant {
        let tenant_id = self.tenant_shard_id.tenant_id.to_string();
        let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
        let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
-        let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
    }
 }

@@ -133,10 +128,6 @@ impl SecondaryTenant {
            .get_metric_with_label_values(&[&tenant_id, &shard_id])
            .unwrap();

-        let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id])
-            .unwrap();
-
        Arc::new(Self {
            tenant_shard_id,
            // todo: shall we make this a descendent of the
@@ -154,7 +145,6 @@ impl SecondaryTenant {
            progress: std::sync::Mutex::default(),

            resident_size_metric,
-            heatmap_total_size_metric,
        })
    }

--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -829,12 +829,6 @@ impl<'a> TenantDownloader<'a> {
            layers_downloaded: 0,
            bytes_downloaded: 0,
        };
-
-        // Also expose heatmap bytes_total as a metric
-        self.secondary_state
-            .heatmap_total_size_metric
-            .set(heatmap_stats.bytes);
-
        // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
        let mut delete_layers = Vec::new();
        let mut delete_timelines = Vec::new();
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -2,7 +2,7 @@

 pub mod delta_layer;
 pub mod image_layer;
-pub mod inmemory_layer;
+pub(crate) mod inmemory_layer;
 pub(crate) mod layer;
 mod layer_desc;
 mod layer_name;
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -232,18 +232,6 @@ pub struct DeltaLayerInner {
    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }

-impl DeltaLayerInner {
-    pub(crate) fn layer_dbg_info(&self) -> String {
-        format!(
-            "delta {}..{} {}..{}",
-            self.key_range().start,
-            self.key_range().end,
-            self.lsn_range().start,
-            self.lsn_range().end
-        )
-    }
-}
-
 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
@@ -1539,10 +1527,6 @@ pub struct DeltaLayerIterator<'a> {
 }

 impl<'a> DeltaLayerIterator<'a> {
-    pub(crate) fn layer_dbg_info(&self) -> String {
-        self.delta_layer.layer_dbg_info()
-    }
-
    /// Retrieve a batch of key-value pairs into the iterator buffer.
    async fn next_batch(&mut self) -> anyhow::Result<()> {
        assert!(self.key_values_batch.is_empty());
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -167,17 +167,6 @@ pub struct ImageLayerInner {
    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }

-impl ImageLayerInner {
-    pub(crate) fn layer_dbg_info(&self) -> String {
-        format!(
-            "image {}..{} {}",
-            self.key_range().start,
-            self.key_range().end,
-            self.lsn()
-        )
-    }
-}
-
 impl std::fmt::Debug for ImageLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("ImageLayerInner")
@@ -1035,10 +1024,6 @@ pub struct ImageLayerIterator<'a> {
 }

 impl<'a> ImageLayerIterator<'a> {
-    pub(crate) fn layer_dbg_info(&self) -> String {
-        self.image_layer.layer_dbg_info()
-    }
-
    /// Retrieve a batch of key-value pairs into the iterator buffer.
    async fn next_batch(&mut self) -> anyhow::Result<()> {
        assert!(self.key_values_batch.is_empty());
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -13,7 +13,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
-use crate::{l0_flush, page_cache};
+use crate::{l0_flush, page_cache, walrecord};
 use anyhow::{anyhow, Result};
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
@@ -33,7 +33,7 @@ use std::fmt::Write;
 use std::ops::Range;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
-use tokio::sync::RwLock;
+use tokio::sync::{RwLock, RwLockWriteGuard};

 use super::{
    DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
@@ -249,7 +249,9 @@ impl InMemoryLayer {
    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completly unused
-    pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
+    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+        let inner = self.inner.read().await;
+
        let end_str = self.end_lsn_or_max();

        println!(
@@ -257,6 +259,39 @@ impl InMemoryLayer {
            self.timeline_id, self.start_lsn, end_str,
        );

+        if !verbose {
+            return Ok(());
+        }
+
+        let cursor = inner.file.block_cursor();
+        let mut buf = Vec::new();
+        for (key, vec_map) in inner.index.iter() {
+            for (lsn, pos) in vec_map.as_slice() {
+                let mut desc = String::new();
+                cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
+                let val = Value::des(&buf);
+                match val {
+                    Ok(Value::Image(img)) => {
+                        write!(&mut desc, " img {} bytes", img.len())?;
+                    }
+                    Ok(Value::WalRecord(rec)) => {
+                        let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
+                        write!(
+                            &mut desc,
+                            " rec {} bytes will_init: {} {}",
+                            buf.len(),
+                            rec.will_init(),
+                            wal_desc
+                        )?;
+                    }
+                    Err(err) => {
+                        write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
+                    }
+                }
+                println!("  key {} at {}: {}", key, lsn, desc);
+            }
+        }
+
        Ok(())
    }

@@ -320,82 +355,6 @@ impl InMemoryLayer {
    }
 }

-/// Offset of a particular Value within a serialized batch.
-struct SerializedBatchOffset {
-    key: CompactKey,
-    lsn: Lsn,
-    /// offset in bytes from the start of the batch's buffer to the Value's serialized size header.
-    offset: u64,
-}
-
-pub struct SerializedBatch {
-    /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
-    pub(crate) raw: Vec<u8>,
-
-    /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
-    offsets: Vec<SerializedBatchOffset>,
-
-    /// The highest LSN of any value in the batch
-    pub(crate) max_lsn: Lsn,
-}
-
-impl SerializedBatch {
-    /// Write a blob length in the internal format of the EphemeralFile
-    pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor<Vec<u8>>) {
-        use std::io::Write;
-
-        if len < 0x80 {
-            // short one-byte length header
-            let len_buf = [len as u8];
-
-            cursor
-                .write_all(&len_buf)
-                .expect("Writing to Vec is infallible");
-        } else {
-            let mut len_buf = u32::to_be_bytes(len as u32);
-            len_buf[0] |= 0x80;
-            cursor
-                .write_all(&len_buf)
-                .expect("Writing to Vec is infallible");
-        }
-    }
-
-    pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self {
-        // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
-        // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
-        let buffer_size = batch.iter().map(|i| i.2).sum::<usize>() + 4 * batch.len();
-        let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
-
-        let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
-        let mut max_lsn: Lsn = Lsn(0);
-        for (key, lsn, val_ser_size, val) in batch {
-            let relative_off = cursor.position();
-
-            Self::write_blob_length(val_ser_size, &mut cursor);
-            val.ser_into(&mut cursor)
-                .expect("Writing into in-memory buffer is infallible");
-
-            offsets.push(SerializedBatchOffset {
-                key,
-                lsn,
-                offset: relative_off,
-            });
-            max_lsn = std::cmp::max(max_lsn, lsn);
-        }
-
-        let buffer = cursor.into_inner();
-
-        // Assert that we didn't do any extra allocations while building buffer.
-        debug_assert!(buffer.len() <= buffer_size);
-
-        Self {
-            raw: buffer,
-            offsets,
-            max_lsn,
-        }
-    }
-}
-
 fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
    write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
 }
@@ -456,20 +415,37 @@ impl InMemoryLayer {
        })
    }

-    // Write path.
-    pub async fn put_batch(
+    // Write operations
+
+    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
+    /// Adds the page version to the in-memory tree
+    pub async fn put_value(
        &self,
-        serialized_batch: SerializedBatch,
+        key: CompactKey,
+        lsn: Lsn,
+        buf: &[u8],
        ctx: &RequestContext,
    ) -> Result<()> {
        let mut inner = self.inner.write().await;
        self.assert_writable();
+        self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
+    }

-        let base_off = {
-            inner
+    async fn put_value_locked(
+        &self,
+        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
+        key: CompactKey,
+        lsn: Lsn,
+        buf: &[u8],
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
+
+        let off = {
+            locked_inner
                .file
-                .write_raw(
-                    &serialized_batch.raw,
+                .write_blob(
+                    buf,
                    &RequestContextBuilder::extend(ctx)
                        .page_content_kind(PageContentKind::InMemoryLayer)
                        .build(),
@@ -477,23 +453,15 @@ impl InMemoryLayer {
                .await?
        };

-        for SerializedBatchOffset {
-            key,
-            lsn,
-            offset: relative_off,
-        } in serialized_batch.offsets
-        {
-            let off = base_off + relative_off;
-            let vec_map = inner.index.entry(key).or_default();
-            let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
-            if old.is_some() {
-                // We already had an entry for this LSN. That's odd..
-                warn!("Key {} at {} already exists", key, lsn);
-            }
+        let vec_map = locked_inner.index.entry(key).or_default();
+        let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
+        if old.is_some() {
+            // We already had an entry for this LSN. That's odd..
+            warn!("Key {} at {} already exists", key, lsn);
        }

-        let size = inner.file.len();
-        inner.resource_units.maybe_publish_size(size);
+        let size = locked_inner.file.len();
+        locked_inner.resource_units.maybe_publish_size(size);

        Ok(())
    }
@@ -568,6 +536,7 @@ impl InMemoryLayer {

        use l0_flush::Inner;
        let _concurrency_permit = match l0_flush_global_state {
+            Inner::PageCached => None,
            Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
        };

@@ -599,6 +568,34 @@ impl InMemoryLayer {
        .await?;

        match l0_flush_global_state {
+            l0_flush::Inner::PageCached => {
+                let ctx = RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::InMemoryLayer)
+                    .build();
+
+                let mut buf = Vec::new();
+
+                let cursor = inner.file.block_cursor();
+
+                for (key, vec_map) in inner.index.iter() {
+                    // Write all page versions
+                    for (lsn, pos) in vec_map.as_slice() {
+                        cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
+                        let will_init = Value::des(&buf)?.will_init();
+                        let (tmp, res) = delta_layer_writer
+                            .put_value_bytes(
+                                Key::from_compact(*key),
+                                *lsn,
+                                buf.slice_len(),
+                                will_init,
+                                &ctx,
+                            )
+                            .await;
+                        res?;
+                        buf = tmp.into_raw_slice().into_inner();
+                    }
+                }
+            }
            l0_flush::Inner::Direct { .. } => {
                let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
                assert_eq!(
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1296,10 +1296,7 @@ impl LayerInner {
                lsn_end: lsn_range.end,
                remote: !resident,
                access_stats,
-                l0: crate::tenant::layer_map::LayerMap::is_l0(
-                    &self.layer_desc().key_range,
-                    self.layer_desc().is_delta,
-                ),
+                l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range),
            }
        } else {
            let lsn = self.desc.image_layer_lsn();
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -256,10 +256,6 @@ impl LayerName {
            LayerName::Delta(layer) => &layer.key_range,
        }
    }
-
-    pub fn is_delta(&self) -> bool {
-        matches!(self, LayerName::Delta(_))
-    }
 }

 impl fmt::Display for LayerName {
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -3,7 +3,6 @@ use std::{
    collections::{binary_heap, BinaryHeap},
 };

-use anyhow::bail;
 use pageserver_api::key::Key;
 use utils::lsn::Lsn;

@@ -27,13 +26,6 @@ impl<'a> LayerRef<'a> {
            Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
        }
    }
-
-    fn layer_dbg_info(&self) -> String {
-        match self {
-            Self::Image(x) => x.layer_dbg_info(),
-            Self::Delta(x) => x.layer_dbg_info(),
-        }
-    }
 }

 enum LayerIterRef<'a> {
@@ -48,13 +40,6 @@ impl LayerIterRef<'_> {
            Self::Image(x) => x.next().await,
        }
    }
-
-    fn layer_dbg_info(&self) -> String {
-        match self {
-            Self::Image(x) => x.layer_dbg_info(),
-            Self::Delta(x) => x.layer_dbg_info(),
-        }
-    }
 }

 /// This type plays several roles at once
@@ -90,11 +75,6 @@ impl<'a> PeekableLayerIterRef<'a> {
    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        let result = self.peeked.take();
        self.peeked = self.iter.next().await?;
-        if let (Some((k1, l1, _)), Some((k2, l2, _))) = (&self.peeked, &result) {
-            if (k1, l1) < (k2, l2) {
-                bail!("iterator is not ordered: {}", self.iter.layer_dbg_info());
-            }
-        }
        Ok(result)
    }
 }
@@ -198,12 +178,7 @@ impl<'a> IteratorWrapper<'a> {
        let iter = PeekableLayerIterRef::create(iter).await?;
        if let Some((k1, l1, _)) = iter.peek() {
            let (k2, l2) = first_key_lower_bound;
-            if (k1, l1) < (k2, l2) {
-                bail!(
-                    "layer key range did not include the first key in the layer: {}",
-                    layer.layer_dbg_info()
-                );
-            }
+            debug_assert!((k1, l1) >= (k2, l2));
        }
        *self = Self::Loaded { iter };
        Ok(())
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -61,12 +61,21 @@ impl BackgroundLoopKind {
    }
 }

+static PERMIT_GAUGES: once_cell::sync::Lazy<
+    enum_map::EnumMap<BackgroundLoopKind, metrics::IntCounterPair>,
+> = once_cell::sync::Lazy::new(|| {
+    enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+        let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()])
+    }))
+});
+
 /// Cancellation safe.
 pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
    loop_kind: BackgroundLoopKind,
    _ctx: &RequestContext,
 ) -> tokio::sync::SemaphorePermit<'static> {
-    let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);
+    let _guard = PERMIT_GAUGES[loop_kind].guard();

    pausable_failpoint!(
        "initial-size-calculation-permit-pause",
@@ -89,7 +98,7 @@ pub fn start_background_loops(
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::Compaction,
-        tenant_shard_id,
+        Some(tenant_shard_id),
        None,
        &format!("compactor for tenant {tenant_shard_id}"),
        {
@@ -112,7 +121,7 @@ pub fn start_background_loops(
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::GarbageCollector,
-        tenant_shard_id,
+        Some(tenant_shard_id),
        None,
        &format!("garbage collector for tenant {tenant_shard_id}"),
        {
@@ -135,7 +144,7 @@ pub fn start_background_loops(
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::IngestHousekeeping,
-        tenant_shard_id,
+        Some(tenant_shard_id),
        None,
        &format!("ingest housekeeping for tenant {tenant_shard_id}"),
        {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -22,8 +22,8 @@ use handle::ShardTimelineId;
 use once_cell::sync::Lazy;
 use pageserver_api::{
    key::{
-        CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
-        NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
+        KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
+        NON_INHERITED_SPARSE_RANGE,
    },
    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
    models::{
@@ -44,8 +44,10 @@ use tokio::{
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{
+    bin_ser::BeSer,
    fs_ext, pausable_failpoint,
    sync::gate::{Gate, GateGuard},
+    vec_map::VecMap,
 };

 use std::pin::pin;
@@ -135,10 +137,7 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

-use super::{
-    config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint,
-    upload_queue::NotInitialized,
-};
+use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{
@@ -2234,11 +2233,6 @@ impl Timeline {

                handles: Default::default(),
            };
-
-            if aux_file_policy == Some(AuxFilePolicy::V1) {
-                warn!("this timeline is using deprecated aux file policy V1");
-            }
-
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;

@@ -2287,7 +2281,7 @@ impl Timeline {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::LayerFlushTask,
-            self.tenant_shard_id,
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "layer flush task",
            async move {
@@ -2641,7 +2635,7 @@ impl Timeline {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::InitialLogicalSizeCalculation,
-            self.tenant_shard_id,
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "initial size calculation",
            // NB: don't log errors here, task_mgr will do that.
@@ -2809,7 +2803,7 @@ impl Timeline {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::OndemandLogicalSizeCalculation,
-            self.tenant_shard_id,
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "ondemand logical size calculation",
            async move {
@@ -3002,10 +2996,7 @@ impl Timeline {
        // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
        //   the layer is likely to be covered by an image layer during compaction.
        layers.sort_by_key(|(desc, _meta, _atime)| {
-            std::cmp::Reverse((
-                !LayerMap::is_l0(&desc.key_range, desc.is_delta),
-                desc.lsn_range.end,
-            ))
+            std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
        });

        let layers = layers
@@ -3598,6 +3589,34 @@ impl Timeline {
                return Err(FlushLayerError::Cancelled);
            }

+            // FIXME(auxfilesv2): supporting multiple metadata key partitions might need initdb support as well?
+            // This code path will not be hit during regression tests. After #7099 we have a single partition
+            // with two key ranges. If someone wants to fix initdb optimization in the future, this might need
+            // to be fixed.
+
+            // For metadata, always create delta layers.
+            let delta_layer = if !metadata_partition.parts.is_empty() {
+                assert_eq!(
+                    metadata_partition.parts.len(),
+                    1,
+                    "currently sparse keyspace should only contain a single metadata keyspace"
+                );
+                let metadata_keyspace = &metadata_partition.parts[0];
+                self.create_delta_layer(
+                    &frozen_layer,
+                    Some(
+                        metadata_keyspace.0.ranges.first().unwrap().start
+                            ..metadata_keyspace.0.ranges.last().unwrap().end,
+                    ),
+                    ctx,
+                )
+                .await
+                .map_err(|e| FlushLayerError::from_anyhow(self, e))?
+            } else {
+                None
+            };
+
+            // For image layers, we add them immediately into the layer map.
            let mut layers_to_upload = Vec::new();
            layers_to_upload.extend(
                self.create_image_layers(
@@ -3608,27 +3627,13 @@ impl Timeline {
                )
                .await?,
            );
-            if !metadata_partition.parts.is_empty() {
-                assert_eq!(
-                    metadata_partition.parts.len(),
-                    1,
-                    "currently sparse keyspace should only contain a single metadata keyspace"
-                );
-                layers_to_upload.extend(
-                    self.create_image_layers(
-                        // Safety: create_image_layers treat sparse keyspaces differently that it does not scan
-                        // every single key within the keyspace, and therefore, it's safe to force converting it
-                        // into a dense keyspace before calling this function.
-                        &metadata_partition.into_dense(),
-                        self.initdb_lsn,
-                        ImageLayerCreationMode::Initial,
-                        ctx,
-                    )
-                    .await?,
-                );
-            }

-            (layers_to_upload, None)
+            if let Some(delta_layer) = delta_layer {
+                layers_to_upload.push(delta_layer.clone());
+                (layers_to_upload, Some(delta_layer))
+            } else {
+                (layers_to_upload, None)
+            }
        } else {
            // Normal case, write out a L0 delta layer file.
            // `create_delta_layer` will not modify the layer map.
@@ -4038,6 +4043,8 @@ impl Timeline {
        mode: ImageLayerCreationMode,
        start: Key,
    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
+        assert!(!matches!(mode, ImageLayerCreationMode::Initial));
+
        // Metadata keys image layer creation.
        let mut reconstruct_state = ValuesReconstructState::default();
        let data = self
@@ -4203,13 +4210,15 @@ impl Timeline {
                        "metadata keys must be partitioned separately"
                    );
                }
+                if mode == ImageLayerCreationMode::Initial {
+                    return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
+                }
                if mode == ImageLayerCreationMode::Try && !check_for_image_layers {
                    // Skip compaction if there are not enough updates. Metadata compaction will do a scan and
                    // might mess up with evictions.
                    start = img_range.end;
                    continue;
                }
-                // For initial and force modes, we always generate image layers for metadata keys.
            } else if let ImageLayerCreationMode::Try = mode {
                // check_for_image_layers = false -> skip
                // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
@@ -4217,8 +4226,7 @@ impl Timeline {
                    start = img_range.end;
                    continue;
                }
-            }
-            if let ImageLayerCreationMode::Force = mode {
+            } else if let ImageLayerCreationMode::Force = mode {
                // When forced to create image layers, we might try and create them where they already
                // exist.  This mode is only used in tests/debug.
                let layers = self.layers.read().await;
@@ -4232,7 +4240,6 @@ impl Timeline {
                        img_range.start,
                        img_range.end
                    );
-                    start = img_range.end;
                    continue;
                }
            }
@@ -4588,7 +4595,7 @@ impl Timeline {
                // for compact_level0_phase1 creating an L0, which does not happen in practice
                // because we have not implemented L0 => L0 compaction.
                duplicated_layers.insert(l.layer_desc().key());
-            } else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) {
+            } else if LayerMap::is_l0(&l.layer_desc().key_range) {
                return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
            } else {
                insert_layers.push(l.clone());
@@ -5155,7 +5162,7 @@ impl Timeline {
        let task_id = task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::DownloadAllRemoteLayers,
-            self.tenant_shard_id,
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            "download all remote layers task",
            async move {
@@ -5583,6 +5590,44 @@ enum OpenLayerAction {
 }

 impl<'a> TimelineWriter<'a> {
+    /// Put a new page version that can be constructed from a WAL record
+    ///
+    /// This will implicitly extend the relation, if the page is beyond the
+    /// current end-of-file.
+    pub(crate) async fn put(
+        &mut self,
+        key: Key,
+        lsn: Lsn,
+        value: &Value,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        // Avoid doing allocations for "small" values.
+        // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
+        // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
+        let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
+        value.ser_into(&mut buf)?;
+        let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
+
+        let action = self.get_open_layer_action(lsn, buf_size);
+        let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
+        let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await;
+
+        if res.is_ok() {
+            // Update the current size only when the entire write was ok.
+            // In case of failures, we may have had partial writes which
+            // render the size tracking out of sync. That's ok because
+            // the checkpoint distance should be significantly smaller
+            // than the S3 single shot upload limit of 5GiB.
+            let state = self.write_guard.as_mut().unwrap();
+
+            state.current_size += buf_size;
+            state.prev_lsn = Some(lsn);
+            state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
+        }
+
+        res
+    }
+
    async fn handle_open_layer_action(
        &mut self,
        at: Lsn,
@@ -5688,58 +5733,18 @@ impl<'a> TimelineWriter<'a> {
    }

    /// Put a batch of keys at the specified Lsns.
+    ///
+    /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]).
    pub(crate) async fn put_batch(
        &mut self,
-        batch: Vec<(CompactKey, Lsn, usize, Value)>,
+        batch: VecMap<Lsn, (Key, Value)>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        if batch.is_empty() {
-            return Ok(());
+        for (lsn, (key, val)) in batch {
+            self.put(key, lsn, &val, ctx).await?
        }

-        let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch);
-        let batch_max_lsn = serialized_batch.max_lsn;
-        let buf_size: u64 = serialized_batch.raw.len() as u64;
-
-        let action = self.get_open_layer_action(batch_max_lsn, buf_size);
-        let layer = self
-            .handle_open_layer_action(batch_max_lsn, action, ctx)
-            .await?;
-
-        let res = layer.put_batch(serialized_batch, ctx).await;
-
-        if res.is_ok() {
-            // Update the current size only when the entire write was ok.
-            // In case of failures, we may have had partial writes which
-            // render the size tracking out of sync. That's ok because
-            // the checkpoint distance should be significantly smaller
-            // than the S3 single shot upload limit of 5GiB.
-            let state = self.write_guard.as_mut().unwrap();
-
-            state.current_size += buf_size;
-            state.prev_lsn = Some(batch_max_lsn);
-            state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn));
-        }
-
-        res
-    }
-
-    #[cfg(test)]
-    /// Test helper, for tests that would like to poke individual values without composing a batch
-    pub(crate) async fn put(
-        &mut self,
-        key: Key,
-        lsn: Lsn,
-        value: &Value,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        use utils::bin_ser::BeSer;
-        let val_ser_size = value.serialized_size().unwrap() as usize;
-        self.put_batch(
-            vec![(key.to_compact(), lsn, val_ser_size, value.clone())],
-            ctx,
-        )
-        .await
+        Ok(())
    }

    pub(crate) async fn delete_batch(
@@ -5880,7 +5885,7 @@ mod tests {
            };

            // Apart from L0s, newest Layers should come first
-            if !LayerMap::is_l0(layer.name.key_range(), layer.name.is_delta()) {
+            if !LayerMap::is_l0(layer.name.key_range()) {
                assert!(layer_lsn <= last_lsn);
                last_lsn = layer_lsn;
            }
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -395,7 +395,7 @@ impl DeleteTimelineFlow {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
-            tenant_shard_id,
+            Some(tenant_shard_id),
            Some(timeline_id),
            "timeline_delete",
            async move {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -60,7 +60,7 @@ impl Timeline {
        task_mgr::spawn(
            BACKGROUND_RUNTIME.handle(),
            TaskKind::Eviction,
-            self.tenant_shard_id,
+            Some(self.tenant_shard_id),
            Some(self.timeline_id),
            &format!(
                "layer eviction for {}/{}",
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -27,8 +27,8 @@ use super::TaskStateUpdate;
 use crate::{
    context::RequestContext,
    metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
-    pgdatadir_mapping::DatadirModification,
-    task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
+    task_mgr::TaskKind,
+    task_mgr::WALRECEIVER_RUNTIME,
    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
    walingest::WalIngest,
    walrecord::DecodedWALRecord,
@@ -345,10 +345,7 @@ pub(super) async fn handle_walreceiver_connection(
                        // Commit every ingest_batch_size records. Even if we filtered out
                        // all records, we still need to call commit to advance the LSN.
                        uncommitted_records += 1;
-                        if uncommitted_records >= ingest_batch_size
-                            || modification.approx_pending_bytes()
-                                > DatadirModification::MAX_PENDING_BYTES
-                        {
+                        if uncommitted_records >= ingest_batch_size {
                            WAL_INGEST
                                .records_committed
                                .inc_by(uncommitted_records - filtered_records);
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -9,7 +9,7 @@ use utils::serde_percent::Percent;

 use pageserver_api::models::PageserverUtilization;

-use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager};
+use crate::{config::PageServerConf, tenant::mgr::TenantManager};

 pub(crate) fn regenerate(
    conf: &PageServerConf,
@@ -58,13 +58,13 @@ pub(crate) fn regenerate(
        disk_usable_pct,
        shard_count,
        max_shard_count: MAX_SHARDS,
-        utilization_score: None,
+        utilization_score: 0,
        captured_at: utils::serde_system_time::SystemTime(captured_at),
    };

-    // Initialize `PageserverUtilization::utilization_score`
-    let score = doc.cached_score();
-    NODE_UTILIZATION_SCORE.set(score);
+    doc.refresh_score();
+
+    // TODO: make utilization_score into a metric

    Ok(doc)
 }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -756,23 +756,11 @@ impl VirtualFile {
        })
    }

-    /// The function aborts the process if the error is fatal.
    async fn write_at<B: IoBuf + Send>(
        &self,
        buf: FullSlice<B>,
        offset: u64,
        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
-    ) -> (FullSlice<B>, Result<usize, Error>) {
-        let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
-        let result = result.maybe_fatal_err("write_at");
-        (slice, result)
-    }
-
-    async fn write_at_inner<B: IoBuf + Send>(
-        &self,
-        buf: FullSlice<B>,
-        offset: u64,
-        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
    ) -> (FullSlice<B>, Result<usize, Error>) {
        let file_guard = match self.lock_file().await {
            Ok(file_guard) => file_guard,
--- a/pgxn/neon/relsize_cache.c
+++ b/pgxn/neon/relsize_cache.c
@@ -110,8 +110,7 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)

 		tag.rinfo = rinfo;
 		tag.forknum = forknum;
-		/* We need exclusive lock here because of LRU list manipulation */
-		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
+		LWLockAcquire(relsize_lock, LW_SHARED);
 		entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
 		if (entry != NULL)
 		{
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1038,9 +1038,12 @@ DetermineEpochStartLsn(WalProposer *wp)
 		if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
 		{
 			/*
-			 * However, allow to proceed if previously elected leader was me;
-			 * plain restart of walproposer not intervened by concurrent
-			 * compute (who could generate WAL) is ok.
+			 * However, allow proceeding if the last_log_term on the node which
+			 * gave the highest vote (i.e. the point where we are going to start
+			 * writing) was actually won by me; a plain restart of walproposer
+			 * with no concurrent compute having written WAL in between is ok.
+			 *
+			 * This avoids a compute crash after a manual term_bump.
 			 */
 			if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
 											pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm))))
@@ -1442,12 +1445,17 @@ RecvAppendResponses(Safekeeper *sk)
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/*
-			 * Another compute with higher term is running. Panic to restart
-			 * PG as we likely need to retake basebackup. However, don't dump
-			 * core as this is kinda expected scenario.
+			 * The term has changed to a higher one; probably another
+			 * compute is running. If that is the case we could PANIC as
+			 * well, because it has likely inserted some data and our
+			 * basebackup is no longer suitable. However, we also bump the
+			 * term manually (term_bump endpoint) on safekeepers for
+			 * migration purposes, and in that case we do want the compute
+			 * to stay alive. So restart walproposer with FATAL instead of
+			 * panicking; if the basebackup has been spoiled, the next
+			 * election will notice it.
 			 */
-			disable_core_dump();
-			wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
+			wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
 				   sk->host, sk->port,
 				   sk->appendResponse.term, wp->propTerm);
 		}
--- a/pre-commit.py
+++ b/pre-commit.py
@@ -2,7 +2,6 @@

 import argparse
 import enum
-import os
 import subprocess
 import sys
 from typing import List
@@ -94,7 +93,7 @@ if __name__ == "__main__":
        "--no-color",
        action="store_true",
        help="disable colored output",
-        default=not sys.stdout.isatty() or os.getenv("TERM") == "dumb",
+        default=not sys.stdout.isatty(),
    )
    args = parser.parse_args()

--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -1,21 +1,15 @@
-use std::{
-    future::Future,
-    sync::Arc,
-    time::{Duration, SystemTime},
-};
+use std::{future::Future, sync::Arc, time::Duration};

 use anyhow::{bail, ensure, Context};
 use arc_swap::ArcSwapOption;
 use dashmap::DashMap;
 use jose_jwk::crypto::KeyInfo;
-use serde::{Deserialize, Deserializer};
 use signature::Verifier;
 use tokio::time::Instant;

-use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName};
+use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt};

 // TODO(conrad): make these configurable.
-const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
 const MIN_RENEW: Duration = Duration::from_secs(30);
 const AUTO_RENEW: Duration = Duration::from_secs(300);
 const MAX_RENEW: Duration = Duration::from_secs(3600);
@@ -23,56 +17,30 @@ const MAX_JWK_BODY_SIZE: usize = 64 * 1024;

 /// How to get the JWT auth rules
 pub trait FetchAuthRules: Clone + Send + Sync + 'static {
-    fn fetch_auth_rules(
-        &self,
-        role_name: RoleName,
-    ) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
+    fn fetch_auth_rules(&self) -> impl Future<Output = anyhow::Result<AuthRules>> + Send;
 }

-pub struct AuthRule {
-    pub id: String,
-    pub jwks_url: url::Url,
-    pub audience: Option<String>,
+#[derive(Clone)]
+struct FetchAuthRulesFromCplane {
+    #[allow(dead_code)]
+    endpoint: EndpointIdInt,
+}
+
+impl FetchAuthRules for FetchAuthRulesFromCplane {
+    async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
+        Err(anyhow::anyhow!("not yet implemented"))
+    }
+}
+
+pub struct AuthRules {
+    jwks_urls: Vec<url::Url>,
 }

 #[derive(Default)]
 pub struct JwkCache {
    client: reqwest::Client,

-    map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
-}
-
-pub struct JwkCacheEntry {
-    /// Should refetch at least every hour to verify when old keys have been removed.
-    /// Should refetch when new key IDs are seen only every 5 minutes or so
-    last_retrieved: Instant,
-
-    /// cplane will return multiple JWKs urls that we need to scrape.
-    key_sets: ahash::HashMap<String, KeySet>,
-}
-
-impl JwkCacheEntry {
-    fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
-        self.key_sets.values().find_map(|key_set| {
-            key_set
-                .find_key(key_id)
-                .map(|jwk| (jwk, key_set.audience.as_deref()))
-        })
-    }
-}
-
-struct KeySet {
-    jwks: jose_jwk::JwkSet,
-    audience: Option<String>,
-}
-
-impl KeySet {
-    fn find_key(&self, key_id: &str) -> Option<&jose_jwk::Jwk> {
-        self.jwks
-            .keys
-            .iter()
-            .find(|jwk| jwk.prm.kid.as_deref() == Some(key_id))
-    }
+    map: DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
 }

 pub struct JwkCacheEntryLock {
@@ -89,6 +57,15 @@ impl Default for JwkCacheEntryLock {
    }
 }

+pub struct JwkCacheEntry {
+    /// Should refetch at least every hour to verify when old keys have been removed.
+    /// Should refetch when new key IDs are seen only every 5 minutes or so
+    last_retrieved: Instant,
+
+    /// cplane will return multiple JWKs urls that we need to scrape.
+    key_sets: ahash::HashMap<url::Url, jose_jwk::JwkSet>,
+}
+
 impl JwkCacheEntryLock {
    async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
        JwkRenewalPermit::acquire_permit(self).await
@@ -102,7 +79,6 @@ impl JwkCacheEntryLock {
        &self,
        _permit: JwkRenewalPermit<'_>,
        client: &reqwest::Client,
-        role_name: RoleName,
        auth_rules: &F,
    ) -> anyhow::Result<Arc<JwkCacheEntry>> {
        // double check that no one beat us to updating the cache.
@@ -115,19 +91,20 @@ impl JwkCacheEntryLock {
            }
        }

-        let rules = auth_rules.fetch_auth_rules(role_name).await?;
-        let mut key_sets =
-            ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
+        let rules = auth_rules.fetch_auth_rules().await?;
+        let mut key_sets = ahash::HashMap::with_capacity_and_hasher(
+            rules.jwks_urls.len(),
+            ahash::RandomState::new(),
+        );
        // TODO(conrad): run concurrently
        // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
-        for rule in rules {
-            let req = client.get(rule.jwks_url.clone());
+        for url in rules.jwks_urls {
+            let req = client.get(url.clone());
            // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
-            // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
            match req.send().await.and_then(|r| r.error_for_status()) {
                // todo: should we re-insert JWKs if we want to keep this JWKs URL?
                // I expect these failures would be quite sparse.
-                Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"),
+                Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"),
                Ok(r) => {
                    let resp: http::Response<reqwest::Body> = r.into();
                    match parse_json_body_with_limit::<jose_jwk::JwkSet>(
@@ -136,17 +113,9 @@ impl JwkCacheEntryLock {
                    )
                    .await
                    {
-                        Err(e) => {
-                            tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
-                        }
+                        Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"),
                        Ok(jwks) => {
-                            key_sets.insert(
-                                rule.id,
-                                KeySet {
-                                    jwks,
-                                    audience: rule.audience,
-                                },
-                            );
+                            key_sets.insert(url, jwks);
                        }
                    }
                }
@@ -164,9 +133,7 @@ impl JwkCacheEntryLock {

    async fn get_or_update_jwk_cache<F: FetchAuthRules>(
        self: &Arc<Self>,
-        ctx: &RequestMonitoring,
        client: &reqwest::Client,
-        role_name: RoleName,
        fetch: &F,
    ) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
        let now = Instant::now();
@@ -174,20 +141,18 @@ impl JwkCacheEntryLock {

        // if we have no cached JWKs, try and get some
        let Some(cached) = guard else {
-            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
            let permit = self.acquire_permit().await;
-            return self.renew_jwks(permit, client, role_name, fetch).await;
+            return self.renew_jwks(permit, client, fetch).await;
        };

        let last_update = now.duration_since(cached.last_retrieved);

        // check if the cached JWKs need updating.
        if last_update > MAX_RENEW {
-            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
            let permit = self.acquire_permit().await;

            // it's been too long since we checked the keys. wait for them to update.
-            return self.renew_jwks(permit, client, role_name, fetch).await;
+            return self.renew_jwks(permit, client, fetch).await;
        }

        // every 5 minutes we should spawn a job to eagerly update the token.
@@ -199,7 +164,7 @@ impl JwkCacheEntryLock {
                let client = client.clone();
                let fetch = fetch.clone();
                tokio::spawn(async move {
-                    if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await {
+                    if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await {
                        tracing::warn!(error=?e, "could not fetch JWKs in background job");
                    }
                });
@@ -213,10 +178,8 @@ impl JwkCacheEntryLock {

    async fn check_jwt<F: FetchAuthRules>(
        self: &Arc<Self>,
-        ctx: &RequestMonitoring,
-        jwt: &str,
+        jwt: String,
        client: &reqwest::Client,
-        role_name: RoleName,
        fetch: &F,
    ) -> Result<(), anyhow::Error> {
        // JWT compact form is defined to be
@@ -226,36 +189,36 @@ impl JwkCacheEntryLock {
        let (header_payload, signature) = jwt
            .rsplit_once(".")
            .context("Provided authentication token is not a valid JWT encoding")?;
-        let (header, payload) = header_payload
+        let (header, _payload) = header_payload
            .split_once(".")
            .context("Provided authentication token is not a valid JWT encoding")?;

        let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
            .context("Provided authentication token is not a valid JWT encoding")?;
-        let header = serde_json::from_slice::<JwtHeader<'_>>(&header)
+        let header = serde_json::from_slice::<JWTHeader<'_>>(&header)
            .context("Provided authentication token is not a valid JWT encoding")?;

        let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
            .context("Provided authentication token is not a valid JWT encoding")?;

        ensure!(header.typ == "JWT");
-        let kid = header.key_id.context("missing key id")?;
+        let kid = header.kid.context("missing key id")?;

-        let mut guard = self
-            .get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch)
-            .await?;
+        let mut guard = self.get_or_update_jwk_cache(client, fetch).await?;

        // get the key from the JWKs if possible. If not, wait for the keys to update.
-        let (jwk, expected_audience) = loop {
-            match guard.find_jwk_and_audience(kid) {
+        let jwk = loop {
+            let jwk = guard
+                .key_sets
+                .values()
+                .flat_map(|jwks| &jwks.keys)
+                .find(|jwk| jwk.prm.kid.as_deref() == Some(kid));
+
+            match jwk {
                Some(jwk) => break jwk,
                None if guard.last_retrieved.elapsed() > MIN_RENEW => {
-                    let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
-
                    let permit = self.acquire_permit().await;
-                    guard = self
-                        .renew_jwks(permit, client, role_name.clone(), fetch)
-                        .await?;
+                    guard = self.renew_jwks(permit, client, fetch).await?;
                }
                _ => {
                    bail!("jwk not found");
@@ -264,7 +227,7 @@ impl JwkCacheEntryLock {
        };

        ensure!(
-            jwk.is_supported(&header.algorithm),
+            jwk.is_supported(&header.alg),
            "signature algorithm not supported"
        );

@@ -278,60 +241,31 @@ impl JwkCacheEntryLock {
            key => bail!("unsupported key type {key:?}"),
        };

-        let payload = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)
-            .context("Provided authentication token is not a valid JWT encoding")?;
-        let payload = serde_json::from_slice::<JwtPayload<'_>>(&payload)
-            .context("Provided authentication token is not a valid JWT encoding")?;
-
-        tracing::debug!(?payload, "JWT signature valid with claims");
-
-        match (expected_audience, payload.audience) {
-            // check the audience matches
-            (Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"),
-            // the audience is expected but is missing
-            (Some(_), None) => bail!("invalid JWT token audience"),
-            // we don't care for the audience field
-            (None, _) => {}
-        }
-
-        let now = SystemTime::now();
-
-        if let Some(exp) = payload.expiration {
-            ensure!(now < exp + CLOCK_SKEW_LEEWAY);
-        }
-
-        if let Some(nbf) = payload.not_before {
-            ensure!(nbf < now + CLOCK_SKEW_LEEWAY);
-        }
+        // TODO(conrad): verify iss, exp, nbf, etc...

        Ok(())
    }
 }

 impl JwkCache {
-    pub async fn check_jwt<F: FetchAuthRules>(
+    pub async fn check_jwt(
        &self,
-        ctx: &RequestMonitoring,
-        endpoint: EndpointId,
-        role_name: RoleName,
-        fetch: &F,
-        jwt: &str,
+        endpoint: EndpointIdInt,
+        jwt: String,
    ) -> Result<(), anyhow::Error> {
        // try with just a read lock first
-        let key = (endpoint, role_name.clone());
-        let entry = self.map.get(&key).as_deref().map(Arc::clone);
+        let entry = self.map.get(&endpoint).as_deref().map(Arc::clone);
        let entry = match entry {
            Some(entry) => entry,
            None => {
                // acquire a write lock after to insert.
-                let entry = self.map.entry(key).or_default();
+                let entry = self.map.entry(endpoint).or_default();
                Arc::clone(&*entry)
            }
        };

-        entry
-            .check_jwt(ctx, jwt, &self.client, role_name, fetch)
-            .await
+        let fetch = FetchAuthRulesFromCplane { endpoint };
+        entry.check_jwt(jwt, &self.client, &fetch).await
    }
 }

@@ -381,49 +315,13 @@ fn verify_rsa_signature(

 /// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
 #[derive(serde::Deserialize, serde::Serialize)]
-struct JwtHeader<'a> {
+struct JWTHeader<'a> {
    /// must be "JWT"
-    #[serde(rename = "typ")]
    typ: &'a str,
    /// must be a supported alg
-    #[serde(rename = "alg")]
-    algorithm: jose_jwa::Algorithm,
+    alg: jose_jwa::Algorithm,
    /// key id, must be provided for our usecase
-    #[serde(rename = "kid")]
-    key_id: Option<&'a str>,
-}
-
-/// <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1>
-#[derive(serde::Deserialize, serde::Serialize, Debug)]
-struct JwtPayload<'a> {
-    /// Audience - Recipient for which the JWT is intended
-    #[serde(rename = "aud")]
-    audience: Option<&'a str>,
-    /// Expiration - Time after which the JWT expires
-    #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)]
-    expiration: Option<SystemTime>,
-    /// Not before - Time after which the JWT expires
-    #[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)]
-    not_before: Option<SystemTime>,
-
-    // the following entries are only extracted for the sake of debug logging.
-    /// Issuer of the JWT
-    #[serde(rename = "iss")]
-    issuer: Option<&'a str>,
-    /// Subject of the JWT (the user)
-    #[serde(rename = "sub")]
-    subject: Option<&'a str>,
-    /// Unique token identifier
-    #[serde(rename = "jti")]
-    jwt_id: Option<&'a str>,
-    /// Unique session identifier
-    #[serde(rename = "sid")]
-    session_id: Option<&'a str>,
-}
-
-fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result<Option<SystemTime>, D::Error> {
-    let d = <Option<u64>>::deserialize(d)?;
-    Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n)))
+    kid: Option<&'a str>,
 }

 struct JwkRenewalPermit<'a> {
@@ -490,8 +388,6 @@ impl Drop for JwkRenewalPermit<'_> {

 #[cfg(test)]
 mod tests {
-    use crate::RoleName;
-
    use super::*;

    use std::{future::IntoFuture, net::SocketAddr, time::SystemTime};
@@ -535,10 +431,10 @@ mod tests {
    }

    fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
-        let header = JwtHeader {
+        let header = JWTHeader {
            typ: "JWT",
-            algorithm: jose_jwa::Algorithm::Signing(sig),
-            key_id: Some(&kid),
+            alg: jose_jwa::Algorithm::Signing(sig),
+            kid: Some(&kid),
        };
        let body = typed_json::json! {{
            "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
@@ -628,40 +524,33 @@ mod tests {
        struct Fetch(SocketAddr);

        impl FetchAuthRules for Fetch {
-            async fn fetch_auth_rules(
-                &self,
-                _role_name: RoleName,
-            ) -> anyhow::Result<Vec<AuthRule>> {
-                Ok(vec![
-                    AuthRule {
-                        id: "foo".to_owned(),
-                        jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
-                        audience: None,
-                    },
-                    AuthRule {
-                        id: "bar".to_owned(),
-                        jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
-                        audience: None,
-                    },
-                ])
+            async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
+                Ok(AuthRules {
+                    jwks_urls: vec![
+                        format!("http://{}/foo", self.0).parse().unwrap(),
+                        format!("http://{}/bar", self.0).parse().unwrap(),
+                    ],
+                })
            }
        }

-        let role_name = RoleName::from("user");
-
        let jwk_cache = Arc::new(JwkCacheEntryLock::default());

-        for token in [jwt1, jwt2, jwt3, jwt4] {
-            jwk_cache
-                .check_jwt(
-                    &RequestMonitoring::test(),
-                    &token,
-                    &client,
-                    role_name.clone(),
-                    &Fetch(addr),
-                )
-                .await
-                .unwrap();
-        }
+        jwk_cache
+            .check_jwt(jwt1, &client, &Fetch(addr))
+            .await
+            .unwrap();
+        jwk_cache
+            .check_jwt(jwt2, &client, &Fetch(addr))
+            .await
+            .unwrap();
+        jwk_cache
+            .check_jwt(jwt3, &client, &Fetch(addr))
+            .await
+            .unwrap();
+        jwk_cache
+            .check_jwt(jwt4, &client, &Fetch(addr))
+            .await
+            .unwrap();
    }
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -173,6 +173,9 @@ struct ProxyCliArgs {
    /// cache for `role_secret` (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    role_secret_cache: String,
+    /// disable ip check for http requests. If it is too time consuming, it could be turned off.
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    disable_ip_check_for_http: bool,
    /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
    #[clap(long)]
    redis_notifications: Option<String>,
@@ -658,7 +661,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    )?;

    let http_config = HttpConfig {
-        accept_websockets: true,
        pool_options: GlobalConnPoolOptions {
            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
            gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -52,7 +52,6 @@ pub struct TlsConfig {
 }

 pub struct HttpConfig {
-    pub accept_websockets: bool,
    pub pool_options: GlobalConnPoolOptions,
    pub cancel_set: CancelSet,
    pub client_conn_threshold: u64,
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -1,92 +1,4 @@
-// rustc lints/lint groups
-// https://doc.rust-lang.org/rustc/lints/groups.html
-#![deny(
-    deprecated,
-    future_incompatible,
-    // TODO: consider let_underscore
-    nonstandard_style,
-    rust_2024_compatibility
-)]
-#![warn(clippy::all, clippy::pedantic, clippy::cargo)]
-// List of denied lints from the clippy::restriction group.
-// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction
-#![warn(
-    clippy::undocumented_unsafe_blocks,
-    clippy::dbg_macro,
-    clippy::empty_enum_variants_with_brackets,
-    clippy::exit,
-    clippy::float_cmp_const,
-    clippy::lossy_float_literal,
-    clippy::macro_use_imports,
-    clippy::manual_ok_or,
-    // TODO: consider clippy::map_err_ignore
-    // TODO: consider clippy::mem_forget
-    clippy::rc_mutex,
-    clippy::rest_pat_in_fully_bound_structs,
-    clippy::string_add,
-    clippy::string_to_string,
-    clippy::todo,
-    // TODO: consider clippy::unimplemented
-    // TODO: consider clippy::unwrap_used
-)]
-// List of permanently allowed lints.
-#![allow(
-    // It's ok to cast u8 to bool, etc.
-    clippy::cast_lossless,
-)]
-// List of temporarily allowed lints.
-// TODO: Switch to except() once stable with 1.81.
-// TODO: fix code and reduce list or move to permanent list above.
-#![allow(
-    clippy::cargo_common_metadata,
-    clippy::cast_possible_truncation,
-    clippy::cast_possible_wrap,
-    clippy::cast_precision_loss,
-    clippy::cast_sign_loss,
-    clippy::default_trait_access,
-    clippy::doc_markdown,
-    clippy::explicit_iter_loop,
-    clippy::float_cmp,
-    clippy::if_not_else,
-    clippy::ignored_unit_patterns,
-    clippy::implicit_hasher,
-    clippy::inconsistent_struct_constructor,
-    clippy::inline_always,
-    clippy::items_after_statements,
-    clippy::manual_assert,
-    clippy::manual_let_else,
-    clippy::manual_string_new,
-    clippy::match_bool,
-    clippy::match_same_arms,
-    clippy::match_wild_err_arm,
-    clippy::missing_errors_doc,
-    clippy::missing_panics_doc,
-    clippy::module_name_repetitions,
-    clippy::multiple_crate_versions,
-    clippy::must_use_candidate,
-    clippy::needless_for_each,
-    clippy::needless_pass_by_value,
-    clippy::needless_raw_string_hashes,
-    clippy::option_as_ref_cloned,
-    clippy::redundant_closure_for_method_calls,
-    clippy::redundant_else,
-    clippy::return_self_not_must_use,
-    clippy::similar_names,
-    clippy::single_char_pattern,
-    clippy::single_match_else,
-    clippy::struct_excessive_bools,
-    clippy::struct_field_names,
-    clippy::too_many_lines,
-    clippy::uninlined_format_args,
-    clippy::unnested_or_patterns,
-    clippy::unreadable_literal,
-    clippy::unused_async,
-    clippy::unused_self,
-    clippy::used_underscore_binding,
-    clippy::wildcard_imports
-)]
-// List of temporarily allowed lints to unblock beta/nightly.
-#![allow(unknown_lints, clippy::manual_inspect)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use std::convert::Infallible;

--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -10,7 +10,6 @@ mod json;
 mod sql_over_http;
 mod websocket;

-use async_trait::async_trait;
 use atomic_take::AtomicTake;
 use bytes::Bytes;
 pub use conn_pool::GlobalConnPoolOptions;
@@ -27,9 +26,8 @@ use rand::rngs::StdRng;
 use rand::SeedableRng;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
-use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::time::timeout;
-use tokio_rustls::TlsAcceptor;
+use tokio_rustls::{server::TlsStream, TlsAcceptor};
 use tokio_util::task::TaskTracker;

 use crate::cancellation::CancellationHandlerMain;
@@ -43,7 +41,7 @@ use crate::serverless::backend::PoolingBackend;
 use crate::serverless::http_util::{api_error_into_response, json_response};

 use std::net::{IpAddr, SocketAddr};
-use std::pin::{pin, Pin};
+use std::pin::pin;
 use std::sync::Arc;
 use tokio::net::{TcpListener, TcpStream};
 use tokio_util::sync::CancellationToken;
@@ -88,18 +86,18 @@ pub async fn task_main(
        config,
        endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
    });
-    let tls_acceptor: Arc<dyn MaybeTlsAcceptor> = match config.tls_config.as_ref() {
-        Some(config) => {
-            let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config());
-            // prefer http2, but support http/1.1
-            tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
-            Arc::new(tls_server_config) as Arc<_>
-        }
+
+    let tls_config = match config.tls_config.as_ref() {
+        Some(config) => config,
        None => {
-            warn!("TLS config is missing");
-            Arc::new(NoTls) as Arc<_>
+            warn!("TLS config is missing, WebSocket Secure server will not be started");
+            return Ok(());
        }
    };
+    let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config());
+    // prefer http2, but support http/1.1
+    tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
+    let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into();

    let connections = tokio_util::task::task_tracker::TaskTracker::new();
    connections.close(); // allows `connections.wait to complete`
@@ -178,41 +176,16 @@ pub async fn task_main(
    Ok(())
 }

-pub trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 'static {}
-impl<T: AsyncRead + AsyncWrite + Send + 'static> AsyncReadWrite for T {}
-pub type AsyncRW = Pin<Box<dyn AsyncReadWrite>>;
-
-#[async_trait]
-trait MaybeTlsAcceptor: Send + Sync + 'static {
-    async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW>;
-}
-
-#[async_trait]
-impl MaybeTlsAcceptor for rustls::ServerConfig {
-    async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW> {
-        Ok(Box::pin(TlsAcceptor::from(self).accept(conn).await?))
-    }
-}
-
-struct NoTls;
-
-#[async_trait]
-impl MaybeTlsAcceptor for NoTls {
-    async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW> {
-        Ok(Box::pin(conn))
-    }
-}
-
 /// Handles the TCP startup lifecycle.
 /// 1. Parses PROXY protocol V2
 /// 2. Handles TLS handshake
 async fn connection_startup(
    config: &ProxyConfig,
-    tls_acceptor: Arc<dyn MaybeTlsAcceptor>,
+    tls_acceptor: TlsAcceptor,
    session_id: uuid::Uuid,
    conn: TcpStream,
    peer_addr: SocketAddr,
-) -> Option<(AsyncRW, IpAddr)> {
+) -> Option<(TlsStream<ChainRW<TcpStream>>, IpAddr)> {
    // handle PROXY protocol
    let (conn, peer) = match read_proxy_protocol(conn).await {
        Ok(c) => c,
@@ -268,7 +241,7 @@ async fn connection_handler(
    cancellation_handler: Arc<CancellationHandlerMain>,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
    cancellation_token: CancellationToken,
-    conn: AsyncRW,
+    conn: TlsStream<ChainRW<TcpStream>>,
    peer_addr: IpAddr,
    session_id: uuid::Uuid,
 ) {
@@ -353,9 +326,7 @@ async fn request_handler(
        .map(|s| s.to_string());

    // Check if the request is a websocket upgrade request.
-    if config.http_config.accept_websockets
-        && framed_websockets::upgrade::is_upgrade_request(&request)
-    {
+    if framed_websockets::upgrade::is_upgrade_request(&request) {
        let ctx = RequestMonitoring::new(
            session_id,
            peer_addr,
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -758,7 +758,6 @@ mod tests {
    async fn test_pool() {
        let _ = env_logger::try_init();
        let config = Box::leak(Box::new(crate::config::HttpConfig {
-            accept_websockets: false,
            pool_options: GlobalConnPoolOptions {
                max_conns_per_endpoint: 2,
                gc_epoch: Duration::from_secs(1),
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -147,7 +147,7 @@ impl UserFacingError for ConnInfoError {
 fn get_conn_info(
    ctx: &RequestMonitoring,
    headers: &HeaderMap,
-    tls: Option<&TlsConfig>,
+    tls: &TlsConfig,
 ) -> Result<ConnInfo, ConnInfoError> {
    // HTTP only uses cleartext (for now and likely always)
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
@@ -184,22 +184,12 @@ fn get_conn_info(
        .ok_or(ConnInfoError::MissingPassword)?;
    let password = urlencoding::decode_binary(password.as_bytes());

-    let endpoint = match connection_url.host() {
-        Some(url::Host::Domain(hostname)) => {
-            if let Some(tls) = tls {
-                endpoint_sni(hostname, &tls.common_names)?
-                    .ok_or(ConnInfoError::MalformedEndpoint)?
-            } else {
-                hostname
-                    .split_once(".")
-                    .map_or(hostname, |(prefix, _)| prefix)
-                    .into()
-            }
-        }
-        Some(url::Host::Ipv4(_)) | Some(url::Host::Ipv6(_)) | None => {
-            return Err(ConnInfoError::MissingHostname)
-        }
-    };
+    let hostname = connection_url
+        .host_str()
+        .ok_or(ConnInfoError::MissingHostname)?;
+
+    let endpoint =
+        endpoint_sni(hostname, &tls.common_names)?.ok_or(ConnInfoError::MalformedEndpoint)?;
    ctx.set_endpoint_id(endpoint.clone());

    let pairs = connection_url.query_pairs();
@@ -512,7 +502,7 @@ async fn handle_inner(
    let headers = request.headers();

    // TLS config should be there.
-    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?;
+    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
    info!(user = conn_info.user_info.user.as_str(), "credentials");

    // Allow connection pooling only if explicitly requested
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -1,6 +1,9 @@
 use utils::auth::{AuthError, Claims, Scope};
 use utils::id::TenantId;

+/// If tenant_id is provided, allow if token (claims) is for this tenant or
+/// whole safekeeper scope (SafekeeperData). Else, allow only if token is
+/// SafekeeperData.
 pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
    match (&claims.scope, tenant_id) {
        (Scope::Tenant, None) => Err(AuthError(
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -18,8 +18,8 @@ use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWri
 use utils::http::request::parse_query_param;

 use postgres_ffi::WAL_SEGMENT_SIZE;
-use safekeeper_api::models::TimelineCreateRequest;
 use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest};
+use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest};
 use utils::{
    auth::SwappableJwtAuth,
    http::{
@@ -114,16 +114,6 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
    })
 }

-/// List all (not deleted) timelines.
-async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permission(&request, None)?;
-    let res: Vec<TenantTimelineId> = GlobalTimelines::get_all()
-        .iter()
-        .map(|tli| tli.ttid)
-        .collect();
-    json_response(StatusCode::OK, res)
-}
-
 /// Report info about timeline.
 async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let ttid = TenantTimelineId::new(
@@ -312,12 +302,11 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body

 /// Force persist control file.
 async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permission(&request, None)?;
-
    let ttid = TenantTimelineId::new(
        parse_request_param(&request, "tenant_id")?,
        parse_request_param(&request, "timeline_id")?,
    );
+    check_permission(&request, Some(ttid.tenant_id))?;

    let tli = GlobalTimelines::get(ttid)?;
    tli.write_shared_state()
@@ -330,6 +319,28 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
    json_response(StatusCode::OK, ())
 }

+/// Make the term at least as high as the one in the request. If the request
+/// does not specify a term (None), increment the current one.
+async fn timeline_term_bump_handler(
+    mut request: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "timeline_id")?,
+    );
+    check_permission(&request, Some(ttid.tenant_id))?;
+
+    let request_data: TimelineTermBumpRequest = json_request(&mut request).await?;
+
+    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+    let response = tli
+        .term_bump(request_data.term)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, response)
+}
+
 /// Deactivates the timeline and removes its data directory.
 async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let ttid = TenantTimelineId::new(
@@ -568,33 +579,23 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
                failpoints_handler(r, cancel).await
            })
        })
+        .delete("/v1/tenant/:tenant_id", |r| {
+            request_span(r, tenant_delete_handler)
+        })
        // Will be used in the future instead of implicit timeline creation
        .post("/v1/tenant/timeline", |r| {
            request_span(r, timeline_create_handler)
        })
-        .get("/v1/tenant/timeline", |r| {
-            request_span(r, timeline_list_handler)
-        })
        .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
            request_span(r, timeline_status_handler)
        })
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
            request_span(r, timeline_delete_handler)
        })
-        .delete("/v1/tenant/:tenant_id", |r| {
-            request_span(r, tenant_delete_handler)
-        })
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id",
            |r| request_span(r, timeline_snapshot_handler),
        )
-        .post("/v1/pull_timeline", |r| {
-            request_span(r, timeline_pull_handler)
-        })
-        .post(
-            "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
-            |r| request_span(r, timeline_copy_handler),
-        )
        .patch(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/control_file",
            |r| request_span(r, patch_control_file_handler),
@@ -603,6 +604,17 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
            "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
            |r| request_span(r, timeline_checkpoint_handler),
        )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/term_bump",
+            |r| request_span(r, timeline_term_bump_handler),
+        )
+        .post("/v1/pull_timeline", |r| {
+            request_span(r, timeline_pull_handler)
+        })
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
+            |r| request_span(r, timeline_copy_handler),
+        )
        // for tests
        .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
            request_span(r, record_safekeeper_info)
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -1,9 +1,10 @@
 //! Defines per timeline data stored persistently (SafeKeeperPersistentState)
 //! and its wrapper with in memory layer (SafekeeperState).

-use std::ops::Deref;
+use std::{cmp::max, ops::Deref};

 use anyhow::Result;
+use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
 use utils::{
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -12,7 +13,7 @@ use utils::{

 use crate::{
    control_file,
-    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
+    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory},
    wal_backup_partial::{self},
 };

@@ -209,6 +210,27 @@ where
        let s = self.start_change();
        self.finish_change(&s).await
    }
+
+    /// Make the term at least as high as `to`. If `to` is None, increment the
+    /// current one. This is not in safekeeper.rs because we want to be able to
+    /// do it even if the timeline is offloaded.
+    pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
+        let before = self.acceptor_state.term;
+        let mut state = self.start_change();
+        let new = match to {
+            Some(to) => max(state.acceptor_state.term, to),
+            None => state.acceptor_state.term + 1,
+        };
+        if new > state.acceptor_state.term {
+            state.acceptor_state.term = new;
+            self.finish_change(&state).await?;
+        }
+        let after = self.acceptor_state.term;
+        Ok(TimelineTermBumpResponse {
+            previous_term: before,
+            current_term: after,
+        })
+    }
 }

 impl<CTRL> Deref for TimelineState<CTRL>
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -3,6 +3,7 @@

 use anyhow::{anyhow, bail, Result};
 use camino::Utf8PathBuf;
+use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
 use tokio::fs::{self};
 use tokio_util::sync::CancellationToken;
@@ -213,6 +214,10 @@ impl StateSK {
            .get_last_log_term(self.flush_lsn())
    }

+    pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
+        self.state_mut().term_bump(to).await
+    }
+
    /// Close open WAL files to release FDs.
    fn close_wal_store(&mut self) {
        if let StateSK::Loaded(sk) = self {
@@ -847,6 +852,11 @@ impl Timeline {
        Ok(res)
    }

+    pub async fn term_bump(self: &Arc<Self>, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
+        let mut state = self.write_shared_state().await;
+        state.sk.term_bump(to).await
+    }
+
    /// Get the timeline guard for reading/writing WAL files.
    /// If WAL files are not present on disk (evicted), they will be automatically
    /// downloaded from remote storage. This is done in the manager task, which is
--- a/scripts/comment-test-report.js
+++ b/scripts/comment-test-report.js
@@ -68,29 +68,16 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => {
                    console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`)

                    buildType = "release"
-                    pgVersion = "16"
+                    pgVersion = "14"
                }

                pgVersions.add(pgVersion)

-                // We use `arch` as it is returned by GitHub Actions
-                //  (RUNNER_ARCH env var): X86, X64, ARM, or ARM64
-                // Ref https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables
-                let arch = ""
-                if (test.parameters.includes("'X64'")) {
-                    arch = "x86-64"
-                } else if (test.parameters.includes("'ARM64'")) {
-                    arch = "arm64"
-                } else {
-                    arch = "unknown"
-                }
-
                // Removing build type and PostgreSQL version from the test name to make it shorter
                const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "")
                test.pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${testName}`
                test.pgVersion = pgVersion
                test.buildType = buildType
-                test.arch = arch

                if (test.status === "passed") {
                    passedTests[pgVersion][testName].push(test)
@@ -157,7 +144,7 @@ const reportSummary = async (params) => {
                const links = []
                for (const test of tests) {
                    const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
-                    links.push(`[${test.buildType}-${test.arch}](${allureLink})`)
+                    links.push(`[${test.buildType}](${allureLink})`)
                }
                summary += `- \`${testName}\`: ${links.join(", ")}\n`
            }
@@ -188,7 +175,7 @@ const reportSummary = async (params) => {
                    const links = []
                    for (const test of tests) {
                        const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries`
-                        links.push(`[${test.buildType}-${test.arch}](${allureLink})`)
+                        links.push(`[${test.buildType}](${allureLink})`)
                    }
                    summary += `- \`${testName}\`: ${links.join(", ")}\n`
                }
--- a/scripts/ingest_regress_test_result-new-format.py
+++ b/scripts/ingest_regress_test_result-new-format.py
@@ -18,7 +18,6 @@ import psycopg2
 from psycopg2.extras import execute_values

 CREATE_TABLE = """
-CREATE TYPE arch AS ENUM ('ARM64', 'X64', 'UNKNOWN');
 CREATE TABLE IF NOT EXISTS results (
    id           BIGSERIAL PRIMARY KEY,
    parent_suite TEXT NOT NULL,
@@ -29,7 +28,6 @@ CREATE TABLE IF NOT EXISTS results (
    stopped_at   TIMESTAMPTZ NOT NULL,
    duration     INT NOT NULL,
    flaky        BOOLEAN NOT NULL,
-    arch         arch DEFAULT 'X64',
    build_type   TEXT NOT NULL,
    pg_version   INT NOT NULL,
    run_id       BIGINT NOT NULL,
@@ -37,7 +35,7 @@ CREATE TABLE IF NOT EXISTS results (
    reference    TEXT NOT NULL,
    revision     CHAR(40) NOT NULL,
    raw          JSONB COMPRESSION lz4 NOT NULL,
-    UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id)
+    UNIQUE (parent_suite, suite, name, build_type, pg_version, started_at, stopped_at, run_id)
 );
 """

@@ -52,7 +50,6 @@ class Row:
    stopped_at: datetime
    duration: int
    flaky: bool
-    arch: str
    build_type: str
    pg_version: int
    run_id: int
@@ -124,14 +121,6 @@ def ingest_test_result(
        raw.pop("labels")
        raw.pop("extra")

-        # All allure parameters are prefixed with "__", see test_runner/fixtures/parametrize.py
-        parameters = {
-            p["name"].removeprefix("__"): p["value"]
-            for p in test["parameters"]
-            if p["name"].startswith("__")
-        }
-        arch = parameters.get("arch", "UNKNOWN").strip("'")
-
        build_type, pg_version, unparametrized_name = parse_test_name(test["name"])
        labels = {label["name"]: label["value"] for label in test["labels"]}
        row = Row(
@@ -143,7 +132,6 @@ def ingest_test_result(
            stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=timezone.utc),
            duration=test["time"]["duration"],
            flaky=test["flaky"] or test["retriesStatusChange"],
-            arch=arch,
            build_type=build_type,
            pg_version=pg_version,
            run_id=run_id,
--- a/scripts/ps_ec2_setup_instance_store
+++ b/scripts/ps_ec2_setup_instance_store
@@ -44,7 +44,7 @@ run the following commands from the top of the neon.git checkout

    # test suite run
    export TEST_OUTPUT="$TEST_OUTPUT"
-    DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py
+    DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py

    # for interactive use
    export NEON_REPO_DIR="$NEON_REPO_DIR"
--- a/storage_controller/src/heartbeater.rs
+++ b/storage_controller/src/heartbeater.rs
@@ -6,7 +6,10 @@ use std::{
 };
 use tokio_util::sync::CancellationToken;

-use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization};
+use pageserver_api::{
+    controller_api::{NodeAvailability, UtilizationScore},
+    models::PageserverUtilization,
+};

 use thiserror::Error;
 use utils::id::NodeId;
@@ -84,12 +87,9 @@ impl Heartbeater {
                pageservers,
                reply: sender,
            })
-            .map_err(|_| HeartbeaterError::Cancel)?;
+            .unwrap();

-        receiver
-            .await
-            .map_err(|_| HeartbeaterError::Cancel)
-            .and_then(|x| x)
+        receiver.await.unwrap()
    }
 }

@@ -144,8 +144,7 @@ impl HeartbeaterTask {
                // goes through to the pageserver even when the node is marked offline.
                // This doesn't impact the availability observed by [`crate::service::Service`].
                let mut node_clone = node.clone();
-                node_clone
-                    .set_availability(NodeAvailability::Active(PageserverUtilization::full()));
+                node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst()));

                async move {
                    let response = node_clone
@@ -177,7 +176,7 @@ impl HeartbeaterTask {
                        node.get_availability()
                    {
                        PageserverState::WarmingUp {
-                            started_at: *last_seen_at,
+                            started_at: last_seen_at,
                        }
                    } else {
                        PageserverState::Offline
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1074,6 +1074,7 @@ pub fn make_router(
                RequestName("control_v1_metadata_health_list_outdated"),
            )
        })
+        // TODO(vlad): endpoint for cancelling drain and fill
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
            tenant_service_handler(
--- a/storage_controller/src/leadership.rs
+++ b/storage_controller/src/leadership.rs
@@ -1,135 +0,0 @@
-use std::sync::Arc;
-
-use hyper::Uri;
-use tokio_util::sync::CancellationToken;
-
-use crate::{
-    peer_client::{GlobalObservedState, PeerClient},
-    persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence},
-    service::Config,
-};
-
-/// Helper for storage controller leadership acquisition
-pub(crate) struct Leadership {
-    persistence: Arc<Persistence>,
-    config: Config,
-    cancel: CancellationToken,
-}
-
-#[derive(thiserror::Error, Debug)]
-pub(crate) enum Error {
-    #[error(transparent)]
-    Database(#[from] DatabaseError),
-}
-
-pub(crate) type Result<T> = std::result::Result<T, Error>;
-
-impl Leadership {
-    pub(crate) fn new(
-        persistence: Arc<Persistence>,
-        config: Config,
-        cancel: CancellationToken,
-    ) -> Self {
-        Self {
-            persistence,
-            config,
-            cancel,
-        }
-    }
-
-    /// Find the current leader in the database and request it to step down if required.
-    /// Should be called early on in within the start-up sequence.
-    ///
-    /// Returns a tuple of two optionals: the current leader and its observed state
-    pub(crate) async fn step_down_current_leader(
-        &self,
-    ) -> Result<(Option<ControllerPersistence>, Option<GlobalObservedState>)> {
-        let leader = self.current_leader().await?;
-        let leader_step_down_state = if let Some(ref leader) = leader {
-            if self.config.start_as_candidate {
-                self.request_step_down(leader).await
-            } else {
-                None
-            }
-        } else {
-            tracing::info!("No leader found to request step down from. Will build observed state.");
-            None
-        };
-
-        Ok((leader, leader_step_down_state))
-    }
-
-    /// Mark the current storage controller instance as the leader in the database
-    pub(crate) async fn become_leader(
-        &self,
-        current_leader: Option<ControllerPersistence>,
-    ) -> Result<()> {
-        if let Some(address_for_peers) = &self.config.address_for_peers {
-            // TODO: `address-for-peers` can become a mandatory cli arg
-            // after we update the k8s setup
-            let proposed_leader = ControllerPersistence {
-                address: address_for_peers.to_string(),
-                started_at: chrono::Utc::now(),
-            };
-
-            self.persistence
-                .update_leader(current_leader, proposed_leader)
-                .await
-                .map_err(Error::Database)
-        } else {
-            tracing::info!("No address-for-peers provided. Skipping leader persistence.");
-            Ok(())
-        }
-    }
-
-    async fn current_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
-        let res = self.persistence.get_leader().await;
-        if let Err(DatabaseError::Query(diesel::result::Error::DatabaseError(_kind, ref err))) = res
-        {
-            const REL_NOT_FOUND_MSG: &str = "relation \"controllers\" does not exist";
-            if err.message().trim() == REL_NOT_FOUND_MSG {
-                // Special case: if this is a brand new storage controller, migrations will not
-                // have run at this point yet, and, hence, the controllers table does not exist.
-                // Detect this case via the error string (diesel doesn't type it) and allow it.
-                tracing::info!("Detected first storage controller start-up. Allowing missing controllers table ...");
-                return Ok(None);
-            }
-        }
-
-        res
-    }
-
-    /// Request step down from the currently registered leader in the database
-    ///
-    /// If such an entry is persisted, the success path returns the observed
-    /// state and details of the leader. Otherwise, None is returned indicating
-    /// there is no leader currently.
-    async fn request_step_down(
-        &self,
-        leader: &ControllerPersistence,
-    ) -> Option<GlobalObservedState> {
-        tracing::info!("Sending step down request to {leader:?}");
-
-        let client = PeerClient::new(
-            Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
-            self.config.peer_jwt_token.clone(),
-        );
-        let state = client.step_down(&self.cancel).await;
-        match state {
-            Ok(state) => Some(state),
-            Err(err) => {
-                // TODO: Make leaders periodically update a timestamp field in the
-                // database and, if the leader is not reachable from the current instance,
-                // but inferred as alive from the timestamp, abort start-up. This avoids
-                // a potential scenario in which we have two controllers acting as leaders.
-                tracing::error!(
-                    "Leader ({}) did not respond to step-down request: {}",
-                    leader.address,
-                    err
-                );
-
-                None
-            }
-        }
-    }
-}
--- a/storage_controller/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -8,7 +8,6 @@ mod drain_utils;
 mod heartbeater;
 pub mod http;
 mod id_lock_map;
-mod leadership;
 pub mod metrics;
 mod node;
 mod pageserver_client;
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -1,5 +1,6 @@
 use anyhow::{anyhow, Context};
 use clap::Parser;
+use diesel::Connection;
 use hyper::Uri;
 use metrics::launch_timestamp::LaunchTimestamp;
 use metrics::BuildInfo;
@@ -26,6 +27,9 @@ use utils::{project_build_tag, project_git_version, tcp_listener};
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);

+use diesel_migrations::{embed_migrations, EmbeddedMigrations};
+pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
+
 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
 #[command(arg_required_else_help(true))]
@@ -47,9 +51,6 @@ struct Cli {
    #[arg(long)]
    control_plane_jwt_token: Option<String>,

-    #[arg(long)]
-    peer_jwt_token: Option<String>,
-
    /// URL to control plane compute notification endpoint
    #[arg(long)]
    compute_hook_url: Option<String>,
@@ -129,28 +130,28 @@ struct Secrets {
    public_key: Option<JwtAuth>,
    jwt_token: Option<String>,
    control_plane_jwt_token: Option<String>,
-    peer_jwt_token: Option<String>,
 }

 impl Secrets {
    const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
    const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
    const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
-    const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN";
    const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";

    /// Load secrets from, in order of preference:
    /// - CLI args if database URL is provided on the CLI
    /// - Environment variables if DATABASE_URL is set.
+    /// - AWS Secrets Manager secrets
    async fn load(args: &Cli) -> anyhow::Result<Self> {
-        let Some(database_url) = Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV)
+        let Some(database_url) =
+            Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
        else {
            anyhow::bail!(
                "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
            )
        };

-        let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV) {
+        let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
            Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
            None => None,
        };
@@ -158,18 +159,18 @@ impl Secrets {
        let this = Self {
            database_url,
            public_key,
-            jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV),
+            jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
            control_plane_jwt_token: Self::load_secret(
                &args.control_plane_jwt_token,
                Self::CONTROL_PLANE_JWT_TOKEN_ENV,
-            ),
-            peer_jwt_token: Self::load_secret(&args.peer_jwt_token, Self::PEER_JWT_TOKEN_ENV),
+            )
+            .await,
        };

        Ok(this)
    }

-    fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
+    async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
        if let Some(v) = cli {
            Some(v.clone())
        } else if let Ok(v) = std::env::var(env_name) {
@@ -180,6 +181,20 @@ impl Secrets {
    }
 }

+/// Execute the diesel migrations that are built into this binary
+async fn migration_run(database_url: &str) -> anyhow::Result<()> {
+    use diesel::PgConnection;
+    use diesel_migrations::{HarnessWithOutput, MigrationHarness};
+    let mut conn = PgConnection::establish(database_url)?;
+
+    HarnessWithOutput::write_to_stdout(&mut conn)
+        .run_pending_migrations(MIGRATIONS)
+        .map(|_| ())
+        .map_err(|e| anyhow::anyhow!(e))?;
+
+    Ok(())
+}
+
 fn main() -> anyhow::Result<()> {
    logging::init(
        LogFormat::Plain,
@@ -269,7 +284,6 @@ async fn async_main() -> anyhow::Result<()> {
    let config = Config {
        jwt_token: secrets.jwt_token,
        control_plane_jwt_token: secrets.control_plane_jwt_token,
-        peer_jwt_token: secrets.peer_jwt_token,
        compute_hook_url: args.compute_hook_url,
        max_offline_interval: args
            .max_offline_interval
@@ -290,9 +304,13 @@ async fn async_main() -> anyhow::Result<()> {
        http_service_port: args.listen.port() as i32,
    };

-    // Validate that we can connect to the database
+    // After loading secrets & config, but before starting anything else, apply database migrations
    Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;

+    migration_run(&secrets.database_url)
+        .await
+        .context("Running database migrations")?;
+
    let persistence = Arc::new(Persistence::new(secrets.database_url));

    let service = Service::spawn(config, persistence.clone()).await?;
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -230,7 +230,6 @@ pub(crate) enum DatabaseErrorLabel {
    Connection,
    ConnectionPool,
    Logical,
-    Migration,
 }

 impl DatabaseError {
@@ -240,7 +239,6 @@ impl DatabaseError {
            Self::Connection(_) => DatabaseErrorLabel::Connection,
            Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
            Self::Logical(_) => DatabaseErrorLabel::Logical,
-            Self::Migration(_) => DatabaseErrorLabel::Migration,
        }
    }
 }
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -92,15 +92,15 @@ impl Node {
        }
    }

-    pub(crate) fn get_availability(&self) -> &NodeAvailability {
-        &self.availability
+    pub(crate) fn get_availability(&self) -> NodeAvailability {
+        self.availability
    }

    pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
        use AvailabilityTransition::*;
        use NodeAvailability::WarmingUp;

-        match self.get_availability_transition(&availability) {
+        match self.get_availability_transition(availability) {
            ToActive => {
                // Give the node a new cancellation token, effectively resetting it to un-cancelled.  Any
                // users of previously-cloned copies of the node will still see the old cancellation
@@ -115,8 +115,8 @@ impl Node {
            Unchanged | ToWarmingUpFromOffline => {}
        }

-        if let (WarmingUp(crnt), WarmingUp(proposed)) = (&self.availability, &availability) {
-            self.availability = WarmingUp(std::cmp::max(*crnt, *proposed));
+        if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) {
+            self.availability = WarmingUp(std::cmp::max(crnt, proposed));
        } else {
            self.availability = availability;
        }
@@ -126,12 +126,12 @@ impl Node {
    /// into a description of the transition.
    pub(crate) fn get_availability_transition(
        &self,
-        availability: &NodeAvailability,
+        availability: NodeAvailability,
    ) -> AvailabilityTransition {
        use AvailabilityTransition::*;
        use NodeAvailability::*;

-        match (&self.availability, availability) {
+        match (self.availability, availability) {
            (Offline, Active(_)) => ToActive,
            (Active(_), Offline) => ToOffline,
            (Active(_), WarmingUp(_)) => ToWarmingUpFromActive,
@@ -153,15 +153,15 @@ impl Node {

    /// Is this node elegible to have work scheduled onto it?
    pub(crate) fn may_schedule(&self) -> MaySchedule {
-        let utilization = match &self.availability {
-            NodeAvailability::Active(u) => u.clone(),
+        let score = match self.availability {
+            NodeAvailability::Active(score) => score,
            NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No,
        };

        match self.scheduling {
-            NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization),
+            NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
            NodeSchedulingPolicy::Draining => MaySchedule::No,
-            NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization),
+            NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
            NodeSchedulingPolicy::Pause => MaySchedule::No,
            NodeSchedulingPolicy::PauseForRestart => MaySchedule::No,
        }
@@ -285,7 +285,7 @@ impl Node {
    pub(crate) fn describe(&self) -> NodeDescribeResponse {
        NodeDescribeResponse {
            id: self.id,
-            availability: self.availability.clone().into(),
+            availability: self.availability.into(),
            scheduling: self.scheduling,
            listen_http_addr: self.listen_http_addr.clone(),
            listen_http_port: self.listen_http_port,
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -25,9 +25,6 @@ use crate::metrics::{
 };
 use crate::node::Node;

-use diesel_migrations::{embed_migrations, EmbeddedMigrations};
-const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
-
 /// ## What do we store?
 ///
 /// The storage controller service does not store most of its state durably.
@@ -75,8 +72,6 @@ pub(crate) enum DatabaseError {
    ConnectionPool(#[from] r2d2::Error),
    #[error("Logical error: {0}")]
    Logical(String),
-    #[error("Migration error: {0}")]
-    Migration(String),
 }

 #[derive(measured::FixedCardinalityLabel, Copy, Clone)]
@@ -172,19 +167,6 @@ impl Persistence {
        }
    }

-    /// Execute the diesel migrations that are built into this binary
-    pub(crate) async fn migration_run(&self) -> DatabaseResult<()> {
-        use diesel_migrations::{HarnessWithOutput, MigrationHarness};
-
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            HarnessWithOutput::write_to_stdout(conn)
-                .run_pending_migrations(MIGRATIONS)
-                .map(|_| ())
-                .map_err(|e| DatabaseError::Migration(e.to_string()))
-        })
-        .await
-    }
-
    /// Wraps `with_conn` in order to collect latency and error metrics
    async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
    where
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -1,6 +1,6 @@
 use crate::{node::Node, tenant_shard::TenantShard};
 use itertools::Itertools;
-use pageserver_api::models::PageserverUtilization;
+use pageserver_api::controller_api::UtilizationScore;
 use serde::Serialize;
 use std::collections::HashMap;
 use utils::{http::error::ApiError, id::NodeId};
@@ -20,9 +20,9 @@ impl From<ScheduleError> for ApiError {
    }
 }

-#[derive(Serialize)]
+#[derive(Serialize, Eq, PartialEq)]
 pub enum MaySchedule {
-    Yes(PageserverUtilization),
+    Yes(UtilizationScore),
    No,
 }

@@ -282,28 +282,6 @@ impl Scheduler {
                node.shard_count -= 1;
            }
        }
-
-        // Maybe update PageserverUtilization
-        match update {
-            RefCountUpdate::AddSecondary | RefCountUpdate::Attach => {
-                // Referencing the node: if this takes our shard_count above the utilzation structure's
-                // shard count, then artifically bump it: this ensures that the scheduler immediately
-                // recognizes that this node has more work on it, without waiting for the next heartbeat
-                // to update the utilization.
-                if let MaySchedule::Yes(utilization) = &mut node.may_schedule {
-                    utilization.adjust_shard_count_max(node.shard_count as u32);
-                }
-            }
-            RefCountUpdate::PromoteSecondary
-            | RefCountUpdate::Detach
-            | RefCountUpdate::RemoveSecondary
-            | RefCountUpdate::DemoteAttached => {
-                // De-referencing the node: leave the utilization's shard_count at a stale higher
-                // value until some future heartbeat after we have physically removed this shard
-                // from the node: this prevents the scheduler over-optimistically trying to schedule
-                // more work onto the node before earlier detaches are done.
-            }
-        }
    }

    // Check if the number of shards attached to a given node is lagging below
@@ -348,18 +326,7 @@ impl Scheduler {
        use std::collections::hash_map::Entry::*;
        match self.nodes.entry(node.get_id()) {
            Occupied(mut entry) => {
-                // Updates to MaySchedule are how we receive updated PageserverUtilization: adjust these values
-                // to account for any shards scheduled on the controller but not yet visible to the pageserver.
-                let mut may_schedule = node.may_schedule();
-                match &mut may_schedule {
-                    MaySchedule::Yes(utilization) => {
-                        utilization.adjust_shard_count_max(entry.get().shard_count as u32);
-                    }
-                    MaySchedule::No => { // Nothing to tweak
-                    }
-                }
-
-                entry.get_mut().may_schedule = may_schedule;
+                entry.get_mut().may_schedule = node.may_schedule();
            }
            Vacant(entry) => {
                entry.insert(SchedulerNode {
@@ -396,7 +363,7 @@ impl Scheduler {
                let may_schedule = self
                    .nodes
                    .get(node_id)
-                    .map(|n| !matches!(n.may_schedule, MaySchedule::No))
+                    .map(|n| n.may_schedule != MaySchedule::No)
                    .unwrap_or(false);
                (*node_id, may_schedule)
            })
@@ -416,7 +383,7 @@ impl Scheduler {
    /// the same tenant on the same node.  This is a soft constraint: the context will never
    /// cause us to fail to schedule a shard.
    pub(crate) fn schedule_shard(
-        &mut self,
+        &self,
        hard_exclude: &[NodeId],
        context: &ScheduleContext,
    ) -> Result<NodeId, ScheduleError> {
@@ -424,41 +391,31 @@ impl Scheduler {
            return Err(ScheduleError::NoPageservers);
        }

-        let mut scores: Vec<(NodeId, AffinityScore, u64, usize)> = self
+        let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self
            .nodes
-            .iter_mut()
-            .filter_map(|(k, v)| match &mut v.may_schedule {
-                MaySchedule::No => None,
-                MaySchedule::Yes(_) if hard_exclude.contains(k) => None,
-                MaySchedule::Yes(utilization) => Some((
-                    *k,
-                    context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
-                    utilization.cached_score(),
-                    v.attached_shard_count,
-                )),
+            .iter()
+            .filter_map(|(k, v)| {
+                if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
+                    None
+                } else {
+                    Some((
+                        *k,
+                        context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
+                        v.shard_count,
+                        v.attached_shard_count,
+                    ))
+                }
            })
            .collect();

-        // Exclude nodes whose utilization is critically high, if there are alternatives available.  This will
-        // cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example
-        // we may place shards in the same tenant together on the same pageserver if all other pageservers are
-        // overloaded.
-        let non_overloaded_scores = scores
-            .iter()
-            .filter(|i| !PageserverUtilization::is_overloaded(i.2))
-            .copied()
-            .collect::<Vec<_>>();
-        if !non_overloaded_scores.is_empty() {
-            scores = non_overloaded_scores;
-        }
-
        // Sort by, in order of precedence:
        //  1st: Affinity score.  We should never pick a higher-score node if a lower-score node is available
-        //  2nd: Utilization score (this combines shard count and disk utilization)
-        //  3rd: Attached shard count.  When nodes have identical utilization (e.g. when populating some
-        //       empty nodes), this acts as an anti-affinity between attached shards.
+        //  2nd: Attached shard count.  Within nodes with the same affinity, we always pick the node with
+        //  the least number of attached shards.
+        //  3rd: Total shard count.  Within nodes with the same affinity and attached shard count, use nodes
+        //  with the lower total shard count.
        //  4th: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
-        scores.sort_by_key(|i| (i.1, i.2, i.3, i.0));
+        scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));

        if scores.is_empty() {
            // After applying constraints, no pageservers were left.
@@ -472,7 +429,7 @@ impl Scheduler {
                for (node_id, node) in &self.nodes {
                    tracing::info!(
                        "Node {node_id}: may_schedule={} shards={}",
-                        !matches!(node.may_schedule, MaySchedule::No),
+                        node.may_schedule != MaySchedule::No,
                        node.shard_count
                    );
                }
@@ -512,7 +469,7 @@ impl Scheduler {
 pub(crate) mod test_utils {

    use crate::node::Node;
-    use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
+    use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
    use std::collections::HashMap;
    use utils::id::NodeId;
    /// Test helper: synthesize the requested number of nodes, all in active state.
@@ -529,7 +486,7 @@ pub(crate) mod test_utils {
                        format!("pghost-{i}"),
                        5432 + i as u16,
                    );
-                    node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0)));
+                    node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
                    assert!(node.is_available());
                    node
                })
@@ -540,8 +497,6 @@ pub(crate) mod test_utils {

 #[cfg(test)]
 mod tests {
-    use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
-
    use super::*;

    use crate::tenant_shard::IntentState;
@@ -602,130 +557,4 @@ mod tests {

        Ok(())
    }
-
-    #[test]
-    /// Test the PageserverUtilization's contribution to scheduling algorithm
-    fn scheduler_utilization() {
-        let mut nodes = test_utils::make_test_nodes(3);
-        let mut scheduler = Scheduler::new(nodes.values());
-
-        // Need to keep these alive because they contribute to shard counts via RAII
-        let mut scheduled_intents = Vec::new();
-
-        let empty_context = ScheduleContext::default();
-
-        fn assert_scheduler_chooses(
-            expect_node: NodeId,
-            scheduled_intents: &mut Vec<IntentState>,
-            scheduler: &mut Scheduler,
-            context: &ScheduleContext,
-        ) {
-            let scheduled = scheduler.schedule_shard(&[], context).unwrap();
-            let mut intent = IntentState::new();
-            intent.set_attached(scheduler, Some(scheduled));
-            scheduled_intents.push(intent);
-            assert_eq!(scheduled, expect_node);
-        }
-
-        // Independent schedule calls onto empty nodes should round-robin, because each node's
-        // utilization's shard count is updated inline.  The order is determinsitic because when all other factors are
-        // equal, we order by node ID.
-        assert_scheduler_chooses(
-            NodeId(1),
-            &mut scheduled_intents,
-            &mut scheduler,
-            &empty_context,
-        );
-        assert_scheduler_chooses(
-            NodeId(2),
-            &mut scheduled_intents,
-            &mut scheduler,
-            &empty_context,
-        );
-        assert_scheduler_chooses(
-            NodeId(3),
-            &mut scheduled_intents,
-            &mut scheduler,
-            &empty_context,
-        );
-
-        // Manually setting utilization higher should cause schedule calls to round-robin the other nodes
-        // which have equal utilization.
-        nodes
-            .get_mut(&NodeId(1))
-            .unwrap()
-            .set_availability(NodeAvailability::Active(test_utilization::simple(
-                10,
-                1024 * 1024 * 1024,
-            )));
-        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
-
-        assert_scheduler_chooses(
-            NodeId(2),
-            &mut scheduled_intents,
-            &mut scheduler,
-            &empty_context,
-        );
-        assert_scheduler_chooses(
-            NodeId(3),
-            &mut scheduled_intents,
-            &mut scheduler,
-            &empty_context,
-        );
-        assert_scheduler_chooses(
-            NodeId(2),
-            &mut scheduled_intents,
-            &mut scheduler,
-            &empty_context,
-        );
-        assert_scheduler_chooses(
-            NodeId(3),
-            &mut scheduled_intents,
-            &mut scheduler,
-            &empty_context,
-        );
-
-        // The scheduler should prefer nodes with lower affinity score,
-        // even if they have higher utilization (as long as they aren't utilized at >100%)
-        let mut context_prefer_node1 = ScheduleContext::default();
-        context_prefer_node1.avoid(&[NodeId(2), NodeId(3)]);
-        assert_scheduler_chooses(
-            NodeId(1),
-            &mut scheduled_intents,
-            &mut scheduler,
-            &context_prefer_node1,
-        );
-        assert_scheduler_chooses(
-            NodeId(1),
-            &mut scheduled_intents,
-            &mut scheduler,
-            &context_prefer_node1,
-        );
-
-        // If a node is over-utilized, it will not be used even if affinity scores prefer it
-        nodes
-            .get_mut(&NodeId(1))
-            .unwrap()
-            .set_availability(NodeAvailability::Active(test_utilization::simple(
-                20000,
-                1024 * 1024 * 1024,
-            )));
-        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
-        assert_scheduler_chooses(
-            NodeId(2),
-            &mut scheduled_intents,
-            &mut scheduler,
-            &context_prefer_node1,
-        );
-        assert_scheduler_chooses(
-            NodeId(3),
-            &mut scheduled_intents,
-            &mut scheduler,
-            &context_prefer_node1,
-        );
-
-        for mut intent in scheduled_intents {
-            intent.clear(&mut scheduler);
-        }
-    }
 }
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -17,9 +17,8 @@ use crate::{
    compute_hook::NotifyError,
    drain_utils::{self, TenantShardDrain, TenantShardIterator},
    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
-    leadership::Leadership,
    metrics,
-    peer_client::GlobalObservedState,
+    peer_client::{GlobalObservedState, PeerClient},
    persistence::{
        AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence,
        TenantFilter,
@@ -44,7 +43,7 @@ use pageserver_api::{
        NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
        TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-        TenantShardMigrateRequest, TenantShardMigrateResponse,
+        TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
    },
    models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
 };
@@ -288,9 +287,6 @@ pub struct Config {
    // This JWT token will be used to authenticate this service to the control plane.
    pub control_plane_jwt_token: Option<String>,

-    // This JWT token will be used to authenticate with other storage controller instances
-    pub peer_jwt_token: Option<String>,
-
    /// Where the compute hook should send notifications of pageserver attachment locations
    /// (this URL points to the control plane in prod). If this is None, the compute hook will
    /// assume it is running in a test environment and try to update neon_local.
@@ -337,7 +333,7 @@ impl From<DatabaseError> for ApiError {
            DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => {
                ApiError::ShuttingDown
            }
-            DatabaseError::Logical(reason) | DatabaseError::Migration(reason) => {
+            DatabaseError::Logical(reason) => {
                ApiError::InternalServerError(anyhow::anyhow!(reason))
            }
        }
@@ -542,7 +538,7 @@ impl Service {
            let locked = self.inner.read().unwrap();
            locked.nodes.clone()
        };
-        let mut nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;
+        let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;

        // List of tenants for which we will attempt to notify compute of their location at startup
        let mut compute_notifications = Vec::new();
@@ -556,8 +552,10 @@ impl Service {
            // Mark nodes online if they responded to us: nodes are offline by default after a restart.
            let mut new_nodes = (**nodes).clone();
            for (node_id, node) in new_nodes.iter_mut() {
-                if let Some(utilization) = nodes_online.remove(node_id) {
-                    node.set_availability(NodeAvailability::Active(utilization));
+                if let Some(utilization) = nodes_online.get(node_id) {
+                    node.set_availability(NodeAvailability::Active(UtilizationScore(
+                        utilization.utilization_score,
+                    )));
                    scheduler.node_upsert(node);
                }
            }
@@ -608,15 +606,22 @@ impl Service {

        // Before making any obeservable changes to the cluster, persist self
        // as leader in database and memory.
-        let leadership = Leadership::new(
-            self.persistence.clone(),
-            self.config.clone(),
-            self.cancel.child_token(),
-        );
+        if let Some(address_for_peers) = &self.config.address_for_peers {
+            // TODO: `address-for-peers` can become a mandatory cli arg
+            // after we update the k8s setup
+            let proposed_leader = ControllerPersistence {
+                address: address_for_peers.to_string(),
+                started_at: chrono::Utc::now(),
+            };

-        if let Err(e) = leadership.become_leader(current_leader).await {
-            tracing::error!("Failed to persist self as leader: {e}. Aborting start-up ...");
-            std::process::exit(1);
+            if let Err(err) = self
+                .persistence
+                .update_leader(current_leader, proposed_leader)
+                .await
+            {
+                tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ...");
+                std::process::exit(1);
+            }
        }

        self.inner.write().unwrap().become_leader();
@@ -923,9 +928,9 @@ impl Service {
            if let Ok(deltas) = res {
                for (node_id, state) in deltas.0 {
                    let new_availability = match state {
-                        PageserverState::Available { utilization, .. } => {
-                            NodeAvailability::Active(utilization)
-                        }
+                        PageserverState::Available { utilization, .. } => NodeAvailability::Active(
+                            UtilizationScore(utilization.utilization_score),
+                        ),
                        PageserverState::WarmingUp { started_at } => {
                            NodeAvailability::WarmingUp(started_at)
                        }
@@ -934,17 +939,14 @@ impl Service {
                            // while the heartbeat round was on-going. Hence, filter out
                            // offline transitions for WarmingUp nodes that are still within
                            // their grace period.
-                            if let Ok(NodeAvailability::WarmingUp(started_at)) = self
-                                .get_node(node_id)
-                                .await
-                                .as_ref()
-                                .map(|n| n.get_availability())
+                            if let Ok(NodeAvailability::WarmingUp(started_at)) =
+                                self.get_node(node_id).await.map(|n| n.get_availability())
                            {
                                let now = Instant::now();
-                                if now - *started_at >= self.config.max_warming_up_interval {
+                                if now - started_at >= self.config.max_warming_up_interval {
                                    NodeAvailability::Offline
                                } else {
-                                    NodeAvailability::WarmingUp(*started_at)
+                                    NodeAvailability::WarmingUp(started_at)
                                }
                            } else {
                                NodeAvailability::Offline
@@ -1157,16 +1159,6 @@ impl Service {
        let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
        let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel();

-        let leadership_cancel = CancellationToken::new();
-        let leadership = Leadership::new(persistence.clone(), config.clone(), leadership_cancel);
-        let (leader, leader_step_down_state) = leadership.step_down_current_leader().await?;
-
-        // Apply the migrations **after** the current leader has stepped down
-        // (or we've given up waiting for it), but **before** reading from the
-        // database. The only exception is reading the current leader before
-        // migrating.
-        persistence.migration_run().await?;
-
        tracing::info!("Loading nodes from database...");
        let nodes = persistence
            .list_nodes()
@@ -1384,6 +1376,32 @@ impl Service {
                    return;
                };

+                let leadership_status = this.inner.read().unwrap().get_leadership_status();
+                let leader = match this.get_leader().await {
+                    Ok(ok) => ok,
+                    Err(err) => {
+                        tracing::error!(
+                            "Failed to query database for current leader: {err}. Aborting start-up ..."
+                        );
+                        std::process::exit(1);
+                    }
+                };
+
+                let leader_step_down_state = match leadership_status {
+                    LeadershipStatus::Candidate => {
+                        if let Some(ref leader) = leader {
+                            this.request_step_down(leader).await
+                        } else {
+                            tracing::info!(
+                                "No leader found to request step down from. Will build observed state."
+                            );
+                            None
+                        }
+                    }
+                    LeadershipStatus::Leader => None,
+                    LeadershipStatus::SteppedDown => unreachable!(),
+                };
+
                this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx)
                    .await;

@@ -1626,7 +1644,7 @@ impl Service {
        // This Node is a mutable local copy: we will set it active so that we can use its
        // API client to reconcile with the node.  The Node in [`Self::nodes`] will get updated
        // later.
-        node.set_availability(NodeAvailability::Active(PageserverUtilization::full()));
+        node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));

        let configs = match node
            .with_client_retries(
@@ -2474,7 +2492,7 @@ impl Service {
        .await;

        let node = {
-            let mut locked = self.inner.write().unwrap();
+            let locked = self.inner.read().unwrap();
            // Just a sanity check to prevent misuse: the API expects that the tenant is fully
            // detached everywhere, and nothing writes to S3 storage. Here, we verify that,
            // but only at the start of the process, so it's really just to prevent operator
@@ -2501,7 +2519,7 @@ impl Service {
                    return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}")));
                }
            }
-            let scheduler = &mut locked.scheduler;
+            let scheduler = &locked.scheduler;
            // Right now we only perform the operation on a single node without parallelization
            // TODO fan out the operation to multiple nodes for better performance
            let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
@@ -4762,7 +4780,7 @@ impl Service {
        //
        // The transition we calculate here remains valid later in the function because we hold the op lock on the node:
        // nothing else can mutate its availability while we run.
-        let availability_transition = if let Some(input_availability) = availability.as_ref() {
+        let availability_transition = if let Some(input_availability) = availability {
            let (activate_node, availability_transition) = {
                let locked = self.inner.read().unwrap();
                let Some(node) = locked.nodes.get(&node_id) else {
@@ -4798,8 +4816,8 @@ impl Service {
            ));
        };

-        if let Some(availability) = availability.as_ref() {
-            node.set_availability(availability.clone());
+        if let Some(availability) = &availability {
+            node.set_availability(*availability);
        }

        if let Some(scheduling) = scheduling {
@@ -6359,4 +6377,42 @@ impl Service {

        global_observed
    }
+
+    /// Ask the given `leader` (the controller currently registered as leader) to step down.
+    ///
+    /// Returns `Some` with the observed state reported by that leader when the
+    /// step-down request succeeds. If the request fails, the error is logged and
+    /// `None` is returned, so no observed state from the previous leader is available.
+    ///
+    /// This function does not query the database and does not terminate the process:
+    /// looking up the leader and handling lookup failures is the caller's job.
+    async fn request_step_down(
+        &self,
+        leader: &ControllerPersistence,
+    ) -> Option<GlobalObservedState> {
+        tracing::info!("Sending step down request to {leader:?}");
+
+        // TODO: jwt token
+        let client = PeerClient::new(
+            Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
+            self.config.jwt_token.clone(),
+        );
+        let state = client.step_down(&self.cancel).await;
+        match state {
+            Ok(state) => Some(state),
+            Err(err) => {
+                // TODO: Make leaders periodically update a timestamp field in the
+                // database and, if the leader is not reachable from the current instance,
+                // but inferred as alive from the timestamp, abort start-up. This avoids
+                // a potential scenario in which we have two controllers acting as leaders.
+                tracing::error!(
+                    "Leader ({}) did not respond to step-down request: {}",
+                    leader.address,
+                    err
+                );
+
+                None
+            }
+        }
+    }
 }
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -779,7 +779,7 @@ impl TenantShard {
    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
    pub(crate) fn optimize_secondary(
        &self,
-        scheduler: &mut Scheduler,
+        scheduler: &Scheduler,
        schedule_context: &ScheduleContext,
    ) -> Option<ScheduleOptimization> {
        if self.intent.secondary.is_empty() {
@@ -1595,7 +1595,7 @@ pub(crate) mod tests {
        schedule_context.avoid(&shard_b.intent.all_pageservers());
        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());

-        let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context);
+        let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);

        // Since there is a node with no locations available, the node with two locations for the
        // same tenant should generate an optimization to move one away
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -1,10 +1,10 @@
 use std::collections::{HashMap, HashSet};

 use anyhow::Context;
+use aws_sdk_s3::Client;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
-use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};
 use utils::generation::Generation;
 use utils::id::TimelineId;
@@ -16,7 +16,7 @@ use futures_util::StreamExt;
 use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
-use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
+use remote_storage::RemotePath;

 pub(crate) struct TimelineAnalysis {
    /// Anomalies detected
@@ -48,12 +48,13 @@ impl TimelineAnalysis {
 }

 pub(crate) async fn branch_cleanup_and_check_errors(
-    remote_client: &GenericRemoteStorage,
+    s3_client: &Client,
+    target: &RootTarget,
    id: &TenantShardTimelineId,
    tenant_objects: &mut TenantObjectListing,
    s3_active_branch: Option<&BranchData>,
    console_branch: Option<BranchData>,
-    s3_data: Option<RemoteTimelineBlobData>,
+    s3_data: Option<S3TimelineBlobData>,
 ) -> TimelineAnalysis {
    let mut result = TimelineAnalysis::new();

@@ -77,9 +78,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(

    match s3_data {
        Some(s3_data) => {
-            result
-                .garbage_keys
-                .extend(s3_data.unknown_keys.into_iter().map(|k| k.key.to_string()));
+            result.garbage_keys.extend(s3_data.unknown_keys);

            match s3_data.blob_data {
                BlobDataParseResult::Parsed {
@@ -144,13 +143,16 @@ pub(crate) async fn branch_cleanup_and_check_errors(

                            // HEAD request used here to address a race condition  when an index was uploaded concurrently
                            // with our scan. We check if the object is uploaded to S3 after taking the listing snapshot.
-                            let response = remote_client
-                                .head_object(&path, &CancellationToken::new())
+                            let response = s3_client
+                                .head_object()
+                                .bucket(target.bucket_name())
+                                .key(path.get_path().as_str())
+                                .send()
                                .await;

                            if response.is_err() {
                                // Object is not present.
-                                let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta());
+                                let is_l0 = LayerMap::is_l0(layer.key_range());

                                let msg = format!(
                                    "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",
@@ -282,14 +284,14 @@ impl TenantObjectListing {
 }

 #[derive(Debug)]
-pub(crate) struct RemoteTimelineBlobData {
+pub(crate) struct S3TimelineBlobData {
    pub(crate) blob_data: BlobDataParseResult,

    // Index objects that were not used when loading `blob_data`, e.g. those from old generations
-    pub(crate) unused_index_keys: Vec<ListingObject>,
+    pub(crate) unused_index_keys: Vec<String>,

    // Objects whose keys were not recognized at all, i.e. not layer files, not indices
-    pub(crate) unknown_keys: Vec<ListingObject>,
+    pub(crate) unknown_keys: Vec<String>,
 }

 #[derive(Debug)]
@@ -321,37 +323,31 @@ pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generati
 }

 pub(crate) async fn list_timeline_blobs(
-    remote_client: &GenericRemoteStorage,
+    s3_client: &Client,
    id: TenantShardTimelineId,
-    root_target: &RootTarget,
-) -> anyhow::Result<RemoteTimelineBlobData> {
+    s3_root: &RootTarget,
+) -> anyhow::Result<S3TimelineBlobData> {
    let mut s3_layers = HashSet::new();

    let mut errors = Vec::new();
    let mut unknown_keys = Vec::new();

-    let mut timeline_dir_target = root_target.timeline_root(&id);
+    let mut timeline_dir_target = s3_root.timeline_root(&id);
    timeline_dir_target.delimiter = String::new();

-    let mut index_part_keys: Vec<ListingObject> = Vec::new();
+    let mut index_part_keys: Vec<String> = Vec::new();
    let mut initdb_archive: bool = false;

-    let prefix_str = &timeline_dir_target
-        .prefix_in_bucket
-        .strip_prefix("/")
-        .unwrap_or(&timeline_dir_target.prefix_in_bucket);
-
-    let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target));
+    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
    while let Some(obj) = stream.next().await {
-        let (key, Some(obj)) = obj? else {
-            panic!("ListingObject not specified");
-        };
+        let obj = obj?;
+        let key = obj.key();

-        let blob_name = key.get_path().as_str().strip_prefix(prefix_str);
+        let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
        match blob_name {
            Some(name) if name.starts_with("index_part.json") => {
                tracing::debug!("Index key {key}");
-                index_part_keys.push(obj)
+                index_part_keys.push(key.to_owned())
            }
            Some("initdb.tar.zst") => {
                tracing::debug!("initdb archive {key}");
@@ -362,7 +358,7 @@ pub(crate) async fn list_timeline_blobs(
            }
            Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
                Ok((new_layer, gen)) => {
-                    tracing::debug!("Parsed layer key: {new_layer} {gen:?}");
+                    tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen);
                    s3_layers.insert((new_layer, gen));
                }
                Err(e) => {
@@ -370,13 +366,13 @@ pub(crate) async fn list_timeline_blobs(
                    errors.push(
                        format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
                    );
-                    unknown_keys.push(obj);
+                    unknown_keys.push(key.to_string());
                }
            },
            None => {
-                tracing::warn!("Unknown key {key}");
+                tracing::warn!("Unknown key {}", key);
                errors.push(format!("S3 list response got an object with odd key {key}"));
-                unknown_keys.push(obj);
+                unknown_keys.push(key.to_string());
            }
        }
    }
@@ -385,7 +381,7 @@ pub(crate) async fn list_timeline_blobs(
        tracing::debug!(
            "Timeline is empty apart from initdb archive: expected post-deletion state."
        );
-        return Ok(RemoteTimelineBlobData {
+        return Ok(S3TimelineBlobData {
            blob_data: BlobDataParseResult::Relic,
            unused_index_keys: index_part_keys,
            unknown_keys: Vec::new(),
@@ -399,13 +395,13 @@ pub(crate) async fn list_timeline_blobs(
            // Stripping the index key to the last part, because RemotePath doesn't
            // like absolute paths, and depending on prefix_in_bucket it's possible
            // for the keys we read back to start with a slash.
-            let basename = key.key.get_path().as_str().rsplit_once('/').unwrap().1;
+            let basename = key.rsplit_once('/').unwrap().1;
            parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g))
        })
        .max_by_key(|i| i.1)
        .map(|(k, g)| (k.clone(), g))
    {
-        Some((key, gen)) => (Some::<ListingObject>(key.to_owned()), gen),
+        Some((key, gen)) => (Some(key), gen),
        None => {
            // Legacy/missing case: one or zero index parts, which did not have a generation
            (index_part_keys.pop(), Generation::none())
@@ -420,14 +416,17 @@ pub(crate) async fn list_timeline_blobs(
    }

    if let Some(index_part_object_key) = index_part_object.as_ref() {
-        let index_part_bytes =
-            download_object_with_retries(remote_client, &index_part_object_key.key)
-                .await
-                .context("index_part.json download")?;
+        let index_part_bytes = download_object_with_retries(
+            s3_client,
+            &timeline_dir_target.bucket_name,
+            index_part_object_key,
+        )
+        .await
+        .context("index_part.json download")?;

        match serde_json::from_slice(&index_part_bytes) {
            Ok(index_part) => {
-                return Ok(RemoteTimelineBlobData {
+                return Ok(S3TimelineBlobData {
                    blob_data: BlobDataParseResult::Parsed {
                        index_part: Box::new(index_part),
                        index_part_generation,
@@ -449,7 +448,7 @@ pub(crate) async fn list_timeline_blobs(
        );
    }

-    Ok(RemoteTimelineBlobData {
+    Ok(S3TimelineBlobData {
        blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
        unused_index_keys: index_part_keys,
        unknown_keys,
--- a/storage_scrubber/src/find_large_objects.rs
+++ b/storage_scrubber/src/find_large_objects.rs
@@ -6,7 +6,7 @@ use remote_storage::ListingMode;
 use serde::{Deserialize, Serialize};

 use crate::{
-    checks::parse_layer_object_name, init_remote, metadata_stream::stream_tenants,
+    checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
    stream_objects_with_retries, BucketConfig, NodeKind,
 };

@@ -50,8 +50,9 @@ pub async fn find_large_objects(
    ignore_deltas: bool,
    concurrency: usize,
 ) -> anyhow::Result<LargeObjectListing> {
-    let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
-    let tenants = pin!(stream_tenants(&remote_client, &target));
+    let (remote_client, target) =
+        init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let tenants = pin!(stream_tenants_generic(&remote_client, &target));

    let objects_stream = tenants.map_ok(|tenant_shard_id| {
        let mut tenant_root = target.tenant_root(&tenant_shard_id);
--- a/Show More
+++ b/Show More