Compare commits

..

64 Commits

Author SHA1 Message Date
Anastasia Lubennikova
aa8c5d1ee9 Merge pull request #8858 from neondatabase/releases/2024-08-28-compute-only
Compute release 2024-08-28
2024-08-28 20:00:51 +01:00
Alexey Kondratov
7820c572e7 fix(sql-exporter): Remove tenant_id from compute_logical_snapshot_files
It turned out that it is already auto-added to all metrics [1].

[1]: 3a907c317c/apps/base/ext-vmagent/vmagent.yaml (L43)
2024-08-27 00:51:23 +02:00
Alexey Kondratov
bf03713fa1 fix(sql-exporter): Fix typo in gauge
Commit f4b3c317f introduced a typo, which I missed during review.
2024-08-27 00:51:23 +02:00
Alex Chi Z.
0f65684263 feat(pageserver): use split layer writer in gc-compaction (#8608)
Part of #8002, the final big PR in the batch.

## Summary of changes

This pull request uses the new split layer writer in the gc-compaction.

* It changes how layers are split. Previously, we split layers at the
original split points, which created too many layers (in test_gc_feedback,
one key per layer).
* Therefore, we first verify whether the layer map can be processed by the
current algorithm (see https://github.com/neondatabase/neon/pull/8191;
it's basically the same check).
* If so, we proceed with the compaction. This way, it creates large
enough layers close to the target layer size.
* Added a new set of `with_discard` functions in the split layer writer.
They let us skip writing a layer if we are going to produce the same
persistent key.
* The delta writer keeps the updates of the same key in a single
file. This might create a very large layer, but we can optimize that
later.
* The split layer writer is used in the gc-compaction algorithm, and it
splits layers based on size.
* Fix a bug where the image layer summary block encoded the wrong key range.
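
For illustration, here is a minimal sketch of the discard idea; the types and
names are hypothetical, not the actual pageserver API. The writer buffers
values for one layer, and at finish time the caller can choose to discard the
would-be layer, e.g. because an identical persistent layer already exists:

```rust
/// Hypothetical sketch of a split layer writer with a `with_discard`-style finish.
struct SketchLayerWriter {
    key_range: std::ops::Range<u64>, // stand-in for the real Key type
    buffered: Vec<(u64, Vec<u8>)>,   // (key, value) pairs destined for this layer
}

impl SketchLayerWriter {
    fn put(&mut self, key: u64, value: Vec<u8>) {
        self.buffered.push((key, value));
    }

    /// `discard` is given the identity of the layer that would be produced
    /// (here just its key range). Returning `true` skips writing it, which is
    /// how gc-compaction avoids re-creating an identical persistent layer.
    fn finish_with_discard(
        self,
        discard: impl Fn(&std::ops::Range<u64>) -> bool,
    ) -> Option<Vec<(u64, Vec<u8>)>> {
        if discard(&self.key_range) {
            None // keep the existing layer, produce nothing
        } else {
            Some(self.buffered) // stand-in for flushing a real layer file
        }
    }
}
```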

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
Co-authored-by: Christian Schwarz <christian@neon.tech>
2024-08-26 14:19:47 -04:00
Christian Schwarz
97241776aa pageserver: startup: ensure local disk state is durable (#8835)
refs https://github.com/neondatabase/neon/issues/6989

Problem
-------

After an unclean shutdown, we get restarted, start reading the local
filesystem, and make decisions based on those reads. However, some of the
data might not yet have been fsynced when the unclean shutdown happened.

Durability matters even though pageservers are conceptually just a cache
of state in S3. For example:
- the cloud control plane is not a control loop => pageserver responses
  to tenant attachment, etc., need to be durable.
  - the storage controller does not rely on this (as much?)
- we don't have layer file checksumming, so downloaded+renamed but not
  fsynced layer files are technically not to be trusted
  - https://github.com/neondatabase/neon/issues/2683

Solution
--------

`syncfs` the tenants directory during startup, before we start reading
from it.

This is a bit of overkill because we remove some temp files
(InMemoryLayer!) later during startup anyway, and those temp files are
particularly likely to be dirty in the kernel page cache. However, we
don't want to refactor that cleanup code right now, and the amount of
dirty data on pageservers is generally not that high. Lastly, with
[direct IO](https://github.com/neondatabase/neon/issues/8130) we're
going to have a near-zero kernel page cache quite soon anyway.
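
As a rough illustration of the approach (assuming Linux and the `libc` crate;
this is not the pageserver's actual startup code), syncing the whole
filesystem that holds the tenants directory could look like this:

```rust
use std::fs::File;
use std::os::fd::AsRawFd;
use std::path::Path;

// Open the tenants directory and syncfs(2) its filesystem so that all
// pending writes are durable before we read local state and act on it.
fn syncfs_dir(path: &Path) -> std::io::Result<()> {
    let dir = File::open(path)?; // a read-only fd on the directory is enough
    // SAFETY: syncfs only needs a valid fd and does not touch user memory.
    let rc = unsafe { libc::syncfs(dir.as_raw_fd()) };
    if rc == 0 {
        Ok(())
    } else {
        Err(std::io::Error::last_os_error())
    }
}
```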
2024-08-26 18:07:55 +02:00
Arpad Müller
2dd53e7ae0 Timeline archival test (#8824)
This PR:

* Implements the rule that archived timelines require all of their
children to be archived as well, as specified in the RFC. There is no
fancy locking mechanism though, so the precondition can still be broken.
As a TODO for later, we still allow unarchiving timelines with archived
parents.
* Adds an `is_archived` flag to `TimelineInfo`
* Adds timeline_archival_config to `PageserverHttpClient`
* Adds a new `test_timeline_archive` test, loosely based on
`test_timeline_delete`

Part of #8088
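
A hypothetical sketch of the archival precondition described above (the real
check lives in the pageserver and, as noted, is not protected by any locking):

```rust
// Illustrative only: a timeline may be archived only if all of its children
// are already archived; unarchiving under an archived parent is still allowed
// for now, per the TODO above.
struct TimelineNode {
    is_archived: bool,
    children: Vec<TimelineNode>,
}

fn can_archive(timeline: &TimelineNode) -> bool {
    timeline.children.iter().all(|child| child.is_archived)
}
```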
2024-08-26 17:30:19 +02:00
Folke Behrens
d6eede515a proxy: clippy lints: handle some low hanging fruit (#8829)
Should be mostly uncontroversial ones.
2024-08-26 15:16:54 +02:00
Alexey Kondratov
d48229f50f feat(compute): Introduce new compute_subscriptions_count metric (#8796)
## Problem

We need a metric to get a glimpse of how many users rely on inbound
logical replication (where Neon is the subscriber).

## Summary of changes

This commit adds a new metric, `compute_subscriptions_count`, which is
the number of subscriptions grouped by enabled/disabled state.

Resolves: neondatabase/cloud#16146
2024-08-26 14:34:18 +02:00
Christian Schwarz
4355dba46c Merge pull request #8827 from neondatabase/rc/2024-08-26
Storage & Compute release 2024-08-26
2024-08-26 12:10:03 +02:00
Jakub Kołodziejczak
cdfdcd3e5d chore: improve markdown formatting (#8825)
fixes:

![Screenshot_2024-08-25_16-25-30](https://github.com/user-attachments/assets/c993309b-6c2d-4938-9fd0-ce0953fc63ff)

fixes:

![Screenshot_2024-08-25_16-26-29](https://github.com/user-attachments/assets/cf497f4a-d9e3-45a6-a1a5-7e215d96d022)
2024-08-25 16:33:45 +01:00
Conrad Ludgate
06795c6b9a proxy: new local-proxy application (#8736)
Add a binary for local-proxy that uses the local auth backend. It runs only
the HTTP serverless driver support and offers config reload based on a
config file and SIGHUP.
2024-08-23 22:32:10 +01:00
Conrad Ludgate
701cb61b57 proxy: local auth backend (#8806)
Adds a Local authentication backend. Updates the HTTP path to extract JWT
bearer tokens and pass them to the local backend for validation.
2024-08-23 18:48:06 +00:00
John Spray
0aa1450936 storage controller: enable timeline CRUD operations to run concurrently with reconciliation & make them safer (#8783)
## Problem

- If a reconciler was waiting to be able to notify computes about a
change, but the control plane was waiting for the controller to finish a
timeline creation/deletion, the overall system could deadlock.
- If a tenant shard was migrated concurrently with a timeline
creation/deletion, there was a risk that the timeline operation would be
applied to a non-latest-generation location, and thereby not really be
persistent. This has never happened in practice, but would eventually
happen at scale.

Closes: #8743 

## Summary of changes

- Introduce `Service::tenant_remote_mutation` helper, which looks up
shards & generations and passes them into an inner function that may do
remote I/O to pageservers. Before returning success, this helper checks
that generations haven't incremented, to guarantee that changes are
persistent.
- Convert tenant_timeline_create, tenant_timeline_delete, and
tenant_timeline_detach_ancestor to use this helper.
- These functions no longer block on ensure_attached unless the tenant
was never attached at all, so they should make progress even if we can't
complete compute notifications.

This increases the database load from timeline/create operations, but
only with cheap read transactions.
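
The shape of that helper, sketched with illustrative names and types (not the
real storage controller API):

```rust
// Look up shard generations, run the remote mutation against pageservers,
// then re-check the generations: if any of them advanced while we were
// working, the mutation may have hit a stale location and is not reported
// as successful.
async fn with_generation_check<T, Fut>(
    load_generations: impl Fn() -> Vec<(u64 /* shard */, u32 /* generation */)>,
    op: impl FnOnce(Vec<(u64, u32)>) -> Fut,
) -> Result<T, &'static str>
where
    Fut: std::future::Future<Output = T>,
{
    let before = load_generations();
    let result = op(before.clone()).await; // remote I/O to pageservers happens here
    let after = load_generations();
    if before == after {
        Ok(result)
    } else {
        Err("generation advanced during mutation; changes may not be persistent")
    }
}
```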
2024-08-23 18:56:05 +01:00
John Spray
b65a95f12e controller: use PageserverUtilization for scheduling (#8711)
## Problem

Previously, the controller used only shard counts for scheduling. This
works well when hosting only many-sharded tenants, but much less well
when hosting single-sharded tenants, which show a greater deviation in
size per shard.

Closes: https://github.com/neondatabase/neon/issues/7798

## Summary of changes

- Instead of UtilizationScore, carry the full PageserverUtilization
through into the Scheduler.
- Use the PageserverUtilization::score() instead of shard count when
ordering nodes in scheduling.

Q: Why did test_sharding_split_smoke need updating in this PR?
A: There's an interesting side effect during shard splits: because we do
not decrement the shard count in the utilization when we de-schedule the
shards from before the split, the controller will now prefer to pick
_different_ nodes for shards compared with which ones held secondaries
before the split. We could use our knowledge of splitting to fix up the
utilizations more actively in this situation, but I'm leaning toward
leaving the code simpler, as in practical systems the impact of one
shard on the utilization of a node should be fairly low (single digit
%).
2024-08-23 18:32:56 +01:00
Conrad Ludgate
c1cb7a0fa0 proxy: flesh out JWT verification code (#8805)
This change adds the necessary verification steps for the JWT
payload and adds per-role querying of JWKs, as needed for #8736.
2024-08-23 18:01:02 +01:00
Alex Chi Z.
f4cac1f30f impr(pageserver): error if keys are unordered in merge iter (#8818)
In case of corrupted delta layers, we can now detect the corruption and bail
out of the compaction.

## Summary of changes

* Detect a delta layer descriptor with a wrong key range
* Detect unordered delta entries
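
The gist of the check, as a sketch rather than the real merge iterator code:
remember the previous (key, lsn) pair and error out instead of continuing
when the next entry goes backwards.

```rust
// Illustrative: in the real code the entries come from delta layer iterators
// being merged; here they are just a slice of (key, lsn) pairs.
fn check_ordered(entries: &[(u64, u64)]) -> Result<(), String> {
    let mut prev: Option<(u64, u64)> = None;
    for &cur in entries {
        if let Some(p) = prev {
            if cur < p {
                return Err(format!(
                    "unordered entries: {cur:?} after {p:?} (corrupted delta layer?)"
                ));
            }
        }
        prev = Some(cur);
    }
    Ok(())
}
```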

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-08-23 16:38:42 +00:00
Conrad Ludgate
612b643315 update diesel (#8816)
https://rustsec.org/advisories/RUSTSEC-2024-0365
2024-08-23 15:28:22 +00:00
Vlad Lazar
bcc68a7866 storcon_cli: add support for drain and fill operations (#8791)
## Problem
We have been naughty and curl-ed storcon to fix-up drains and fills.

## Summary of changes
Add support for starting/cancelling drain/fill operations via
`storcon_cli`.
2024-08-23 14:48:06 +01:00
Joonas Koivunen
73286e6b9f test: copy dict to avoid error on retry (#8811)
There is no "const" in Python, so when we modify the global dict, it
stays modified on the retry. Fix this so it does not influence other
tests that might run on the same runner.

evidence:
<https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8625/10513146742/index.html#/testresult/453c4ce05ada7496>
2024-08-23 14:43:08 +01:00
Alex Chi Z.
bc8cfe1b55 fix(pageserver): l0 check criteria (#8797)
close https://github.com/neondatabase/neon/issues/8579

## Summary of changes

The `is_l0` check now takes both the layer key range and the layer type.
This allows us to have image layers covering the full key range in
bottom-most compaction (upcoming PR). However, we still don't allow delta
layers to cover the full key range, and I will make bottom-most compaction
generate delta layers whose key range covers only the keys that exist in
the layer, instead of `Key::MIN..Key::HACK_MAX` (upcoming PR).


Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-08-23 09:42:45 -04:00
Alex Chi Z.
6a74bcadec feat(pageserver): remove features=testing restriction for compact (#8815)
A small PR to make it possible to run force compaction in staging for
btm-gc compaction testing.

Part of https://github.com/neondatabase/neon/issues/8002

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-08-23 14:32:00 +01:00
Alexander Bayandin
e62cd9e121 CI(autocomment): add arch to build type (#8809)
## Problem

Failed / flaky tests for different arches are currently indistinguishable
in the GitHub autocomment.

## Summary of changes
- Add arch to build type for GitHub autocomment
2024-08-23 14:29:11 +01:00
Arpad Müller
e80ab8fd6a Update serde_json to 1.0.125 (#8813)
Updates `serde_json` to `1.0.125`, rolling out speedups added by a
serde_json contributor.

Release [link](https://github.com/serde-rs/json/releases/tag/1.0.125).
Blog post
[link](https://purplesyringa.moe/blog/i-sped-up-serde-json-strings-by-20-percent/).
2024-08-23 12:14:14 +01:00
MMeent
d8ca495eae Require poetry >=1.8 (#8812)
This was already a requirement for installing the python packages after
https://github.com/neondatabase/neon/pull/8609 got merged, so this
updates the documentation to reflect that.
2024-08-23 11:48:26 +01:00
Heikki Linnakangas
dbdb8a1187 Document how to use "git merge" for PostgreSQL minor version upgrades. (#8692)
Our new policy is to use the "rebase" method and slice all the Neon
commits into a nice patch set when doing a new major version, and to use
the "merge" method for minor version upgrades on the release branches.

"git merge" preserves the git history of Neon commits on the Postgres
branches. While it's nice to rebase all the Neon changes into a logical
patch set against upstream, having to do that between every minor release
is a fair amount of work, loses the history, and is more error-prone.
2024-08-23 09:15:55 +03:00
Tristan Partin
f7ab3ffcb7 Check that TERM != dumb before using colors in pre-commit.py
Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-08-22 18:03:45 -05:00
Tristan Partin
2f8d548a12 Update Postgres 16 to 16.4
Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-08-22 18:03:45 -05:00
Tristan Partin
66db381dc9 Update Postgres 15 to 15.8
Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-08-22 18:03:45 -05:00
Tristan Partin
6744ed19d8 Update Postgres 14 to 14.13
Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-08-22 18:03:45 -05:00
Tristan Partin
ae63ac7488 Write messages field by field instead of bytes sheet in test_simple_sync_safekeepers
Co-authored-by: Arseny Sher <ars@neon.tech>
2024-08-22 18:03:45 -05:00
Alex Chi Z.
6eb638f4b3 feat(pageserver): warn on aux v1 tenants + default to v2 (#8625)
part of https://github.com/neondatabase/neon/issues/8623

We want to discover potential aux v1 customers that we might have missed
during the migrations.

## Summary of changes

Log warnings on basebackup, load timeline, and the first put_file.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-08-22 22:31:38 +01:00
Konstantin Knizhnik
7a485b599b Fix race condition in LRU list update in get_cached_relsize (#8807)
## Problem

See https://neondb.slack.com/archives/C07J14D8GTX/p1724347552023709.
Manipulations of the LRU list in the relation size cache are performed
under a shared lock.

## Summary of changes

Take an exclusive lock instead.
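
The principle behind the fix, sketched in Rust for illustration (the actual
code is in the C extension and uses Postgres locks): a lookup that also
reorders the LRU list mutates shared state, so it must hold the exclusive lock.

```rust
use std::collections::VecDeque;
use std::sync::RwLock;

// (relation id, cached size); front of the deque = most recently used.
struct RelsizeCache {
    lru: VecDeque<(u32, u64)>,
}

fn get_cached_relsize(cache: &RwLock<RelsizeCache>, rel: u32) -> Option<u64> {
    // Exclusive (write) lock: the lookup moves the entry to the front of the
    // LRU list, which is a mutation and would race under a shared lock.
    let mut guard = cache.write().unwrap();
    let pos = guard.lru.iter().position(|(r, _)| *r == rel)?;
    let entry = guard.lru.remove(pos)?;
    guard.lru.push_front(entry);
    Some(entry.1)
}
```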

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2024-08-22 23:53:37 +03:00
Joonas Koivunen
b1c457898b test_compatibility: flush in the end (#8804)
`test_forward_compatibility` still often fails at graceful
shutdown. Fix this with an explicit flush before shutdown.

Example:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/10506613738/index.html#testresult/5e7111907f7ecfb2/

Cc: #8655 and #8708
Previous attempt: #8787
2024-08-22 16:38:03 +01:00
Folke Behrens
1a9d559be8 proxy: Enable stricter/pedantic clippy checks (#8775)
Create a list of currently allowed exceptions that should be reduced
over time.
2024-08-22 13:29:05 +02:00
Alexey Kondratov
0e6c0d47a5 Revert "Use sycnhronous commit for logical replicaiton worker (#8645)" (#8792)
This reverts commit cbe8c77997.

This change was originally made to test a hypothesis, but the proper fix
#8669 has since been merged, so it is no longer needed. Moreover, the
test is still flaky, so this bug was probably not the cause of the
flakiness.

Related to #8097
2024-08-22 12:52:36 +02:00
Arpad Müller
d645645fab Sleep in test_scrubber_physical_gc (#8798)
This copies a piece of code from `test_scrubber_physical_gc_ancestors`
to fix a source of flakiness: later on we rely on things being older than
a second, but the test can run faster than that under optimal conditions
(as happened to me locally, but is also observable in
[this](https://neon-github-public-dev.s3.amazonaws.com/reports/main/10470762360/index.html#testresult/f713b02657db4b4c/retries)
allure report):

```
test_runner/regress/test_storage_scrubber.py:169: in test_scrubber_physical_gc
    assert gc_summary["remote_storage_errors"] == 0
E   assert 1 == 0
```
2024-08-22 12:45:29 +02:00
John Spray
7c74112b2a pageserver: batch InMemoryLayer puts, remove need to sort items by LSN during ingest (#8591)
## Problem/Solution

TimelineWriter::put_batch is simply a loop over individual puts. Each
put acquires and releases locks, and checks for potentially starting a
new layer. Batching these is more efficient, but more importantly
unlocks future changes where we can pre-build serialized buffers much
earlier in the ingest process, potentially even on the safekeeper
(imagine a future model where some variant of DatadirModification lives
on the safekeeper).

Ensuring that the values in put_batch are written to one layer also
enables a simplification upstream, where we no longer need to write
values in LSN-order. This saves us a sort, but also simplifies follow-on
refactors to DatadirModification: we can store metadata keys and data
keys separately at that level without needing to zip them together in
LSN order later.
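
To make the locking point concrete, a simplified sketch (the real types live
in the pageserver; this is illustrative only):

```rust
use std::sync::Mutex;

struct OpenLayer {
    values: Vec<(u64 /* lsn */, Vec<u8>)>,
}

struct TimelineWriter {
    open_layer: Mutex<OpenLayer>,
}

impl TimelineWriter {
    // Before: one lock acquisition (and one "should we roll the layer?"
    // decision) per value.
    fn put(&self, lsn: u64, value: Vec<u8>) {
        self.open_layer.lock().unwrap().values.push((lsn, value));
    }

    // After: a single acquisition for the whole batch; all values are
    // guaranteed to land in the same layer, so callers no longer need to
    // pre-sort them by LSN to keep layers well-formed.
    fn put_batch(&self, batch: Vec<(u64, Vec<u8>)>) {
        let mut layer = self.open_layer.lock().unwrap();
        layer.values.extend(batch);
    }
}
```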

## Why?

In this PR, these changes are simply optimizations, but they are
motivated by evolving the ingest path in the direction of extracting
DatadirModification from Timeline. It may not be obvious how right now,
but the general idea is that we'll end up with three phases
of ingest:
- A) Decode walrecords and build a datadirmodification with all the
simple data contents already in a big serialized buffer ready to write
to an ephemeral layer **<-- this part can be pipelined and parallelized,
and done on a safekeeper!**
- B) Let that datadirmodification see a Timeline, so that it can also
generate all the metadata updates that require a read-modify-write of
existing pages
- C) Dump the results of B into an ephemeral layer.

Related: https://github.com/neondatabase/neon/issues/8452

## Caveats

Doing a big monolithic buffer of values to write to disk is ordinarily
an anti-pattern: we prefer nice streaming I/O. However:
- In future, when we do this first decode stage on the safekeeper, it
would be inefficient to serialize a Vec of Value, and then later
deserialize it just to add blob size headers while writing into the
ephemeral layer format. The idea is that for bulk write data, we will
serialize exactly once.
- The monolithic buffer is a stepping stone to pipelining more of this:
by serializing earlier (rather than at the final put_value), we will be
able to parallelize the WAL decoding and bulk serialization of data page
writes.
- The ephemeral layer's buffered writer already stalls writes while it
waits to flush: so while yes we'll stall for a couple milliseconds to
write a couple megabytes, we already have stalls like this, just
distributed across smaller writes.

## Benchmarks

This PR is primarily a stepping stone to safekeeper ingest filtering,
but also provides a modest efficiency improvement to the `wal_recovery`
part of `test_bulk_ingest`.

test_bulk_ingest:

```
test_bulk_insert[neon-release-pg16].insert: 23.659 s
test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB
test_bulk_insert[neon-release-pg16].peak_mem: 626 MB
test_bulk_insert[neon-release-pg16].size: 0 MB
test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB
test_bulk_insert[neon-release-pg16].num_files_uploaded: 8 
test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB
test_bulk_insert[neon-release-pg16].wal_recovery: 18.981 s
test_bulk_insert[neon-release-pg16].compaction: 0.055 s

vs. tip of main:
test_bulk_insert[neon-release-pg16].insert: 24.001 s
test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB
test_bulk_insert[neon-release-pg16].peak_mem: 604 MB
test_bulk_insert[neon-release-pg16].size: 0 MB
test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB
test_bulk_insert[neon-release-pg16].num_files_uploaded: 8 
test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB
test_bulk_insert[neon-release-pg16].wal_recovery: 23.586 s
test_bulk_insert[neon-release-pg16].compaction: 0.054 s
```
2024-08-22 10:04:42 +00:00
Alex Chi Z.
a968554a8c fix(pageserver): unify initdb optimization for sparse keyspaces; fix force img generation (#8776)
close https://github.com/neondatabase/neon/issues/8558

* Directly generate image layers for sparse keyspaces during the initdb
optimization.
* Support forced image layer generation for sparse keyspaces.
* Fix a bug where the image layer key range was incorrect in the case of
duplicated keys (the added line: `start = img_range.end;`). This could
cause overlapping image layers and keys to disappear.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-08-21 21:25:21 +01:00
Joonas Koivunen
07b7c63975 test: avoid some too long shutdowns by flushing before shutdown (#8772)
After #8655, we needed to mark some tests to shut down immediately. To
aid these tests, try the new pattern of `flush_ep_to_pageserver`
followed by a non-compacting checkpoint. This moves the general
graceful-shutdown problem of having too much to flush at shutdown into
the test. Also, log how long the graceful shutdown took (when it
completes), for faster log eyeballing.

Fixes: #8712
Cc: #8715, #8708
2024-08-21 14:26:27 -04:00
Tristan Partin
04752dfa75 Prefix current_lsn with compute_
2024-08-21 12:39:02 -05:00
Tristan Partin
99c19cad24 Add compute_receive_lsn metric
Useful for dashboarding the replication metrics of a single endpoint.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-08-21 12:39:02 -05:00
Joonas Koivunen
b83d722369 test: fix more flaky due to graceful shutdown (#8787)
Going through the list of recent flaky tests, trying to fix those
related to graceful shutdown.

- test_forward_compatibility: flush and wait for uploads to avoid
graceful shutdown
- test_layer_bloating: in the end the endpoint and vanilla are still up
=> immediate shutdown
- test_lagging_sk: pageserver shutdown is not related to the test =>
immediate shutdown
- test_lsn_lease_size: pageserver flushing is not needed => immediate
shutdown

Additionally:
- remove `wait_for_upload` usage from workload fixture

Cc: #8708
Fixes: #8710
2024-08-21 17:22:47 +01:00
Arseny Sher
d919770c55 safekeeper: add listing timelines
Adds an endpoint, GET /tenant/timeline, that lists all non-deleted timelines.
2024-08-21 18:38:08 +03:00
Tristan Partin
f4b3c317f3 Add compute_logical_snapshot_files metric
Track the number of logical snapshot files on an endpoint over time.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-08-21 10:33:44 -05:00
Conrad Ludgate
428b105dde remove workspace hack from libs (#8780)
This removes the workspace hack from all libs, but not from any binaries.
This does not change the behaviour of the hack.

Running
```
cargo clean
cargo build --release --bin proxy
```

Before this change, it took 5m16s; after, it took 3m3s. This is
because the build can now be parallelized much more.
2024-08-21 14:45:32 +01:00
Alexander Bayandin
75175f3628 CI(build-and-test): run regression tests on arm (#8552)
## Problem

We want to run our regression test suite on ARM.

## Summary of changes
- run regression tests on release ARM builds
- run `build-neon` (including rust tests) on debug ARM builds
- add `arch` parameter to test to distinguish them in the allure report
and in a database
2024-08-21 14:29:11 +01:00
Joonas Koivunen
3b8016488e test: test_timeline_ancestor_detach_errors rare allowed_error (#8782)
Add another allowed_error for this rare case.

Fixes: #8773
2024-08-21 12:51:08 +01:00
Joonas Koivunen
477246f42c storcon: handle heartbeater shutdown gracefully (#8767)
If a heartbeat happens during shutdown, the task is already
cancelled and will not be sending responses.

Fixes: #8766
2024-08-21 12:28:27 +01:00
Christian Schwarz
21b684718e pageserver: add counter for wait time on background loop semaphore (#8769)
## Problem

Compaction jobs and other background loops are concurrency-limited
through a global semaphore.

The current counters allow quantifying how _many_ tasks are waiting.
But there is no way to tell how _much_ delay is added by the semaphore.

So, add a counter that aggregates the wall clock time seconds spent
acquiring the semaphore.

The metrics can be used as follows:

* retroactively calculate average acquisition time in a given time range
* compare the degree of background loop backlog among pageservers

The metric is insufficient to calculate:

* the run-up of ongoing acquisitions that haven't finished acquiring yet
  * not easily feasible, because ["Cancelling a call to acquire makes you
lose your place in the
queue"](https://docs.rs/tokio/latest/tokio/sync/struct.Semaphore.html#method.acquire)

## Summary of changes

* Refactor the metrics to follow the current best practice for typed
metrics in `metrics.rs`.
* Add the new counter.
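
A sketch of what the new counter measures; the metric plumbing and names here
are illustrative, not the actual `metrics.rs` code:

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;
use tokio::sync::Semaphore;

// Stand-in for a prometheus counter: total microseconds spent waiting to
// acquire the background-loop semaphore, summed across all tasks.
async fn run_background_iteration(
    semaphore: Arc<Semaphore>,
    wait_micros_total: Arc<AtomicU64>,
) {
    let started = Instant::now();
    let _permit = semaphore.acquire().await.expect("semaphore closed");
    // Only the acquisition wait is recorded, not the work itself.
    wait_micros_total.fetch_add(started.elapsed().as_micros() as u64, Ordering::Relaxed);

    // ... run one compaction / gc / eviction iteration here ...
}
```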
2024-08-21 10:55:01 +00:00
Peter Bendel
6d8572ded6 Benchmarking: need to checkout actions to download Neon artifacts (#8770)
## Problem

The database preparation workflow needs Neon artifacts but does not check
out the necessary download action.

We were lucky in a few runs, like this one:

https://github.com/neondatabase/neon/actions/runs/10413970941/job/28870668020

but this is a flaky race condition, which failed here:

https://github.com/neondatabase/neon/actions/runs/10446395644/job/28923749772#step:4:1



## Summary of changes

Check out the code (including actions) before invoking the download action

Successful test run
https://github.com/neondatabase/neon/actions/runs/10469356296/job/28992200694
2024-08-21 08:08:49 +01:00
Alex Chi Z.
c8b9116a97 impr(pageserver): abort on fatal I/O writer error (#8777)
part of https://github.com/neondatabase/neon/issues/8140

The blob writer path now uses `maybe_fatal_err`

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-08-20 20:05:33 +01:00
John Spray
beefc7a810 pageserver: add metric pageserver_secondary_heatmap_total_size (#8768)
## Problem

We don't have a convenient way for a human to ask "how far along are the
secondary downloads for this tenant?".

This is useful when driving migrations of tenants to the storage
controller, as we first create a secondary location and want to see it
warm up before we cut over. That can already be done via storcon_cli,
but we would like a way that doesn't require direct API access.

## Summary of changes

Add a metric that reports the total size of layers in the heatmap: this
may be used in conjunction with the existing
`pageserver_secondary_resident_physical_size` to estimate the "warmth" of
the secondary location.
2024-08-20 19:47:42 +01:00
Vlad Lazar
fa0750a37e storcon: add peer jwt token (#8764)
## Problem

Storage controllers did not have the right token to speak to their peers
for leadership transitions.

## Summary of changes

Accept a peer jwt token for the storage controller.

Epic: https://github.com/neondatabase/cloud/issues/14701
2024-08-20 15:25:21 +01:00
Conrad Ludgate
0170611a97 proxy: small changes (#8752)
## Problem

#8736 is getting too big, so this splits off some simple changes.

## Summary of changes

Local proxy won't always be using TLS, so make it optional. Local proxy
won't be using WebSockets for now, so make that optional too. Remove a
dead config var.
2024-08-20 14:16:27 +01:00
Vlad Lazar
1c96957e85 storcon: run db migrations after step down sequence (#8756)
## Problem

Previously, we would run db migrations before doing the step-down
sequence. This meant that the current leader would have to deal with
the schema changes and that's generally not safe.

## Summary of changes

Push the step-down procedure earlier in start-up and
do the db migrations right after it (but before we load up the in-memory
state from the db).

Epic: https://github.com/neondatabase/cloud/issues/14701
2024-08-20 14:00:36 +01:00
John Spray
02a28c01ca Revert "safekeeper: check for non-consecutive writes in safekeeper.rs" (#8771)
Reverts neondatabase/neon#8640

This broke `test_last_log_term_switch` via a merge race of some kind.
2024-08-20 11:34:53 +00:00
Alexander Bayandin
c96593b473 Make Postgres 16 default version (#8745)
## Problem

The default Postgres version is set to 15 in code, while we use 16 in
most other places (and Postgres 17 is coming).

## Summary of changes
- Run `benchmarks` job with Postgres 16 (instead of Postgres 14)
- Set `DEFAULT_PG_VERSION` to 16 in all places
- Remove deprecated `--pg-version` pytest argument
- Update `test_metadata_bincode_serde_ensure_roundtrip` for Postgres 16
2024-08-20 10:46:58 +01:00
Christian Schwarz
ef57e73fbf task_mgr::spawn: require a TenantId (#8462)
… to dis-incentivize global tasks via task_mgr in the future

(As of https://github.com/neondatabase/neon/pull/8339 all remaining
task_mgr usage is tenant or timeline scoped.)
2024-08-20 08:26:44 +00:00
Arseny Sher
4c5a0fdc75 safekeeper: check for non-consecutive writes in safekeeper.rs
wal_storage.rs already checks this, but since this is quite a legit
scenario, check it at the safekeeper.rs (consensus) level as well.

ref https://github.com/neondatabase/neon/issues/8212
2024-08-20 07:12:56 +03:00
Arpad Müller
4b26783c94 scrubber: remove _generic postfix and two unused functions (#8761)
Removes the `_generic` postfix from the APIs that use
`GenericRemoteStorage`, as `remote_storage` is the "default" now, and adds
an `_s3` postfix to the remaining APIs using the S3 SDK (only in tenant
snapshot). Also removes two unused functions: `list_objects_with_retries`
and `stream_tenants`.

Part of https://github.com/neondatabase/neon/issues/7547
2024-08-19 23:58:47 +02:00
Arpad Müller
6949b45e17 Update aws -> infra for repo rename (#8755)
See slack thread:
https://neondb.slack.com/archives/C039YKBRZB4/p1722501766006179
2024-08-19 17:44:10 +02:00
Arpad Müller
3b8ca477ab Migrate physical GC and scan_metadata to remote_storage (#8673)
Migrates most of the remaining parts of the scrubber to remote_storage:

* `pageserver_physical_gc`
* `scan_metadata` for pageservers (safekeepers were done in #8595)
* `download()` in `tenant_snapshot`. The main `tenant_snapshot` is not
migrated as it uses version history to be able to work in the face of
ongoing changes.
 
Part of #7547
2024-08-19 16:39:44 +02:00
Christian Schwarz
eb7241c798 l0_flush: remove support for mode page-cached (#8739)
It's been rolled out everywhere, no configs are referencing it.

All code that's made dead by the removal of the config option is removed
as part of this PR.

The `page_caching::PreWarmingWriter` in `::No` mode is equivalent to a
`size_tracking_writer`, so use that.

part of https://github.com/neondatabase/neon/issues/7418
2024-08-19 16:35:34 +02:00
Folke Behrens
f246aa3ca7 proxy: Fix some warnings by extended clippy checks (#8748)
* Add missing anonymous lifetimes, whose omission is now deprecated.
* Qualify enum variants in matches that could otherwise act like variable bindings.
* Add missing semicolons.
2024-08-19 10:33:46 +02:00
194 changed files with 4621 additions and 2556 deletions

View File

@@ -23,10 +23,30 @@ platforms = [
]
[final-excludes]
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
# from depending on workspace-hack because most of the dependencies are not used.
workspace-members = ["vm_monitor"]
workspace-members = [
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
# from depending on workspace-hack because most of the dependencies are not used.
"vm_monitor",
# All of these exist in libs and are not usually built independently.
# Putting workspace hack there adds a bottleneck for cargo builds.
"compute_api",
"consumption_metrics",
"desim",
"metrics",
"pageserver_api",
"postgres_backend",
"postgres_connection",
"postgres_ffi",
"pq_proto",
"remote_storage",
"safekeeper_api",
"tenant_size_model",
"tracing-utils",
"utils",
"wal_craft",
"walproposer",
]
# Write out exact versions rather than a semver range. (Defaults to false.)
# exact-versions = true

View File

@@ -43,7 +43,7 @@ inputs:
pg_version:
description: 'Postgres version to use for tests'
required: false
default: 'v14'
default: 'v16'
benchmark_durations:
description: 'benchmark durations JSON'
required: false
@@ -169,10 +169,8 @@ runs:
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
fi
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
cov_prefix=()
else
cov_prefix=()
fi

View File

@@ -48,6 +48,8 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:

View File

@@ -94,11 +94,16 @@ jobs:
# We run tests with addtional features, that are turned off by default (e.g. in release builds), see
# corresponding Cargo.toml files for their descriptions.
- name: Set env variables
env:
ARCH: ${{ inputs.arch }}
run: |
CARGO_FEATURES="--features testing"
if [[ $BUILD_TYPE == "debug" ]]; then
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FLAGS="--locked"
elif [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=""
CARGO_FLAGS="--locked"
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FLAGS="--locked --release"
@@ -158,6 +163,8 @@ jobs:
# Do install *before* running rust tests because they might recompile the
# binaries with different features/flags.
- name: Install rust binaries
env:
ARCH: ${{ inputs.arch }}
run: |
# Install target binaries
mkdir -p /tmp/neon/bin/
@@ -172,7 +179,7 @@ jobs:
done
# Install test executables and write list of all binaries (for code coverage)
if [[ $BUILD_TYPE == "debug" ]]; then
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/
@@ -243,8 +250,8 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
# Run test on x64 only
if: inputs.arch == 'x64'
# Don't run regression tests on debug arm64 builds
if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
needs: [ build-neon ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
container:

View File

@@ -198,7 +198,7 @@ jobs:
strategy:
fail-fast: false
matrix:
arch: [ x64 ]
arch: [ x64, arm64 ]
# Do not build or run tests in debug for release branches
build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
include:
@@ -280,6 +280,7 @@ jobs:
save_perf_report: ${{ github.ref_name == 'main' }}
extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
pg_version: v16
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -985,10 +986,10 @@ jobs:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
@@ -998,14 +999,14 @@ jobs:
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
-f deployStorage=false \
@@ -1015,7 +1016,7 @@ jobs:
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
-f branch=main \

Cargo.lock generated
View File

@@ -1208,7 +1208,6 @@ dependencies = [
"serde_json",
"serde_with",
"utils",
"workspace_hack",
]
[[package]]
@@ -1321,7 +1320,6 @@ dependencies = [
"serde",
"serde_with",
"utils",
"workspace_hack",
]
[[package]]
@@ -1670,14 +1668,13 @@ dependencies = [
"smallvec",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "diesel"
version = "2.2.1"
version = "2.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
dependencies = [
"bitflags 2.4.1",
"byteorder",
@@ -3147,7 +3144,6 @@ dependencies = [
"rand 0.8.5",
"rand_distr",
"twox-hash",
"workspace_hack",
]
[[package]]
@@ -3791,7 +3787,6 @@ dependencies = [
"strum_macros",
"thiserror",
"utils",
"workspace_hack",
]
[[package]]
@@ -4193,7 +4188,6 @@ dependencies = [
"tokio-rustls 0.25.0",
"tokio-util",
"tracing",
"workspace_hack",
]
[[package]]
@@ -4206,7 +4200,6 @@ dependencies = [
"postgres",
"tokio-postgres",
"url",
"workspace_hack",
]
[[package]]
@@ -4229,7 +4222,6 @@ dependencies = [
"serde",
"thiserror",
"utils",
"workspace_hack",
]
[[package]]
@@ -4267,7 +4259,6 @@ dependencies = [
"thiserror",
"tokio",
"tracing",
"workspace_hack",
]
[[package]]
@@ -4832,7 +4823,6 @@ dependencies = [
"toml_edit 0.19.10",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
@@ -5357,7 +5347,6 @@ dependencies = [
"serde",
"serde_with",
"utils",
"workspace_hack",
]
[[package]]
@@ -5601,11 +5590,12 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.96"
version = "1.0.125"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
@@ -6193,7 +6183,6 @@ dependencies = [
"anyhow",
"serde",
"serde_json",
"workspace_hack",
]
[[package]]
@@ -6794,7 +6783,6 @@ dependencies = [
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
"workspace_hack",
]
[[package]]
@@ -7012,7 +7000,6 @@ dependencies = [
"url",
"uuid",
"walkdir",
"workspace_hack",
]
[[package]]
@@ -7091,7 +7078,6 @@ dependencies = [
"postgres_ffi",
"regex",
"utils",
"workspace_hack",
]
[[package]]
@@ -7112,7 +7098,6 @@ dependencies = [
"bindgen",
"postgres_ffi",
"utils",
"workspace_hack",
]
[[package]]
@@ -7669,8 +7654,6 @@ dependencies = [
"tokio",
"tokio-rustls 0.24.0",
"tokio-util",
"toml_datetime",
"toml_edit 0.19.10",
"tonic",
"tower",
"tracing",

View File

@@ -113,7 +113,7 @@ md5 = "0.7.0"
measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.8"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"

View File

@@ -126,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
To run the integration tests or Python scripts (not required to use the code), install
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
#### Running neon database
@@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v
testing locally, it is convenient to run just one set of permutations, like this:
```sh
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
```
## Flamegraphs

View File

@@ -54,7 +54,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
const DEFAULT_BRANCH_NAME: &str = "main";
project_git_version!(GIT_VERSION);
const DEFAULT_PG_VERSION: &str = "15";
const DEFAULT_PG_VERSION: &str = "16";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

View File

@@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode;
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
use crate::safekeeper::SafekeeperNode;
pub const DEFAULT_PG_VERSION: u32 = 15;
pub const DEFAULT_PG_VERSION: u32 = 16;
//
// This data structures represents neon_local CLI config

View File

@@ -217,7 +217,7 @@ impl StorageController {
Ok(exitcode.success())
}
/// Create our database if it doesn't exist, and run migrations.
/// Create our database if it doesn't exist
///
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
@@ -382,7 +382,6 @@ impl StorageController {
)
.await?;
// Run migrations on every startup, in case something changed.
self.setup_database(postgres_port).await?;
}
@@ -454,6 +453,11 @@ impl StorageController {
let jwt_token =
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
args.push(format!("--jwt-token={jwt_token}"));
let peer_claims = Claims::new(None, Scope::Admin);
let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
.expect("failed to generate jwt token");
args.push(format!("--peer-jwt-token={peer_jwt_token}"));
}
if let Some(public_key) = &self.public_key {

View File

@@ -147,9 +147,9 @@ enum Command {
#[arg(long)]
threshold: humantime::Duration,
},
// Drain a set of specified pageservers by moving the primary attachments to pageservers
// Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
// outside of the specified set.
Drain {
BulkMigrate {
// Set of pageserver node ids to drain.
#[arg(long)]
nodes: Vec<NodeId>,
@@ -163,6 +163,34 @@ enum Command {
#[arg(long)]
dry_run: Option<bool>,
},
/// Start draining the specified pageserver.
/// The drain is complete when the schedulling policy returns to active.
StartDrain {
#[arg(long)]
node_id: NodeId,
},
/// Cancel draining the specified pageserver and wait for `timeout`
/// for the operation to be canceled. May be retried.
CancelDrain {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
timeout: humantime::Duration,
},
/// Start filling the specified pageserver.
/// The drain is complete when the schedulling policy returns to active.
StartFill {
#[arg(long)]
node_id: NodeId,
},
/// Cancel filling the specified pageserver and wait for `timeout`
/// for the operation to be canceled. May be retried.
CancelFill {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
timeout: humantime::Duration,
},
}
#[derive(Parser)]
@@ -249,6 +277,34 @@ impl FromStr for NodeAvailabilityArg {
}
}
async fn wait_for_scheduling_policy<F>(
client: Client,
node_id: NodeId,
timeout: Duration,
f: F,
) -> anyhow::Result<NodeSchedulingPolicy>
where
F: Fn(NodeSchedulingPolicy) -> bool,
{
let waiter = tokio::time::timeout(timeout, async move {
loop {
let node = client
.dispatch::<(), NodeDescribeResponse>(
Method::GET,
format!("control/v1/node/{node_id}"),
None,
)
.await?;
if f(node.scheduling) {
return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
}
}
});
Ok(waiter.await??)
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = Cli::parse();
@@ -628,7 +684,7 @@ async fn main() -> anyhow::Result<()> {
})
.await?;
}
Command::Drain {
Command::BulkMigrate {
nodes,
concurrency,
max_shards,
@@ -657,7 +713,7 @@ async fn main() -> anyhow::Result<()> {
}
if nodes.len() != node_to_drain_descs.len() {
anyhow::bail!("Drain requested for node which doesn't exist.")
anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
}
node_to_fill_descs.retain(|desc| {
@@ -669,7 +725,7 @@ async fn main() -> anyhow::Result<()> {
});
if node_to_fill_descs.is_empty() {
anyhow::bail!("There are no nodes to drain to")
anyhow::bail!("There are no nodes to migrate to")
}
// Set the node scheduling policy to draining for the nodes which
@@ -690,7 +746,7 @@ async fn main() -> anyhow::Result<()> {
.await?;
}
// Perform the drain: move each tenant shard scheduled on a node to
// Perform the migration: move each tenant shard scheduled on a node to
// be drained to a node which is being filled. A simple round robin
// strategy is used to pick the new node.
let tenants = storcon_client
@@ -703,13 +759,13 @@ async fn main() -> anyhow::Result<()> {
let mut selected_node_idx = 0;
struct DrainMove {
struct MigrationMove {
tenant_shard_id: TenantShardId,
from: NodeId,
to: NodeId,
}
let mut moves: Vec<DrainMove> = Vec::new();
let mut moves: Vec<MigrationMove> = Vec::new();
let shards = tenants
.into_iter()
@@ -739,7 +795,7 @@ async fn main() -> anyhow::Result<()> {
continue;
}
moves.push(DrainMove {
moves.push(MigrationMove {
tenant_shard_id: shard.tenant_shard_id,
from: shard
.node_attached
@@ -816,6 +872,67 @@ async fn main() -> anyhow::Result<()> {
failure
);
}
Command::StartDrain { node_id } => {
storcon_client
.dispatch::<(), ()>(
Method::PUT,
format!("control/v1/node/{node_id}/drain"),
None,
)
.await?;
println!("Drain started for {node_id}");
}
Command::CancelDrain { node_id, timeout } => {
storcon_client
.dispatch::<(), ()>(
Method::DELETE,
format!("control/v1/node/{node_id}/drain"),
None,
)
.await?;
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
let final_policy =
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
use NodeSchedulingPolicy::*;
matches!(sched, Active | PauseForRestart)
})
.await?;
println!(
"Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
);
}
Command::StartFill { node_id } => {
storcon_client
.dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
.await?;
println!("Fill started for {node_id}");
}
Command::CancelFill { node_id, timeout } => {
storcon_client
.dispatch::<(), ()>(
Method::DELETE,
format!("control/v1/node/{node_id}/fill"),
None,
)
.await?;
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
let final_policy =
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
use NodeSchedulingPolicy::*;
matches!(sched, Active)
})
.await?;
println!(
"Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
);
}
}
Ok(())

View File

@@ -14,7 +14,7 @@ picked tenant (which requested on-demand activation) for around 30 seconds
during the restart at 2024-04-03 16:37 UTC.
Note that lots of shutdowns on loaded pageservers do not finish within the
[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
This problem is not yet very acutely felt in storage controller managed pageservers since

View File

@@ -21,30 +21,21 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
1. Create a new branch based on the stable branch you are updating.
```shell
git checkout -b my-branch REL_15_STABLE_neon
git checkout -b my-branch-15 REL_15_STABLE_neon
```
1. Tag the last commit on the stable branch you are updating.
1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
```shell
git tag REL_15_3_neon
```
1. Push the new tag to the Neon Postgres repository.
```shell
git push origin REL_15_3_neon
```
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
1. Rebase the branch you created on the tag and resolve any conflicts.
1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts.
```shell
git fetch upstream REL_15_4
git rebase REL_15_4
git merge REL_15_4
```
In the commit message of the merge commit, mention if there were
any non-trivial conflicts or other issues.
1. Run the Postgres test suite to make sure our commits have not affected
Postgres in a negative way.
@@ -57,7 +48,7 @@ Postgres in a negative way.
1. Push your branch to the Neon Postgres repository.
```shell
git push origin my-branch
git push origin my-branch-15
```
1. Clone the Neon repository if you have not done so already.
@@ -74,7 +65,7 @@ branch.
1. Update the Git submodule.
```shell
git submodule set-branch --branch my-branch vendor/postgres-v15
git submodule set-branch --branch my-branch-15 vendor/postgres-v15
git submodule update --remote vendor/postgres-v15
```
@@ -89,14 +80,12 @@ minor Postgres release.
1. Create a pull request, and wait for CI to go green.
1. Force push the rebased Postgres branches into the Neon Postgres repository.
1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
```shell
git push --force origin my-branch:REL_15_STABLE_neon
git push origin my-branch-15:REL_15_STABLE_neon
```
It may require disabling various branch protections.
1. Update your Neon PR to point at the branches.
```shell

View File

@@ -14,5 +14,3 @@ regex.workspace = true
utils = { path = "../utils" }
remote_storage = { version = "0.1", path = "../remote_storage/" }
workspace_hack.workspace = true

View File

@@ -6,10 +6,8 @@ license = "Apache-2.0"
[dependencies]
anyhow.workspace = true
chrono.workspace = true
chrono = { workspace = true, features = ["serde"] }
rand.workspace = true
serde.workspace = true
serde_with.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -14,5 +14,3 @@ parking_lot.workspace = true
hex.workspace = true
scopeguard.workspace = true
smallvec = { workspace = true, features = ["write"] }
workspace_hack.workspace = true

View File

@@ -12,8 +12,6 @@ chrono.workspace = true
twox-hash.workspace = true
measured.workspace = true
workspace_hack.workspace = true
[target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true
measured-process.workspace = true

View File

@@ -21,11 +21,9 @@ hex.workspace = true
humantime.workspace = true
thiserror.workspace = true
humantime-serde.workspace = true
chrono.workspace = true
chrono = { workspace = true, features = ["serde"] }
itertools.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
bincode.workspace = true
rand.workspace = true

View File

@@ -8,6 +8,7 @@ use std::time::{Duration, Instant};
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId};
use crate::models::PageserverUtilization;
use crate::{
models::{ShardParameters, TenantConfig},
shard::{ShardStripeSize, TenantShardId},
@@ -140,23 +141,11 @@ pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
}
/// Utilisation score indicating how good a candidate a pageserver
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
/// Lower values are better.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
pub struct UtilizationScore(pub u64);
impl UtilizationScore {
pub fn worst() -> Self {
UtilizationScore(u64::MAX)
}
}
#[derive(Serialize, Clone, Copy, Debug)]
#[derive(Serialize, Clone, Debug)]
#[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability {
// Normal, happy state
Active(UtilizationScore),
Active(PageserverUtilization),
// Node is warming up, but we expect it to become available soon. Covers
// the time span between the re-attach response being composed on the storage controller
// and the first successful heartbeat after the processing of the re-attach response
@@ -195,7 +184,9 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
match val {
// Assume the worst utilisation score to begin with. It will later be updated by
// the heartbeats.
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
NodeAvailabilityWrapper::Active => {
NodeAvailability::Active(PageserverUtilization::full())
}
NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
}

View File

@@ -236,6 +236,15 @@ impl Key {
field5: u8::MAX,
field6: u32::MAX,
};
/// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers
pub const NON_L0_MAX: Key = Key {
field1: u8::MAX,
field2: u32::MAX,
field3: u32::MAX,
field4: u32::MAX,
field5: u8::MAX,
field6: u32::MAX - 1,
};
pub fn from_hex(s: &str) -> Result<Self> {
if s.len() != 36 {

View File

@@ -348,7 +348,7 @@ impl AuxFilePolicy {
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
pub fn default_tenant_config() -> Self {
Self::V1
Self::V2
}
}
@@ -718,6 +718,7 @@ pub struct TimelineInfo {
pub pg_version: u32,
pub state: TimelineState,
pub is_archived: bool,
pub walreceiver_status: String,

View File

@@ -38,7 +38,7 @@ pub struct PageserverUtilization {
pub max_shard_count: u32,
/// Cached result of [`Self::score`]
pub utilization_score: u64,
pub utilization_score: Option<u64>,
/// When was this snapshot captured, pageserver local time.
///
@@ -50,6 +50,8 @@ fn unity_percent() -> Percent {
Percent::new(0).unwrap()
}
pub type RawScore = u64;
impl PageserverUtilization {
const UTILIZATION_FULL: u64 = 1000000;
@@ -62,7 +64,7 @@ impl PageserverUtilization {
/// - Negative values are forbidden
/// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
/// layer eviction.
pub fn score(&self) -> u64 {
pub fn score(&self) -> RawScore {
let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
* self.disk_usable_pct.get() as u64)
/ 100;
@@ -74,8 +76,30 @@ impl PageserverUtilization {
std::cmp::max(disk_utilization_score, shard_utilization_score)
}
pub fn refresh_score(&mut self) {
self.utilization_score = self.score();
pub fn cached_score(&mut self) -> RawScore {
match self.utilization_score {
None => {
let s = self.score();
self.utilization_score = Some(s);
s
}
Some(s) => s,
}
}
/// If a node is currently hosting more work than it can comfortably handle. This does not indicate that
/// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
pub fn is_overloaded(score: RawScore) -> bool {
score >= Self::UTILIZATION_FULL
}
pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
if self.shard_count < shard_count {
self.shard_count = shard_count;
// Dirty cache: this will be calculated next time someone retrives the score
self.utilization_score = None;
}
}
/// A utilization structure that has a full utilization score: use this as a placeholder when
@@ -88,7 +112,38 @@ impl PageserverUtilization {
disk_usable_pct: Percent::new(100).unwrap(),
shard_count: 1,
max_shard_count: 1,
utilization_score: Self::UTILIZATION_FULL,
utilization_score: Some(Self::UTILIZATION_FULL),
captured_at: serde_system_time::SystemTime(SystemTime::now()),
}
}
}
/// Test helper
pub mod test_utilization {
use super::PageserverUtilization;
use std::time::SystemTime;
use utils::{
serde_percent::Percent,
serde_system_time::{self},
};
// Parameters of the imaginary node used for test utilization instances
const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
const TEST_SHARDS_MAX: u32 = 1000;
/// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do
/// not abuse this function from non-test code.
///
/// Emulates a node with a 1000 shard limit and a 1TB disk.
pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
PageserverUtilization {
disk_usage_bytes: disk_wanted_bytes,
free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
disk_wanted_bytes,
disk_usable_pct: Percent::new(100).unwrap(),
shard_count,
max_shard_count: TEST_SHARDS_MAX,
utilization_score: None,
captured_at: serde_system_time::SystemTime(SystemTime::now()),
}
}
@@ -120,7 +175,7 @@ mod tests {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
disk_wanted_bytes: u64::MAX,
utilization_score: 13,
utilization_score: Some(13),
disk_usable_pct: Percent::new(90).unwrap(),
shard_count: 100,
max_shard_count: 200,

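As a usage illustration only (not part of this diff), a hypothetical scheduler-side consumer of the new cached score could look roughly like this; the helper name `may_schedule` is assumed:

fn may_schedule(node: &mut PageserverUtilization, projected_shards: u32) -> bool {
    // Raising the shard count may dirty the cached score...
    node.adjust_shard_count_max(projected_shards);
    // ...in which case cached_score() recomputes it; otherwise it reuses the cache.
    let score = node.cached_score();
    !PageserverUtilization::is_overloaded(score)
}
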
View File

@@ -18,7 +18,6 @@ tokio-rustls.workspace = true
tracing.workspace = true
pq_proto.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
once_cell.workspace = true

View File

@@ -11,7 +11,5 @@ postgres.workspace = true
tokio-postgres.workspace = true
url.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
once_cell.workspace = true

View File

@@ -19,8 +19,6 @@ thiserror.workspace = true
serde.workspace = true
utils.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
env_logger.workspace = true
postgres.workspace = true

View File

@@ -14,8 +14,6 @@ postgres.workspace = true
postgres_ffi.workspace = true
camino-tempfile.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
regex.workspace = true
utils.workspace = true

View File

@@ -11,9 +11,7 @@ itertools.workspace = true
pin-project-lite.workspace = true
postgres-protocol.workspace = true
rand.workspace = true
tokio.workspace = true
tokio = { workspace = true, features = ["io-util"] }
tracing.workspace = true
thiserror.workspace = true
serde.workspace = true
workspace_hack.workspace = true

View File

@@ -32,7 +32,7 @@ scopeguard.workspace = true
metrics.workspace = true
utils.workspace = true
pin-project-lite.workspace = true
workspace_hack.workspace = true
azure_core.workspace = true
azure_identity.workspace = true
azure_storage.workspace = true
@@ -46,3 +46,4 @@ sync_wrapper = { workspace = true, features = ["futures"] }
camino-tempfile.workspace = true
test-context.workspace = true
rand.workspace = true
tokio = { workspace = true, features = ["test-util"] }

View File

@@ -383,6 +383,48 @@ impl RemoteStorage for AzureBlobStorage {
}
}
async fn head_object(
&self,
key: &RemotePath,
cancel: &CancellationToken,
) -> Result<ListingObject, DownloadError> {
let kind = RequestKind::Head;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let blob_client = self.client.blob_client(self.relative_path_to_name(key));
let properties_future = blob_client.get_properties().into_future();
let properties_future = tokio::time::timeout(self.timeout, properties_future);
let res = tokio::select! {
res = properties_future => res,
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
if let Ok(inner) = &res {
// do not include timeouts as errors in metrics, but do include cancellations
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, inner, started_at);
}
let data = match res {
Ok(Ok(data)) => Ok(data),
Ok(Err(sdk)) => Err(to_download_error(sdk)),
Err(_timeout) => Err(DownloadError::Timeout),
}?;
let properties = data.blob.properties;
Ok(ListingObject {
key: key.to_owned(),
last_modified: SystemTime::from(properties.last_modified),
size: properties.content_length,
})
}
async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,

View File

@@ -150,7 +150,7 @@ pub enum ListingMode {
NoDelimiter,
}
#[derive(PartialEq, Eq, Debug)]
#[derive(PartialEq, Eq, Debug, Clone)]
pub struct ListingObject {
pub key: RemotePath,
pub last_modified: SystemTime,
@@ -215,6 +215,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
Ok(combined)
}
/// Obtain metadata information about an object.
async fn head_object(
&self,
key: &RemotePath,
cancel: &CancellationToken,
) -> Result<ListingObject, DownloadError>;
/// Streams the local file contents into remote into the remote storage entry.
///
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
@@ -363,6 +370,20 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
/// See [`RemoteStorage::head_object`].
pub async fn head_object(
&self,
key: &RemotePath,
cancel: &CancellationToken,
) -> Result<ListingObject, DownloadError> {
match self {
Self::LocalFs(s) => s.head_object(key, cancel).await,
Self::AwsS3(s) => s.head_object(key, cancel).await,
Self::AzureBlob(s) => s.head_object(key, cancel).await,
Self::Unreliable(s) => s.head_object(key, cancel).await,
}
}
/// See [`RemoteStorage::upload`]
pub async fn upload(
&self,
@@ -598,6 +619,7 @@ impl ConcurrencyLimiter {
RequestKind::Delete => &self.write,
RequestKind::Copy => &self.write,
RequestKind::TimeTravel => &self.write,
RequestKind::Head => &self.read,
}
}

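A minimal, hedged usage sketch of the new `head_object` entry point (not part of this diff); the helper name `object_size` is hypothetical and error handling is simplified:

async fn object_size(
    storage: &GenericRemoteStorage,
    key: &RemotePath,
    cancel: &CancellationToken,
) -> Result<Option<u64>, DownloadError> {
    match storage.head_object(key, cancel).await {
        // Found: return the object size from the metadata.
        Ok(obj) => Ok(Some(obj.size)),
        // A missing object is an expected outcome, not a failure.
        Err(DownloadError::NotFound) => Ok(None),
        Err(e) => Err(e),
    }
}
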
View File

@@ -445,6 +445,20 @@ impl RemoteStorage for LocalFs {
}
}
async fn head_object(
&self,
key: &RemotePath,
_cancel: &CancellationToken,
) -> Result<ListingObject, DownloadError> {
let target_file_path = key.with_base(&self.storage_root);
let metadata = file_metadata(&target_file_path).await?;
Ok(ListingObject {
key: key.clone(),
last_modified: metadata.modified()?,
size: metadata.len(),
})
}
async fn upload(
&self,
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,

View File

@@ -13,6 +13,7 @@ pub(crate) enum RequestKind {
List = 3,
Copy = 4,
TimeTravel = 5,
Head = 6,
}
use scopeguard::ScopeGuard;
@@ -27,6 +28,7 @@ impl RequestKind {
List => "list_objects",
Copy => "copy_object",
TimeTravel => "time_travel_recover",
Head => "head_object",
}
}
const fn as_index(&self) -> usize {
@@ -34,7 +36,8 @@ impl RequestKind {
}
}
pub(crate) struct RequestTyped<C>([C; 6]);
const REQUEST_KIND_COUNT: usize = 7;
pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
impl<C> RequestTyped<C> {
pub(crate) fn get(&self, kind: RequestKind) -> &C {
@@ -43,8 +46,8 @@ impl<C> RequestTyped<C> {
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
use RequestKind::*;
let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
let arr = std::array::from_fn::<C, 6, _>(|index| {
let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
let next = it.next().unwrap();
assert_eq!(index, next.as_index());
f(next)

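To make the invariant behind `REQUEST_KIND_COUNT` explicit: every variant must be listed in the build array and its index must match its position, otherwise `build_with` panics. A purely illustrative module-local test (not in the diff) that would catch a variant added without extending the array or the constant:

#[test]
fn request_kind_indices_are_dense() {
    use RequestKind::*;
    let all = [Get, Put, Delete, List, Copy, TimeTravel, Head];
    // Every variant must be listed, and its as_index() must match its position.
    assert_eq!(all.len(), REQUEST_KIND_COUNT);
    for (i, kind) in all.into_iter().enumerate() {
        assert_eq!(i, kind.as_index());
    }
}
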
View File

@@ -23,7 +23,7 @@ use aws_config::{
use aws_sdk_s3::{
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
error::SdkError,
operation::get_object::GetObjectError,
operation::{get_object::GetObjectError, head_object::HeadObjectError},
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
Client,
};
@@ -604,6 +604,78 @@ impl RemoteStorage for S3Bucket {
}
}
async fn head_object(
&self,
key: &RemotePath,
cancel: &CancellationToken,
) -> Result<ListingObject, DownloadError> {
let kind = RequestKind::Head;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let head_future = self
.client
.head_object()
.bucket(self.bucket_name())
.key(self.relative_path_to_s3_object(key))
.send();
let head_future = tokio::time::timeout(self.timeout, head_future);
let res = tokio::select! {
res = head_future => res,
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
let res = res.map_err(|_e| DownloadError::Timeout)?;
// do not include timeouts as errors in metrics, but do include cancellations
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
let data = match res {
Ok(object_output) => object_output,
Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
// an error: we expect to sometimes fetch an object and find it missing,
// e.g. when probing for timeline indices.
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Ok,
started_at,
);
return Err(DownloadError::NotFound);
}
Err(e) => {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
);
return Err(DownloadError::Other(
anyhow::Error::new(e).context("s3 head object"),
));
}
};
let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else {
return Err(DownloadError::Other(anyhow!(
"head_object doesn't contain last_modified or content_length"
)))?;
};
Ok(ListingObject {
key: key.to_owned(),
last_modified: SystemTime::try_from(last_modified).map_err(|e| {
DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}"))
})?,
size: size as u64,
})
}
async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,

View File

@@ -30,6 +30,7 @@ pub struct UnreliableWrapper {
#[derive(Debug, Hash, Eq, PartialEq)]
enum RemoteOp {
ListPrefixes(Option<RemotePath>),
HeadObject(RemotePath),
Upload(RemotePath),
Download(RemotePath),
Delete(RemotePath),
@@ -137,6 +138,16 @@ impl RemoteStorage for UnreliableWrapper {
self.inner.list(prefix, mode, max_keys, cancel).await
}
async fn head_object(
&self,
key: &RemotePath,
cancel: &CancellationToken,
) -> Result<crate::ListingObject, DownloadError> {
self.attempt(RemoteOp::HeadObject(key.clone()))
.map_err(DownloadError::Other)?;
self.inner.head_object(key, cancel).await
}
async fn upload(
&self,
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,

View File

@@ -9,5 +9,3 @@ serde.workspace = true
serde_with.workspace = true
const_format.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -9,5 +9,3 @@ license.workspace = true
anyhow.workspace = true
serde.workspace = true
serde_json.workspace = true
workspace_hack.workspace = true

View File

@@ -14,5 +14,3 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
workspace_hack.workspace = true

View File

@@ -39,7 +39,7 @@ thiserror.workspace = true
tokio.workspace = true
tokio-tar.workspace = true
tokio-util.workspace = true
toml_edit.workspace = true
toml_edit = { workspace = true, features = ["serde"] }
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
@@ -54,7 +54,6 @@ walkdir.workspace = true
pq_proto.workspace = true
postgres_connection.workspace = true
metrics.workspace = true
workspace_hack.workspace = true
const_format.workspace = true
@@ -71,6 +70,7 @@ criterion.workspace = true
hex-literal.workspace = true
camino-tempfile.workspace = true
serde_assert.workspace = true
tokio = { workspace = true, features = ["test-util"] }
[[bench]]
name = "benchmarks"

View File

@@ -9,8 +9,6 @@ anyhow.workspace = true
utils.workspace = true
postgres_ffi.workspace = true
workspace_hack.workspace = true
[build-dependencies]
anyhow.workspace = true
bindgen.workspace = true

View File

@@ -95,6 +95,7 @@ fn main() -> anyhow::Result<()> {
.allowlist_var("ERROR")
.allowlist_var("FATAL")
.allowlist_var("PANIC")
.allowlist_var("PG_VERSION_NUM")
.allowlist_var("WPEVENT")
.allowlist_var("WL_LATCH_SET")
.allowlist_var("WL_SOCKET_READABLE")

View File

@@ -282,7 +282,11 @@ mod tests {
use std::cell::UnsafeCell;
use utils::id::TenantTimelineId;
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
use crate::{
api_bindings::Level,
bindings::{NeonWALReadResult, PG_VERSION_NUM},
walproposer::Wrapper,
};
use super::ApiImpl;
@@ -489,41 +493,79 @@ mod tests {
let (sender, receiver) = sync_channel(1);
// Message definitions are in walproposer.h
// xxx: it would be better to extract them from safekeeper crate and
// use serialization/deserialization here.
let greeting_tag = (b'g' as u64).to_ne_bytes();
let proto_version = 2_u32.to_ne_bytes();
let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
let proposer_id = [0; 16];
let system_id = 0_u64.to_ne_bytes();
let tenant_id = ttid.tenant_id.as_arr();
let timeline_id = ttid.timeline_id.as_arr();
let pg_tli = 1_u32.to_ne_bytes();
let wal_seg_size = 16777216_u32.to_ne_bytes();
let proposer_greeting = [
greeting_tag.as_slice(),
proto_version.as_slice(),
pg_version.as_slice(),
proposer_id.as_slice(),
system_id.as_slice(),
tenant_id.as_slice(),
timeline_id.as_slice(),
pg_tli.as_slice(),
wal_seg_size.as_slice(),
]
.concat();
let voting_tag = (b'v' as u64).to_ne_bytes();
let vote_request_term = 3_u64.to_ne_bytes();
let proposer_id = [0; 16];
let vote_request = [
voting_tag.as_slice(),
vote_request_term.as_slice(),
proposer_id.as_slice(),
]
.concat();
let acceptor_greeting_term = 2_u64.to_ne_bytes();
let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
let acceptor_greeting = [
greeting_tag.as_slice(),
acceptor_greeting_term.as_slice(),
acceptor_greeting_node_id.as_slice(),
]
.concat();
let vote_response_term = 3_u64.to_ne_bytes();
let vote_given = 1_u64.to_ne_bytes();
let flush_lsn = 0x539_u64.to_ne_bytes();
let truncate_lsn = 0x539_u64.to_ne_bytes();
let th_len = 1_u32.to_ne_bytes();
let th_term = 2_u64.to_ne_bytes();
let th_lsn = 0x539_u64.to_ne_bytes();
let timeline_start_lsn = 0x539_u64.to_ne_bytes();
let vote_response = [
voting_tag.as_slice(),
vote_response_term.as_slice(),
vote_given.as_slice(),
flush_lsn.as_slice(),
truncate_lsn.as_slice(),
th_len.as_slice(),
th_term.as_slice(),
th_lsn.as_slice(),
timeline_start_lsn.as_slice(),
]
.concat();
let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
wait_events: Cell::new(WaitEventsData {
sk: std::ptr::null_mut(),
event_mask: 0,
}),
expected_messages: vec![
// TODO: When updating Postgres versions, this test will cause
// problems. Postgres version in message needs updating.
//
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
],
// VoteRequest(VoteRequest { term: 3 })
vec![
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
],
],
expected_messages: vec![proposer_greeting, vote_request],
expected_ptr: AtomicUsize::new(0),
safekeeper_replies: vec![
// Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
],
// VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
vec![
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
],
],
safekeeper_replies: vec![acceptor_greeting, vote_response],
replies_ptr: AtomicUsize::new(0),
sync_channel: sender,
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),

View File

@@ -10,6 +10,7 @@ use pageserver::{
page_cache,
repository::Value,
task_mgr::TaskKind,
tenant::storage_layer::inmemory_layer::SerializedBatch,
tenant::storage_layer::InMemoryLayer,
virtual_file,
};
@@ -67,12 +68,16 @@ async fn ingest(
let layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
let data = Value::Image(Bytes::from(vec![0u8; put_size]));
let data_ser_size = data.serialized_size().unwrap() as usize;
let ctx = RequestContext::new(
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
pageserver::context::DownloadBehavior::Download,
);
const BATCH_SIZE: usize = 16;
let mut batch = Vec::new();
for i in 0..put_count {
lsn += put_size as u64;
@@ -95,7 +100,17 @@ async fn ingest(
}
}
layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
if batch.len() >= BATCH_SIZE {
let this_batch = std::mem::take(&mut batch);
let serialized = SerializedBatch::from_values(this_batch);
layer.put_batch(serialized, &ctx).await?;
}
}
if !batch.is_empty() {
let this_batch = std::mem::take(&mut batch);
let serialized = SerializedBatch::from_values(this_batch);
layer.put_batch(serialized, &ctx).await?;
}
layer.freeze(lsn + 1).await;

View File

@@ -126,10 +126,56 @@ fn main() -> anyhow::Result<()> {
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
// The tenants directory contains all the pageserver local disk state.
// Create if not exists and make sure all the contents are durable before proceeding.
// Ensuring durability eliminates a whole class of bugs that only manifest when we come up after an unclean shutdown.
// After an unclean shutdown, we don't know whether all the filesystem content we can read via syscalls is actually durable.
// Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error.
let tenants_path = conf.tenants_path();
if !tenants_path.exists() {
utils::crashsafe::create_dir_all(conf.tenants_path())
.with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
{
let open = || {
nix::dir::Dir::open(
tenants_path.as_std_path(),
nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY,
nix::sys::stat::Mode::empty(),
)
};
let dirfd = match open() {
Ok(dirfd) => dirfd,
Err(e) => match e {
nix::errno::Errno::ENOENT => {
utils::crashsafe::create_dir_all(&tenants_path).with_context(|| {
format!("Failed to create tenants root dir at '{tenants_path}'")
})?;
open().context("open tenants dir after creating it")?
}
e => anyhow::bail!(e),
},
};
let started = Instant::now();
// Linux guarantees durability for syncfs.
// POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
#[cfg(target_os = "linux")]
{
use std::os::fd::AsRawFd;
nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
}
#[cfg(target_os = "macos")]
{
// macOS is not a production platform for Neon, don't even bother.
drop(dirfd);
}
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
{
compile_error!("Unsupported OS");
}
let elapsed = started.elapsed();
info!(
elapsed_ms = elapsed.as_millis(),
"made tenant directory contents durable"
);
}
// Initialize failpoints support

View File

@@ -318,6 +318,24 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
}
}
impl From<crate::tenant::TimelineArchivalError> for ApiError {
fn from(value: crate::tenant::TimelineArchivalError) -> Self {
use crate::tenant::TimelineArchivalError::*;
match value {
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
format!(
"Cannot archive timeline which has non-archived child timelines: {children:?}"
)
.into_boxed_str(),
),
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
Other(e) => ApiError::InternalServerError(e),
}
}
}
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
use crate::tenant::mgr::DeleteTimelineError::*;
@@ -405,6 +423,8 @@ async fn build_timeline_info_common(
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
let current_physical_size = Some(timeline.layer_size_sum().await);
let state = timeline.current_state();
// Report is_archived = false if the timeline is still loading
let is_archived = timeline.is_archived().unwrap_or(false);
let remote_consistent_lsn_projected = timeline
.get_remote_consistent_lsn_projected()
.unwrap_or(Lsn(0));
@@ -445,6 +465,7 @@ async fn build_timeline_info_common(
pg_version: timeline.pg_version,
state,
is_archived,
walreceiver_status,
@@ -686,9 +707,7 @@ async fn timeline_archival_config_handler(
tenant
.apply_timeline_archival_config(timeline_id, request_data.state)
.await
.context("applying archival config")
.map_err(ApiError::InternalServerError)?;
.await?;
Ok::<_, ApiError>(())
}
.instrument(info_span!("timeline_archival_config",
@@ -1706,11 +1725,6 @@ async fn timeline_compact_handler(
flags |= CompactFlags::ForceImageLayerCreation;
}
if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
if !cfg!(feature = "testing") {
return Err(ApiError::InternalServerError(anyhow!(
"enhanced_gc_bottom_most_compaction is only available in testing mode"
)));
}
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
}
let wait_until_uploaded =
@@ -2942,7 +2956,7 @@ pub fn make_router(
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
|r| api_handler(r, timeline_compact_handler),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",

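With the `From<TimelineArchivalError> for ApiError` conversion above, handlers can propagate archival errors with `?` and still return the right HTTP status per variant. A hedged sketch (not part of this diff; the helper name is hypothetical):

async fn archive_timeline(tenant: &Tenant, timeline_id: TimelineId) -> Result<(), ApiError> {
    tenant
        .apply_timeline_archival_config(timeline_id, TimelineArchivalState::Archived)
        // NotFound -> 404, HasUnarchivedChildren -> 412 Precondition Failed,
        // AlreadyInProgress -> 409 Conflict, via the From impl above.
        .await?;
    Ok(())
}
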
View File

@@ -1,15 +1,10 @@
use std::{num::NonZeroUsize, sync::Arc};
use crate::tenant::ephemeral_file;
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
PageCached,
#[serde(rename_all = "snake_case")]
Direct {
max_concurrency: NonZeroUsize,
},
Direct { max_concurrency: NonZeroUsize },
}
impl Default for L0FlushConfig {
@@ -25,14 +20,12 @@ impl Default for L0FlushConfig {
pub struct L0FlushGlobalState(Arc<Inner>);
pub enum Inner {
PageCached,
Direct { semaphore: tokio::sync::Semaphore },
}
impl L0FlushGlobalState {
pub fn new(config: L0FlushConfig) -> Self {
match config {
L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
L0FlushConfig::Direct { max_concurrency } => {
let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
Self(Arc::new(Inner::Direct { semaphore }))
@@ -44,13 +37,3 @@ impl L0FlushGlobalState {
&self.0
}
}
impl L0FlushConfig {
pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
use L0FlushConfig::*;
match self {
PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
}
}
}

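The reformatted `Direct` variant keeps its wire format: the `mode` tag stays kebab-case and the field name stays snake_case. A hedged round-trip sketch (not in the diff; assumes `serde_json` is available as a dev-dependency):

#[test]
fn l0_flush_config_wire_format() {
    let cfg: L0FlushConfig =
        serde_json::from_str(r#"{ "mode": "direct", "max_concurrency": 4 }"#).unwrap();
    assert_eq!(
        cfg,
        L0FlushConfig::Direct {
            max_concurrency: NonZeroUsize::new(4).unwrap()
        }
    );
}
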
View File

@@ -49,7 +49,7 @@ use tracing::{info, info_span};
/// backwards-compatible changes to the metadata format.
pub const STORAGE_FORMAT_VERSION: u16 = 3;
pub const DEFAULT_PG_VERSION: u32 = 15;
pub const DEFAULT_PG_VERSION: u32 = 16;
// Magic constants used to identify different kinds of files
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
@@ -88,6 +88,8 @@ pub async fn shutdown_pageserver(
) {
use std::time::Duration;
let started_at = std::time::Instant::now();
// If the orderly shutdown below takes too long, we still want to make
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
//
@@ -241,7 +243,10 @@ pub async fn shutdown_pageserver(
walredo_extraordinary_shutdown_thread.join().unwrap();
info!("walredo_extraordinary_shutdown_thread done");
info!("Shut down successfully completed");
info!(
elapsed_ms = started_at.elapsed().as_millis(),
"Shut down successfully completed"
);
std::process::exit(exit_code);
}

View File

@@ -1803,6 +1803,23 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
.expect("failed to define a metric")
});
pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_utilization_score",
"The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
)
.expect("failed to define a metric")
});
pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_secondary_heatmap_total_size",
"The total size in bytes of all layers in the most recently downloaded heatmap.",
&["tenant_id", "shard_id"]
)
.expect("failed to define a metric")
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,
@@ -1853,16 +1870,64 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("Failed to register tenant_task_events metric")
});
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
"pageserver_background_loop_semaphore_wait_finish_count",
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
&["task"],
)
.unwrap()
});
pub struct BackgroundLoopSemaphoreMetrics {
counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
durations: EnumMap<BackgroundLoopKind, Counter>,
}
pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
|| {
let counters = register_int_counter_pair_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
"pageserver_background_loop_semaphore_wait_finish_count",
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
&["task"],
)
.unwrap();
let durations = register_counter_vec!(
"pageserver_background_loop_semaphore_wait_duration_seconds",
"Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
&["task"],
)
.unwrap();
BackgroundLoopSemaphoreMetrics {
counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
counters.with_label_values(&[kind.into()])
})),
durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
durations.with_label_values(&[kind.into()])
})),
}
},
);
impl BackgroundLoopSemaphoreMetrics {
pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
struct Record<'a> {
metrics: &'a BackgroundLoopSemaphoreMetrics,
task: BackgroundLoopKind,
_counter_guard: metrics::IntCounterPairGuard,
start: Instant,
}
impl Drop for Record<'_> {
fn drop(&mut self) {
let elapsed = self.start.elapsed().as_secs_f64();
self.metrics.durations[self.task].inc_by(elapsed);
}
}
Record {
metrics: self,
task,
_counter_guard: self.counters[task].guard(),
start: Instant::now(),
}
}
}
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
@@ -2544,6 +2609,7 @@ use std::time::{Duration, Instant};
use crate::context::{PageContentKind, RequestContext};
use crate::task_mgr::TaskKind;
use crate::tenant::mgr::TenantSlot;
use crate::tenant::tasks::BackgroundLoopKind;
/// Maintain a per timeline gauge in addition to the global gauge.
pub(crate) struct PerTimelineRemotePhysicalSizeGauge {

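A hedged sketch (not part of the diff) of how a background loop is expected to use the new `measure_acquisition` guard: the wait time is recorded when the guard drops, i.e. once the permit has been acquired. `CONCURRENT_BACKGROUND_TASKS` is an assumed static tokio semaphore, not something this PR defines:

async fn acquire_permit(kind: BackgroundLoopKind) -> tokio::sync::SemaphorePermit<'static> {
    // Starts the "wait_start" counter now; on drop it bumps "wait_finish"
    // and adds the elapsed wall-clock time to the duration counter.
    let _timer = BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(kind);
    // `CONCURRENT_BACKGROUND_TASKS` is a hypothetical `static Lazy<tokio::sync::Semaphore>`.
    CONCURRENT_BACKGROUND_TASKS
        .acquire()
        .await
        .expect("semaphore is never closed")
    // `_timer` drops here, recording how long we waited for the permit.
}
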
View File

@@ -15,12 +15,11 @@ use crate::{aux_file, repository::*};
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use pageserver_api::key::{
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::models::AuxFilePolicy;
@@ -37,7 +36,6 @@ use tokio_util::sync::CancellationToken;
use tracing::{debug, info, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::pausable_failpoint;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
@@ -174,6 +172,7 @@ impl Timeline {
pending_deletions: Vec::new(),
pending_nblocks: 0,
pending_directory_entries: Vec::new(),
pending_bytes: 0,
lsn,
}
}
@@ -727,7 +726,17 @@ impl Timeline {
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
let current_policy = self.last_aux_file_policy.load();
match current_policy {
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
Some(AuxFilePolicy::V1) => {
warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
self.list_aux_files_v1(lsn, ctx).await
}
None => {
let res = self.list_aux_files_v1(lsn, ctx).await?;
if !res.is_empty() {
warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
}
Ok(res)
}
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
Some(AuxFilePolicy::CrossValidation) => {
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
@@ -1022,21 +1031,33 @@ pub struct DatadirModification<'a> {
// The put-functions add the modifications here, and they are flushed to the
// underlying key-value store by the 'finish' function.
pending_lsns: Vec<Lsn>,
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
pending_deletions: Vec<(Range<Key>, Lsn)>,
pending_nblocks: i64,
/// For special "directory" keys that store key-value maps, track the size of the map
/// if it was updated in this modification.
pending_directory_entries: Vec<(DirectoryKind, usize)>,
/// An **approximation** of how large our EphemeralFile write will be when committed.
pending_bytes: usize,
}
impl<'a> DatadirModification<'a> {
// When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
// contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
// additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
/// Get the current lsn
pub(crate) fn get_lsn(&self) -> Lsn {
self.lsn
}
pub(crate) fn approx_pending_bytes(&self) -> usize {
self.pending_bytes
}
/// Set the current lsn
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
ensure!(
@@ -1576,6 +1597,7 @@ impl<'a> DatadirModification<'a> {
if aux_files_key_v1.is_empty() {
None
} else {
warn!("this timeline is using deprecated aux file policy V1");
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
Some(AuxFilePolicy::V1)
}
@@ -1769,21 +1791,25 @@ impl<'a> DatadirModification<'a> {
// Flush relation and SLRU data blocks, keep metadata.
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
for (key, values) in self.pending_updates.drain() {
for (lsn, value) in values {
let mut write_batch = Vec::new();
for (lsn, value_ser_size, value) in values {
if key.is_rel_block_key() || key.is_slru_block_key() {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, lsn, &value, ctx).await?;
write_batch.push((key.to_compact(), lsn, value_ser_size, value));
} else {
retained_pending_updates
.entry(key)
.or_default()
.push((lsn, value));
retained_pending_updates.entry(key).or_default().push((
lsn,
value_ser_size,
value,
));
}
}
writer.put_batch(write_batch, ctx).await?;
}
self.pending_updates = retained_pending_updates;
self.pending_bytes = 0;
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1809,17 +1835,20 @@ impl<'a> DatadirModification<'a> {
self.pending_nblocks = 0;
if !self.pending_updates.is_empty() {
// The put_batch call below expects the inputs to be sorted by Lsn,
// so we do that first.
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
self.pending_updates
.drain()
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
VecMapOrdering::GreaterOrEqual,
);
// Ordering: the items in this batch do not need to be in any global order, but values for
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
// this to do efficient updates to its index.
let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
.pending_updates
.drain()
.flat_map(|(key, values)| {
values.into_iter().map(move |(lsn, val_ser_size, value)| {
(key.to_compact(), lsn, val_ser_size, value)
})
})
.collect::<Vec<_>>();
writer.put_batch(lsn_ordered_batch, ctx).await?;
writer.put_batch(batch, ctx).await?;
}
if !self.pending_deletions.is_empty() {
@@ -1844,6 +1873,8 @@ impl<'a> DatadirModification<'a> {
writer.update_directory_entries_count(kind, count as u64);
}
self.pending_bytes = 0;
Ok(())
}
@@ -1860,7 +1891,7 @@ impl<'a> DatadirModification<'a> {
// Note: we don't check pending_deletions. It is an error to request a
// value that has been removed, deletion only avoids leaking storage.
if let Some(values) = self.pending_updates.get(&key) {
if let Some((_, value)) = values.last() {
if let Some((_, _, value)) = values.last() {
return if let Value::Image(img) = value {
Ok(img.clone())
} else {
@@ -1888,13 +1919,17 @@ impl<'a> DatadirModification<'a> {
fn put(&mut self, key: Key, val: Value) {
let values = self.pending_updates.entry(key).or_default();
// Replace the previous value if it exists at the same lsn
if let Some((last_lsn, last_value)) = values.last_mut() {
if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
if *last_lsn == self.lsn {
*last_value_ser_size = val.serialized_size().unwrap() as usize;
*last_value = val;
return;
}
}
values.push((self.lsn, val));
let val_serialized_size = val.serialized_size().unwrap() as usize;
self.pending_bytes += val_serialized_size;
values.push((self.lsn, val_serialized_size, val));
}
fn delete(&mut self, key_range: Range<Key>) {
@@ -2024,7 +2059,7 @@ mod tests {
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = tline.raw_timeline().unwrap();

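A hedged sketch of the intended consumer of `approx_pending_bytes` (not part of this diff): a WAL ingest loop that commits the modification once the pending payload crosses `MAX_PENDING_BYTES`, keeping the monolithic `put_batch` serialization bounded. `modification` and `ctx` are assumed to exist at the call site:

// After applying each WAL record to `modification`:
if modification.approx_pending_bytes() > DatadirModification::MAX_PENDING_BYTES {
    // Serializes the pending values into one batch, hands it to the
    // in-memory layer via put_batch, and resets pending_bytes to 0.
    modification.commit(&ctx).await?;
}
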
View File

@@ -393,7 +393,7 @@ struct PageServerTask {
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
tenant_shard_id: Option<TenantShardId>,
tenant_shard_id: TenantShardId,
timeline_id: Option<TimelineId>,
mutable: Mutex<MutableTaskState>,
@@ -405,7 +405,7 @@ struct PageServerTask {
pub fn spawn<F>(
runtime: &tokio::runtime::Handle,
kind: TaskKind,
tenant_shard_id: Option<TenantShardId>,
tenant_shard_id: TenantShardId,
timeline_id: Option<TimelineId>,
name: &str,
future: F,
@@ -550,7 +550,7 @@ pub async fn shutdown_tasks(
let tasks = TASKS.lock().unwrap();
for task in tasks.values() {
if (kind.is_none() || Some(task.kind) == kind)
&& (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
&& (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id)
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
{
task.cancel.cancel();
@@ -573,13 +573,8 @@ pub async fn shutdown_tasks(
};
if let Some(mut join_handle) = join_handle {
if log_all {
if tenant_shard_id.is_none() {
// there are quite few of these
info!(name = task.name, kind = ?task_kind, "stopping global task");
} else {
// warn to catch these in tests; there shouldn't be any
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
}
// warn to catch these in tests; there shouldn't be any
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
}
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
.await

View File

@@ -501,6 +501,38 @@ impl Debug for DeleteTimelineError {
}
}
#[derive(thiserror::Error)]
pub enum TimelineArchivalError {
#[error("NotFound")]
NotFound,
#[error("Timeout")]
Timeout,
#[error("HasUnarchivedChildren")]
HasUnarchivedChildren(Vec<TimelineId>),
#[error("Timeline archival is already in progress")]
AlreadyInProgress,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl Debug for TimelineArchivalError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::NotFound => write!(f, "NotFound"),
Self::Timeout => write!(f, "Timeout"),
Self::HasUnarchivedChildren(c) => {
f.debug_tuple("HasUnarchivedChildren").field(c).finish()
}
Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(),
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
}
}
}
pub enum SetStoppingError {
AlreadyStopping(completion::Barrier),
Broken,
@@ -798,7 +830,7 @@ impl Tenant {
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::Attach,
Some(tenant_shard_id),
tenant_shard_id,
None,
"attach tenant",
async move {
@@ -1326,24 +1358,50 @@ impl Tenant {
&self,
timeline_id: TimelineId,
state: TimelineArchivalState,
) -> anyhow::Result<()> {
let timeline = self
.get_timeline(timeline_id, false)
.context("Cannot apply timeline archival config to inexistent timeline")?;
) -> Result<(), TimelineArchivalError> {
info!("setting timeline archival config");
let timeline = {
let timelines = self.timelines.lock().unwrap();
let timeline = match timelines.get(&timeline_id) {
Some(t) => t,
None => return Err(TimelineArchivalError::NotFound),
};
// Ensure that there are no non-archived child timelines
let children: Vec<TimelineId> = timelines
.iter()
.filter_map(|(id, entry)| {
if entry.get_ancestor_timeline_id() != Some(timeline_id) {
return None;
}
if entry.is_archived() == Some(true) {
return None;
}
Some(*id)
})
.collect();
if !children.is_empty() && state == TimelineArchivalState::Archived {
return Err(TimelineArchivalError::HasUnarchivedChildren(children));
}
Arc::clone(timeline)
};
let upload_needed = timeline
.remote_client
.schedule_index_upload_for_timeline_archival_state(state)?;
if upload_needed {
info!("Uploading new state");
const MAX_WAIT: Duration = Duration::from_secs(10);
let Ok(v) =
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
else {
tracing::warn!("reached timeout for waiting on upload queue");
bail!("reached timeout for upload queue flush");
return Err(TimelineArchivalError::Timeout);
};
v?;
v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
}
Ok(())
}
@@ -3741,13 +3799,21 @@ impl Tenant {
/// less than this (via eviction and on-demand downloads), but this function enables
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
/// by keeping important things on local disk.
///
/// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
/// than they report here, due to layer eviction. Tenants with many active branches may
/// actually use more than they report here.
pub(crate) fn local_storage_wanted(&self) -> u64 {
let mut wanted = 0;
let timelines = self.timelines.lock().unwrap();
for timeline in timelines.values() {
wanted += timeline.metrics.visible_physical_size_gauge.get();
}
wanted
// Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This
// reflects the observation that on tenants with multiple large branches, typically only one
// of them is used actively enough to occupy space on disk.
timelines
.values()
.map(|t| t.metrics.visible_physical_size_gauge.get())
.max()
.unwrap_or(0)
}
}
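A worked illustration of the heuristic (not in the diff): with hypothetical visible branch sizes of 40 GiB, 3 GiB, and 1 GiB, the tenant now advertises 40 GiB of wanted local storage, where the previous sum would have reported 44 GiB.

let sizes: [u64; 3] = [40 << 30, 3 << 30, 1 << 30];
assert_eq!(sizes.iter().copied().max().unwrap(), 40 << 30); // new: max()
assert_eq!(sizes.iter().sum::<u64>(), 44 << 30);            // old: sum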
@@ -5932,10 +5998,10 @@ mod tests {
.await
.unwrap();
// the default aux file policy to switch is v1 if not set by the admins
// the default aux file policy to switch is v2 if not set by the admins
assert_eq!(
harness.tenant_conf.switch_aux_file_policy,
AuxFilePolicy::V1
AuxFilePolicy::default_tenant_config()
);
let (tenant, ctx) = harness.load().await;
@@ -5979,8 +6045,8 @@ mod tests {
);
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V1),
"aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
Some(AuxFilePolicy::V2),
"aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
);
// we can read everything from the storage
@@ -6002,8 +6068,8 @@ mod tests {
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V1),
"keep v1 storage format when new files are written"
Some(AuxFilePolicy::V2),
"keep v2 storage format when new files are written"
);
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
@@ -6019,7 +6085,7 @@ mod tests {
// child copies the last flag even if that is not on remote storage yet
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(files.get("pg_logical/mappings/test1"), None);
@@ -7005,18 +7071,14 @@ mod tests {
vec![
// Image layer at GC horizon
PersistentLayerKey {
key_range: {
let mut key = Key::MAX;
key.field6 -= 1;
Key::MIN..key
},
key_range: Key::MIN..Key::NON_L0_MAX,
lsn_range: Lsn(0x30)..Lsn(0x31),
is_delta: false
},
// The delta layer that is cut in the middle
// The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
PersistentLayerKey {
key_range: get_key(3)..get_key(4),
lsn_range: Lsn(0x30)..Lsn(0x41),
key_range: Key::MIN..Key::NON_L0_MAX,
lsn_range: Lsn(0x30)..Lsn(0x48),
is_delta: true
},
// The delta3 layer that should not be picked for the compaction
@@ -7996,6 +8058,214 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
{
let harness =
TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key")
.await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(1),
Lsn(0x28),
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
),
];
let delta2 = vec![
(
get_key(1),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(1),
Lsn(0x38),
Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
),
];
let delta3 = vec![
(
get_key(8),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
(
get_key(9),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
// delta1 and delta2 each contain only a single key but multiple updates
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3),
], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?;
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![
(Lsn(0x10), tline.timeline_id),
(Lsn(0x20), tline.timeline_id),
],
cutoffs: GcCutoffs {
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
let expected_result = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10@0x48"),
Bytes::from_static(b"value 9@0x10@0x48"),
];
let expected_result_at_gc_horizon = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let expected_result_at_lsn_20 = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let expected_result_at_lsn_10 = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x50), &ctx)
.await
.unwrap(),
&expected_result[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), gc_horizon, &ctx)
.await
.unwrap(),
&expected_result_at_gc_horizon[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x20), &ctx)
.await
.unwrap(),
&expected_result_at_lsn_20[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x10), &ctx)
.await
.unwrap(),
&expected_result_at_lsn_10[idx]
);
}
};
verify_result().await;
let cancel = CancellationToken::new();
let mut dryrun_flags = EnumSet::new();
dryrun_flags.insert(CompactFlags::DryRun);
tline
.compact_with_gc(&cancel, dryrun_flags, &ctx)
.await
.unwrap();
// We expect the layer map to be the same b/c of the dry run flag, but we don't know whether there will be other background jobs
// cleaning things up, and therefore we don't do sanity checks on the layer map during unit tests.
verify_result().await;
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
// compact again
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
Ok(())
}
#[tokio::test]
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;

View File

@@ -21,7 +21,6 @@ pub struct EphemeralFile {
}
mod page_caching;
pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
mod zero_padded_read_write;
impl EphemeralFile {
@@ -52,12 +51,10 @@ impl EphemeralFile {
)
.await?;
let prewarm = conf.l0_flush.prewarm_on_write();
Ok(EphemeralFile {
_tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id,
rw: page_caching::RW::new(file, prewarm, gate_guard),
rw: page_caching::RW::new(file, gate_guard),
})
}
@@ -82,6 +79,8 @@ impl EphemeralFile {
self.rw.read_blk(blknum, ctx).await
}
#[cfg(test)]
// This is a test helper: outside of tests, we are always written to via a pre-serialized batch.
pub(crate) async fn write_blob(
&mut self,
srcbuf: &[u8],
@@ -89,17 +88,30 @@ impl EphemeralFile {
) -> Result<u64, io::Error> {
let pos = self.rw.bytes_written();
// Write the length field
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
let mut len_bytes = std::io::Cursor::new(Vec::new());
crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length(
srcbuf.len(),
&mut len_bytes,
);
let len_bytes = len_bytes.into_inner();
self.rw.write_all_borrowed(&len_buf, ctx).await?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
self.rw.write_all_borrowed(&len_buf, ctx).await?;
}
// Write the length field
self.rw.write_all_borrowed(&len_bytes, ctx).await?;
// Write the payload
self.rw.write_all_borrowed(srcbuf, ctx).await?;
Ok(pos)
}
/// Returns the offset at which the first byte of the input was written, for use
/// in constructing indices over the written value.
pub(crate) async fn write_raw(
&mut self,
srcbuf: &[u8],
ctx: &RequestContext,
) -> Result<u64, io::Error> {
let pos = self.rw.bytes_written();
// Write the payload
self.rw.write_all_borrowed(srcbuf, ctx).await?;

View File

@@ -1,15 +1,15 @@
//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
//!
//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>
use crate::context::RequestContext;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::BlockLease;
use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
use crate::virtual_file::VirtualFile;
use once_cell::sync::Lazy;
use std::io::{self, ErrorKind};
use std::ops::{Deref, Range};
use std::io::{self};
use tokio_epoll_uring::BoundedBuf;
use tracing::*;
@@ -18,33 +18,17 @@ use super::zero_padded_read_write;
/// See module-level comment.
pub struct RW {
page_cache_file_id: page_cache::FileId,
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
/// Gate guard is held on as long as we need to do operations in the path (delete on drop).
_gate_guard: utils::sync::gate::GateGuard,
}
/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
/// should we pre-warm the [`crate::page_cache`] with the contents?
#[derive(Clone, Copy)]
pub enum PrewarmOnWrite {
Yes,
No,
}
impl RW {
pub fn new(
file: VirtualFile,
prewarm_on_write: PrewarmOnWrite,
_gate_guard: utils::sync::gate::GateGuard,
) -> Self {
pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
let page_cache_file_id = page_cache::next_file_id();
Self {
page_cache_file_id,
rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
page_cache_file_id,
file,
prewarm_on_write,
)),
rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
_gate_guard,
}
}
@@ -84,10 +68,10 @@ impl RW {
let vec = Vec::with_capacity(size);
// read from disk what we've already flushed
let writer = self.rw.as_writer();
let flushed_range = writer.written_range();
let mut vec = writer
.file
let file_size_tracking_writer = self.rw.as_writer();
let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
let mut vec = file_size_tracking_writer
.as_inner()
.read_exact_at(
vec.slice(0..(flushed_range.end - flushed_range.start)),
u64::try_from(flushed_range.start).unwrap(),
@@ -122,7 +106,7 @@ impl RW {
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum,
self.rw.as_writer().file.path,
self.rw.as_writer().as_inner().path,
e,
),
)
@@ -132,7 +116,7 @@ impl RW {
}
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = writer
.file
.as_inner()
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
.await?;
let read_guard = write_guard.mark_valid();
@@ -154,137 +138,16 @@ impl Drop for RW {
// unlink the file
// we are clear to do this, because we have entered a gate
let res = std::fs::remove_file(&self.rw.as_writer().file.path);
let path = &self.rw.as_writer().as_inner().path;
let res = std::fs::remove_file(path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
// just never log the not found errors, we cannot do anything for them; on detach
// the tenant directory is already gone.
//
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!(
"could not remove ephemeral file '{}': {}",
self.rw.as_writer().file.path,
e
);
error!("could not remove ephemeral file '{path}': {e}");
}
}
}
}
struct PreWarmingWriter {
prewarm_on_write: PrewarmOnWrite,
nwritten_blocks: u32,
page_cache_file_id: page_cache::FileId,
file: VirtualFile,
}
impl PreWarmingWriter {
fn new(
page_cache_file_id: page_cache::FileId,
file: VirtualFile,
prewarm_on_write: PrewarmOnWrite,
) -> Self {
Self {
prewarm_on_write,
nwritten_blocks: 0,
page_cache_file_id,
file,
}
}
/// Return the byte range within `file` that has been written through `write_all`.
///
/// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
struct Wrapper(Range<usize>);
impl Deref for Wrapper {
type Target = Range<usize>;
fn deref(&self) -> &Range<usize> {
&self.0
}
}
Wrapper(0..nwritten_blocks * PAGE_SZ)
}
}
impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
async fn write_all<Buf: tokio_epoll_uring::IoBuf + Send>(
&mut self,
buf: FullSlice<Buf>,
ctx: &RequestContext,
) -> std::io::Result<(usize, FullSlice<Buf>)> {
let buflen = buf.len();
assert_eq!(
buflen % PAGE_SZ,
0,
"{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
);
// Do the IO.
let buf = match self.file.write_all(buf, ctx).await {
(buf, Ok(nwritten)) => {
assert_eq!(nwritten, buflen);
buf
}
(_, Err(e)) => {
return Err(std::io::Error::new(
ErrorKind::Other,
// order error before path because path is long and error is short
format!(
"ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
self.nwritten_blocks, buflen, e, self.file.path,
),
));
}
};
let nblocks = buflen / PAGE_SZ;
let nblocks32 = u32::try_from(nblocks).unwrap();
if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
// Pre-warm page cache with the contents.
// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
// benefits the code that writes InMemoryLayer=>L0 layers.
let cache = page_cache::get();
static CTX: Lazy<RequestContext> = Lazy::new(|| {
RequestContext::new(
crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
crate::context::DownloadBehavior::Error,
)
});
for blknum_in_buffer in 0..nblocks {
let blk_in_buffer =
&buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
let blknum = self
.nwritten_blocks
.checked_add(blknum_in_buffer as u32)
.unwrap();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
.await
{
Err(e) => {
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
}
Ok(v) => match v {
page_cache::ReadBufResult::Found(_guard) => {
// This function takes &mut self, so, it shouldn't be possible to reach this point.
unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
and this function takes &mut self, so, no concurrent read_blk is possible");
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
write_guard.copy_from_slice(blk_in_buffer);
let _ = write_guard.mark_valid();
}
},
}
}
}
self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
Ok((buflen, buf))
}
}

View File

@@ -464,7 +464,7 @@ impl LayerMap {
pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
if Self::is_l0(&layer_desc.key_range) {
if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
self.l0_delta_layers.push(layer_desc.clone().into());
}
@@ -483,7 +483,7 @@ impl LayerMap {
self.historic
.remove(historic_layer_coverage::LayerKey::from(layer_desc));
let layer_key = layer_desc.key();
if Self::is_l0(&layer_desc.key_range) {
if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
let len_before = self.l0_delta_layers.len();
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -600,8 +600,8 @@ impl LayerMap {
}
/// Check if the key range resembles that of an L0 layer.
pub fn is_l0(key_range: &Range<Key>) -> bool {
key_range == &(Key::MIN..Key::MAX)
pub fn is_l0(key_range: &Range<Key>, is_delta_layer: bool) -> bool {
is_delta_layer && key_range == &(Key::MIN..Key::MAX)
}
/// This function determines which layers are counted in `count_deltas`:
@@ -628,7 +628,7 @@ impl LayerMap {
/// than just the current partition_range.
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
// Case 1
if !Self::is_l0(&layer.key_range) {
if !Self::is_l0(&layer.key_range, layer.is_delta) {
return true;
}

View File

@@ -565,7 +565,7 @@ mod tests {
);
let expected_bytes = vec![
/* TimelineMetadataHeader */
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
/* TimelineMetadataBodyV2 */
0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
@@ -574,7 +574,7 @@ mod tests {
0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
0, 0, 0, 15, // pg_version (4 bytes)
0, 0, 0, 16, // pg_version (4 bytes)
/* padding bytes */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

View File

@@ -1728,7 +1728,7 @@ impl RemoteTimelineClient {
task_mgr::spawn(
&self.runtime,
TaskKind::RemoteUploadTask,
Some(self.tenant_shard_id),
self.tenant_shard_id,
Some(self.timeline_id),
"remote upload",
async move {

View File

@@ -8,6 +8,7 @@ use std::{sync::Arc, time::SystemTime};
use crate::{
context::RequestContext,
disk_usage_eviction_task::DiskUsageEvictionInfo,
metrics::SECONDARY_HEATMAP_TOTAL_SIZE,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
};
@@ -105,6 +106,9 @@ pub(crate) struct SecondaryTenant {
// Sum of layer sizes on local disk
pub(super) resident_size_metric: UIntGauge,
// Sum of layer sizes in the most recently downloaded heatmap
pub(super) heatmap_total_size_metric: UIntGauge,
}
impl Drop for SecondaryTenant {
@@ -112,6 +116,7 @@ impl Drop for SecondaryTenant {
let tenant_id = self.tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
}
}
@@ -128,6 +133,10 @@ impl SecondaryTenant {
.get_metric_with_label_values(&[&tenant_id, &shard_id])
.unwrap();
let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id])
.unwrap();
Arc::new(Self {
tenant_shard_id,
// todo: shall we make this a descendant of the
@@ -145,6 +154,7 @@ impl SecondaryTenant {
progress: std::sync::Mutex::default(),
resident_size_metric,
heatmap_total_size_metric,
})
}
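Aside: the new heatmap_total_size_metric follows the same labeled-gauge lifecycle as the existing resident-size metric: materialize a per-(tenant, shard) child on construction, set it when a heatmap is downloaded, and remove the label pair on Drop. A rough sketch using the plain prometheus crate (the real code goes through the pageserver's own metrics wrappers, so treat the names below as illustrative):

use prometheus::{IntGaugeVec, Opts};

fn main() {
    // Vector gauge keyed by tenant and shard, analogous to SECONDARY_HEATMAP_TOTAL_SIZE
    // (which in the real code is a UIntGauge from the pageserver's metrics module).
    let heatmap_total_size = IntGaugeVec::new(
        Opts::new(
            "pageserver_secondary_heatmap_total_size",
            "Sum of layer sizes in the most recently downloaded heatmap",
        ),
        &["tenant_id", "shard_id"],
    )
    .unwrap();

    let tenant_id = "1234";
    let shard_id = "0001";

    // On SecondaryTenant::new: materialize the child for this shard.
    let child = heatmap_total_size
        .get_metric_with_label_values(&[tenant_id, shard_id])
        .unwrap();

    // On heatmap download: publish the heatmap's total byte count.
    child.set(123_456_789);

    // On Drop: remove the label pair so stale shards don't linger in /metrics.
    let _ = heatmap_total_size.remove_label_values(&[tenant_id, shard_id]);
}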

View File

@@ -829,6 +829,12 @@ impl<'a> TenantDownloader<'a> {
layers_downloaded: 0,
bytes_downloaded: 0,
};
// Also expose heatmap bytes_total as a metric
self.secondary_state
.heatmap_total_size_metric
.set(heatmap_stats.bytes);
// Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
let mut delete_layers = Vec::new();
let mut delete_timelines = Vec::new();

View File

@@ -2,13 +2,12 @@
pub mod delta_layer;
pub mod image_layer;
pub(crate) mod inmemory_layer;
pub mod inmemory_layer;
pub(crate) mod layer;
mod layer_desc;
mod layer_name;
pub mod merge_iterator;
#[cfg(test)]
pub mod split_writer;
use crate::context::{AccessStatsBehavior, RequestContext};

View File

@@ -36,6 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -232,6 +233,18 @@ pub struct DeltaLayerInner {
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
}
impl DeltaLayerInner {
pub(crate) fn layer_dbg_info(&self) -> String {
format!(
"delta {}..{} {}..{}",
self.key_range().start,
self.key_range().end,
self.lsn_range().start,
self.lsn_range().end
)
}
}
impl std::fmt::Debug for DeltaLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("DeltaLayerInner")
@@ -556,7 +569,6 @@ impl DeltaLayerWriterInner {
// 5GB limit for objects without multipart upload (which we don't want to use)
// Make it a little bit below to account for differing GB units
// https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html
const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
ensure!(
metadata.len() <= S3_UPLOAD_LIMIT,
"Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
@@ -690,12 +702,10 @@ impl DeltaLayerWriter {
self.inner.take().unwrap().finish(key_end, ctx).await
}
#[cfg(test)]
pub(crate) fn num_keys(&self) -> usize {
self.inner.as_ref().unwrap().num_keys
}
#[cfg(test)]
pub(crate) fn estimated_size(&self) -> u64 {
let inner = self.inner.as_ref().unwrap();
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
@@ -1527,6 +1537,10 @@ pub struct DeltaLayerIterator<'a> {
}
impl<'a> DeltaLayerIterator<'a> {
pub(crate) fn layer_dbg_info(&self) -> String {
self.delta_layer.layer_dbg_info()
}
/// Retrieve a batch of key-value pairs into the iterator buffer.
async fn next_batch(&mut self) -> anyhow::Result<()> {
assert!(self.key_values_batch.is_empty());

View File

@@ -167,6 +167,17 @@ pub struct ImageLayerInner {
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
}
impl ImageLayerInner {
pub(crate) fn layer_dbg_info(&self) -> String {
format!(
"image {}..{} {}",
self.key_range().start,
self.key_range().end,
self.lsn()
)
}
}
impl std::fmt::Debug for ImageLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("ImageLayerInner")
@@ -705,10 +716,6 @@ struct ImageLayerWriterInner {
}
impl ImageLayerWriterInner {
fn size(&self) -> u64 {
self.tree.borrow_writer().size() + self.blob_writer.size()
}
///
/// Start building a new image layer.
///
@@ -843,13 +850,19 @@ impl ImageLayerWriterInner {
res?;
}
let final_key_range = if let Some(end_key) = end_key {
self.key_range.start..end_key
} else {
self.key_range.clone()
};
// Fill in the summary on blk 0
let summary = Summary {
magic: IMAGE_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenant_id: self.tenant_shard_id.tenant_id,
timeline_id: self.timeline_id,
key_range: self.key_range.clone(),
key_range: final_key_range.clone(),
lsn: self.lsn,
index_start_blk,
index_root_blk,
@@ -870,11 +883,7 @@ impl ImageLayerWriterInner {
let desc = PersistentLayerDesc::new_img(
self.tenant_shard_id,
self.timeline_id,
if let Some(end_key) = end_key {
self.key_range.start..end_key
} else {
self.key_range.clone()
},
final_key_range,
self.lsn,
metadata.len(),
);
@@ -963,14 +972,12 @@ impl ImageLayerWriter {
self.inner.as_mut().unwrap().put_image(key, img, ctx).await
}
#[cfg(test)]
/// Estimated size of the image layer.
pub(crate) fn estimated_size(&self) -> u64 {
let inner = self.inner.as_ref().unwrap();
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
}
#[cfg(test)]
pub(crate) fn num_keys(&self) -> usize {
self.inner.as_ref().unwrap().num_keys
}
@@ -986,7 +993,6 @@ impl ImageLayerWriter {
self.inner.take().unwrap().finish(timeline, ctx, None).await
}
#[cfg(test)]
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
pub(super) async fn finish_with_end_key(
mut self,
@@ -1000,10 +1006,6 @@ impl ImageLayerWriter {
.finish(timeline, ctx, Some(end_key))
.await
}
pub(crate) fn size(&self) -> u64 {
self.inner.as_ref().unwrap().size()
}
}
impl Drop for ImageLayerWriter {
@@ -1024,6 +1026,10 @@ pub struct ImageLayerIterator<'a> {
}
impl<'a> ImageLayerIterator<'a> {
pub(crate) fn layer_dbg_info(&self) -> String {
self.image_layer.layer_dbg_info()
}
/// Retrieve a batch of key-value pairs into the iterator buffer.
async fn next_batch(&mut self) -> anyhow::Result<()> {
assert!(self.key_values_batch.is_empty());

View File

@@ -13,7 +13,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::{l0_flush, page_cache, walrecord};
use crate::{l0_flush, page_cache};
use anyhow::{anyhow, Result};
use camino::Utf8PathBuf;
use pageserver_api::key::CompactKey;
@@ -33,7 +33,7 @@ use std::fmt::Write;
use std::ops::Range;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::atomic::{AtomicU64, AtomicUsize};
use tokio::sync::{RwLock, RwLockWriteGuard};
use tokio::sync::RwLock;
use super::{
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
@@ -249,9 +249,7 @@ impl InMemoryLayer {
/// debugging function to print out the contents of the layer
///
/// this is likely completely unused
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
let inner = self.inner.read().await;
pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
let end_str = self.end_lsn_or_max();
println!(
@@ -259,39 +257,6 @@ impl InMemoryLayer {
self.timeline_id, self.start_lsn, end_str,
);
if !verbose {
return Ok(());
}
let cursor = inner.file.block_cursor();
let mut buf = Vec::new();
for (key, vec_map) in inner.index.iter() {
for (lsn, pos) in vec_map.as_slice() {
let mut desc = String::new();
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
let val = Value::des(&buf);
match val {
Ok(Value::Image(img)) => {
write!(&mut desc, " img {} bytes", img.len())?;
}
Ok(Value::WalRecord(rec)) => {
let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)?;
}
Err(err) => {
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
}
}
println!(" key {} at {}: {}", key, lsn, desc);
}
}
Ok(())
}
@@ -355,6 +320,82 @@ impl InMemoryLayer {
}
}
/// Offset of a particular Value within a serialized batch.
struct SerializedBatchOffset {
key: CompactKey,
lsn: Lsn,
/// offset in bytes from the start of the batch's buffer to the Value's serialized size header.
offset: u64,
}
pub struct SerializedBatch {
/// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
pub(crate) raw: Vec<u8>,
/// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
offsets: Vec<SerializedBatchOffset>,
/// The highest LSN of any value in the batch
pub(crate) max_lsn: Lsn,
}
impl SerializedBatch {
/// Write a blob length in the internal format of the EphemeralFile
pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor<Vec<u8>>) {
use std::io::Write;
if len < 0x80 {
// short one-byte length header
let len_buf = [len as u8];
cursor
.write_all(&len_buf)
.expect("Writing to Vec is infallible");
} else {
let mut len_buf = u32::to_be_bytes(len as u32);
len_buf[0] |= 0x80;
cursor
.write_all(&len_buf)
.expect("Writing to Vec is infallible");
}
}
pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self {
// Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
// [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
let buffer_size = batch.iter().map(|i| i.2).sum::<usize>() + 4 * batch.len();
let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
let mut max_lsn: Lsn = Lsn(0);
for (key, lsn, val_ser_size, val) in batch {
let relative_off = cursor.position();
Self::write_blob_length(val_ser_size, &mut cursor);
val.ser_into(&mut cursor)
.expect("Writing into in-memory buffer is infallible");
offsets.push(SerializedBatchOffset {
key,
lsn,
offset: relative_off,
});
max_lsn = std::cmp::max(max_lsn, lsn);
}
let buffer = cursor.into_inner();
// Assert that we didn't do any extra allocations while building the buffer.
debug_assert!(buffer.len() <= buffer_size);
Self {
raw: buffer,
offsets,
max_lsn,
}
}
}
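Aside: the header written by write_blob_length follows the blob framing visible in the code above: lengths below 0x80 take a single byte, larger lengths take a 4-byte big-endian word with the top bit set. A self-contained round-trip sketch of that framing (the decoder here is for illustration only):

use std::io::{Cursor, Write};

/// Encode a blob length the way SerializedBatch::write_blob_length does.
fn write_blob_length(len: usize, cursor: &mut Cursor<Vec<u8>>) {
    if len < 0x80 {
        // Short one-byte length header.
        cursor.write_all(&[len as u8]).unwrap();
    } else {
        // 4-byte big-endian length with the top bit set as a marker.
        let mut len_buf = u32::to_be_bytes(len as u32);
        len_buf[0] |= 0x80;
        cursor.write_all(&len_buf).unwrap();
    }
}

/// Decode the header back; returns (length, header_size).
fn read_blob_length(buf: &[u8]) -> (usize, usize) {
    if buf[0] & 0x80 == 0 {
        (buf[0] as usize, 1)
    } else {
        let mut len_buf = [0u8; 4];
        len_buf.copy_from_slice(&buf[..4]);
        len_buf[0] &= !0x80;
        (u32::from_be_bytes(len_buf) as usize, 4)
    }
}

fn main() {
    for &len in &[1usize, 0x7f, 0x80, 10_000_000] {
        let mut cursor = Cursor::new(Vec::new());
        write_blob_length(len, &mut cursor);
        let buf = cursor.into_inner();
        let (decoded, hdr) = read_blob_length(&buf);
        assert_eq!(decoded, len);
        assert_eq!(hdr, if len < 0x80 { 1 } else { 4 });
    }
}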
fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
}
@@ -415,37 +456,20 @@ impl InMemoryLayer {
})
}
// Write operations
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub async fn put_value(
// Write path.
pub async fn put_batch(
&self,
key: CompactKey,
lsn: Lsn,
buf: &[u8],
serialized_batch: SerializedBatch,
ctx: &RequestContext,
) -> Result<()> {
let mut inner = self.inner.write().await;
self.assert_writable();
self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
}
async fn put_value_locked(
&self,
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
key: CompactKey,
lsn: Lsn,
buf: &[u8],
ctx: &RequestContext,
) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
let off = {
locked_inner
let base_off = {
inner
.file
.write_blob(
buf,
.write_raw(
&serialized_batch.raw,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build(),
@@ -453,15 +477,23 @@ impl InMemoryLayer {
.await?
};
let vec_map = locked_inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Key {} at {} already exists", key, lsn);
for SerializedBatchOffset {
key,
lsn,
offset: relative_off,
} in serialized_batch.offsets
{
let off = base_off + relative_off;
let vec_map = inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Key {} at {} already exists", key, lsn);
}
}
let size = locked_inner.file.len();
locked_inner.resource_units.maybe_publish_size(size);
let size = inner.file.len();
inner.resource_units.maybe_publish_size(size);
Ok(())
}
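Aside: because the whole batch goes down in one write_raw call, the offsets recorded at serialization time are relative to the start of the batch buffer, and the index entries are produced by adding the file offset at which the batch landed. A tiny sketch of that rebasing with simplified types (not the real index structures):

use std::collections::BTreeMap;

/// (key, lsn, offset relative to the start of the batch buffer) -- simplified
/// stand-in for SerializedBatchOffset.
struct BatchOffset {
    key: u64,
    lsn: u64,
    offset: u64,
}

/// Rebase relative offsets onto the file offset where the batch was written,
/// mirroring the loop in put_batch above.
fn index_batch(base_off: u64, offsets: Vec<BatchOffset>) -> BTreeMap<u64, Vec<(u64, u64)>> {
    let mut index: BTreeMap<u64, Vec<(u64, u64)>> = BTreeMap::new();
    for BatchOffset { key, lsn, offset } in offsets {
        index.entry(key).or_default().push((lsn, base_off + offset));
    }
    index
}

fn main() {
    let offsets = vec![
        BatchOffset { key: 1, lsn: 10, offset: 0 },
        BatchOffset { key: 1, lsn: 11, offset: 128 },
        BatchOffset { key: 2, lsn: 10, offset: 256 },
    ];
    // The batch landed at file offset 4096, so every index entry is 4096 + relative offset.
    let index = index_batch(4096, offsets);
    assert_eq!(index[&1], vec![(10, 4096), (11, 4096 + 128)]);
    assert_eq!(index[&2], vec![(10, 4096 + 256)]);
}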
@@ -536,7 +568,6 @@ impl InMemoryLayer {
use l0_flush::Inner;
let _concurrency_permit = match l0_flush_global_state {
Inner::PageCached => None,
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
};
@@ -568,34 +599,6 @@ impl InMemoryLayer {
.await?;
match l0_flush_global_state {
l0_flush::Inner::PageCached => {
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
let mut buf = Vec::new();
let cursor = inner.file.block_cursor();
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
let will_init = Value::des(&buf)?.will_init();
let (tmp, res) = delta_layer_writer
.put_value_bytes(
Key::from_compact(*key),
*lsn,
buf.slice_len(),
will_init,
&ctx,
)
.await;
res?;
buf = tmp.into_raw_slice().into_inner();
}
}
}
l0_flush::Inner::Direct { .. } => {
let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
assert_eq!(

View File

@@ -35,6 +35,8 @@ mod tests;
#[cfg(test)]
mod failpoints;
pub const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
/// range of LSNs.
///
@@ -1296,7 +1298,10 @@ impl LayerInner {
lsn_end: lsn_range.end,
remote: !resident,
access_stats,
l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range),
l0: crate::tenant::layer_map::LayerMap::is_l0(
&self.layer_desc().key_range,
self.layer_desc().is_delta,
),
}
} else {
let lsn = self.desc.image_layer_lsn();

View File

@@ -256,6 +256,10 @@ impl LayerName {
LayerName::Delta(layer) => &layer.key_range,
}
}
pub fn is_delta(&self) -> bool {
matches!(self, LayerName::Delta(_))
}
}
impl fmt::Display for LayerName {

View File

@@ -3,6 +3,7 @@ use std::{
collections::{binary_heap, BinaryHeap},
};
use anyhow::bail;
use pageserver_api::key::Key;
use utils::lsn::Lsn;
@@ -26,6 +27,13 @@ impl<'a> LayerRef<'a> {
Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
}
}
fn layer_dbg_info(&self) -> String {
match self {
Self::Image(x) => x.layer_dbg_info(),
Self::Delta(x) => x.layer_dbg_info(),
}
}
}
enum LayerIterRef<'a> {
@@ -40,6 +48,13 @@ impl LayerIterRef<'_> {
Self::Image(x) => x.next().await,
}
}
fn layer_dbg_info(&self) -> String {
match self {
Self::Image(x) => x.layer_dbg_info(),
Self::Delta(x) => x.layer_dbg_info(),
}
}
}
/// This type plays several roles at once
@@ -75,6 +90,11 @@ impl<'a> PeekableLayerIterRef<'a> {
async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
let result = self.peeked.take();
self.peeked = self.iter.next().await?;
if let (Some((k1, l1, _)), Some((k2, l2, _))) = (&self.peeked, &result) {
if (k1, l1) < (k2, l2) {
bail!("iterator is not ordered: {}", self.iter.layer_dbg_info());
}
}
Ok(result)
}
}
@@ -178,7 +198,12 @@ impl<'a> IteratorWrapper<'a> {
let iter = PeekableLayerIterRef::create(iter).await?;
if let Some((k1, l1, _)) = iter.peek() {
let (k2, l2) = first_key_lower_bound;
debug_assert!((k1, l1) >= (k2, l2));
if (k1, l1) < (k2, l2) {
bail!(
"layer key range did not include the first key in the layer: {}",
layer.layer_dbg_info()
);
}
}
*self = Self::Loaded { iter };
Ok(())
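Aside: the two checks added here assert that each underlying layer yields (key, lsn) pairs in non-decreasing order and that a layer's first key is not below its declared lower bound. A self-contained sketch of the ordering check over plain integer pairs:

/// Validate that a sequence of (key, lsn) pairs is sorted, mirroring the
/// peek-based check added to PeekableLayerIterRef::next above (simplified sketch).
fn check_ordered(items: &[(u64, u64)]) -> Result<(), String> {
    let mut prev: Option<(u64, u64)> = None;
    for &(k, l) in items {
        if let Some((pk, pl)) = prev {
            if (k, l) < (pk, pl) {
                return Err(format!(
                    "iterator is not ordered: ({k}, {l}) came after ({pk}, {pl})"
                ));
            }
        }
        prev = Some((k, l));
    }
    Ok(())
}

fn main() {
    assert!(check_ordered(&[(1, 10), (1, 20), (2, 5)]).is_ok());
    assert!(check_ordered(&[(2, 5), (1, 10)]).is_err());
}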

View File

@@ -1,4 +1,4 @@
use std::{ops::Range, sync::Arc};
use std::{future::Future, ops::Range, sync::Arc};
use bytes::Bytes;
use pageserver_api::key::{Key, KEY_SIZE};
@@ -7,7 +7,32 @@ use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
use crate::tenant::storage_layer::Layer;
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
use super::layer::S3_UPLOAD_LIMIT;
use super::{
DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer,
};
pub(crate) enum SplitWriterResult {
Produced(ResidentLayer),
Discarded(PersistentLayerKey),
}
#[cfg(test)]
impl SplitWriterResult {
fn into_resident_layer(self) -> ResidentLayer {
match self {
SplitWriterResult::Produced(layer) => layer,
SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
}
}
fn into_discarded_layer(self) -> PersistentLayerKey {
match self {
SplitWriterResult::Produced(_) => panic!("unexpected produced layer"),
SplitWriterResult::Discarded(layer) => layer,
}
}
}
/// An image writer that takes images and produces multiple image layers. The interface does not
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
@@ -16,11 +41,12 @@ use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
pub struct SplitImageLayerWriter {
inner: ImageLayerWriter,
target_layer_size: u64,
generated_layers: Vec<ResidentLayer>,
generated_layers: Vec<SplitWriterResult>,
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
lsn: Lsn,
start_key: Key,
}
impl SplitImageLayerWriter {
@@ -49,16 +75,22 @@ impl SplitImageLayerWriter {
timeline_id,
tenant_shard_id,
lsn,
start_key,
})
}
pub async fn put_image(
pub async fn put_image_with_discard_fn<D, F>(
&mut self,
key: Key,
img: Bytes,
tline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
discard: D,
) -> anyhow::Result<()>
where
D: FnOnce(&PersistentLayerKey) -> F,
F: Future<Output = bool>,
{
// The current estimation is an upper bound of the space that the key/image could take
// because we did not consider compression in this estimation. The resulting image layer
// could be smaller than the target size.
@@ -76,33 +108,87 @@ impl SplitImageLayerWriter {
)
.await?;
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
self.generated_layers.push(
prev_image_writer
.finish_with_end_key(tline, key, ctx)
.await?,
);
let layer_key = PersistentLayerKey {
key_range: self.start_key..key,
lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
is_delta: false,
};
self.start_key = key;
if discard(&layer_key).await {
drop(prev_image_writer);
self.generated_layers
.push(SplitWriterResult::Discarded(layer_key));
} else {
self.generated_layers.push(SplitWriterResult::Produced(
prev_image_writer
.finish_with_end_key(tline, key, ctx)
.await?,
));
}
}
self.inner.put_image(key, img, ctx).await
}
pub(crate) async fn finish(
#[cfg(test)]
pub async fn put_image(
&mut self,
key: Key,
img: Bytes,
tline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.put_image_with_discard_fn(key, img, tline, ctx, |_| async { false })
.await
}
pub(crate) async fn finish_with_discard_fn<D, F>(
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
) -> anyhow::Result<Vec<ResidentLayer>> {
discard: D,
) -> anyhow::Result<Vec<SplitWriterResult>>
where
D: FnOnce(&PersistentLayerKey) -> F,
F: Future<Output = bool>,
{
let Self {
mut generated_layers,
inner,
..
} = self;
generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
if inner.num_keys() == 0 {
return Ok(generated_layers);
}
let layer_key = PersistentLayerKey {
key_range: self.start_key..end_key,
lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
is_delta: false,
};
if discard(&layer_key).await {
generated_layers.push(SplitWriterResult::Discarded(layer_key));
} else {
generated_layers.push(SplitWriterResult::Produced(
inner.finish_with_end_key(tline, end_key, ctx).await?,
));
}
Ok(generated_layers)
}
#[cfg(test)]
pub(crate) async fn finish(
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
) -> anyhow::Result<Vec<SplitWriterResult>> {
self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
.await
}
/// When split writer fails, the caller should call this function and handle partially generated layers.
#[allow(dead_code)]
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
Ok((self.generated_layers, self.inner))
}
}
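Aside: the discard callback introduced here lets the caller veto writing a layer file after the writer has decided to cut one, which is how gc-compaction later skips layers whose key already exists in the same generation. A simplified, self-contained mirror of that control flow (assumes the futures crate for block_on; the real writer types are not used here):

use std::future::Future;

/// Simplified stand-in for PersistentLayerKey.
#[derive(Clone, Debug, PartialEq)]
struct LayerKey {
    key_range: std::ops::Range<u64>,
}

/// Simplified stand-in for SplitWriterResult.
#[derive(Debug, PartialEq)]
enum WriterResult {
    Produced(LayerKey),
    Discarded(LayerKey),
}

/// Mirror of the control flow in put_image_with_discard_fn: when the writer
/// decides to cut a layer, ask the discard callback whether to keep the file.
async fn cut_layer<D, F>(key: LayerKey, discard: D) -> WriterResult
where
    D: FnOnce(&LayerKey) -> F,
    F: Future<Output = bool>,
{
    if discard(&key).await {
        WriterResult::Discarded(key)
    } else {
        WriterResult::Produced(key)
    }
}

fn main() {
    let existing = LayerKey { key_range: 0..100 };
    // Discard when the same key already exists in the (pretend) layer map.
    // block_on is from the futures crate, assumed as a dependency for this sketch.
    let result = futures::executor::block_on(cut_layer(existing.clone(), |k| {
        let already_present = *k == existing;
        async move { already_present }
    }));
    assert_eq!(result, WriterResult::Discarded(existing));
}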
@@ -110,15 +196,21 @@ impl SplitImageLayerWriter {
/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
/// to be cleaned up).
///
/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
/// will split them into multiple files based on size.
#[must_use]
pub struct SplitDeltaLayerWriter {
inner: DeltaLayerWriter,
target_layer_size: u64,
generated_layers: Vec<ResidentLayer>,
generated_layers: Vec<SplitWriterResult>,
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
lsn_range: Range<Lsn>,
last_key_written: Key,
start_key: Key,
}
impl SplitDeltaLayerWriter {
@@ -147,9 +239,74 @@ impl SplitDeltaLayerWriter {
timeline_id,
tenant_shard_id,
lsn_range,
last_key_written: Key::MIN,
start_key,
})
}
/// Put a value into the layer writer. If the writer decides to produce a layer and the discard fn returns true, no layer file will be written in the end.
pub async fn put_value_with_discard_fn<D, F>(
&mut self,
key: Key,
lsn: Lsn,
val: Value,
tline: &Arc<Timeline>,
ctx: &RequestContext,
discard: D,
) -> anyhow::Result<()>
where
D: FnOnce(&PersistentLayerKey) -> F,
F: Future<Output = bool>,
{
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
//
// Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction
// strategy. https://github.com/neondatabase/neon/issues/8837
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
if self.inner.num_keys() >= 1
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
{
if key != self.last_key_written {
let next_delta_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
key,
self.lsn_range.clone(),
ctx,
)
.await?;
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
let layer_key = PersistentLayerKey {
key_range: self.start_key..key,
lsn_range: self.lsn_range.clone(),
is_delta: true,
};
self.start_key = key;
if discard(&layer_key).await {
drop(prev_delta_writer);
self.generated_layers
.push(SplitWriterResult::Discarded(layer_key));
} else {
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
self.generated_layers
.push(SplitWriterResult::Produced(delta_layer));
}
} else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT {
// We have to produce a very large file b/c a key is updated too often.
anyhow::bail!(
"a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced",
key,
self.inner.estimated_size()
);
}
}
self.last_key_written = key;
self.inner.put_value(key, lsn, val, ctx).await
}
pub async fn put_value(
&mut self,
key: Key,
@@ -158,56 +315,64 @@ impl SplitDeltaLayerWriter {
tline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
if self.inner.num_keys() >= 1
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
{
let next_delta_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
key,
self.lsn_range.clone(),
ctx,
)
.await?;
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
self.generated_layers.push(delta_layer);
}
self.inner.put_value(key, lsn, val, ctx).await
self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false })
.await
}
pub(crate) async fn finish(
pub(crate) async fn finish_with_discard_fn<D, F>(
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
) -> anyhow::Result<Vec<ResidentLayer>> {
discard: D,
) -> anyhow::Result<Vec<SplitWriterResult>>
where
D: FnOnce(&PersistentLayerKey) -> F,
F: Future<Output = bool>,
{
let Self {
mut generated_layers,
inner,
..
} = self;
let (desc, path) = inner.finish(end_key, ctx).await?;
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
generated_layers.push(delta_layer);
if inner.num_keys() == 0 {
return Ok(generated_layers);
}
let layer_key = PersistentLayerKey {
key_range: self.start_key..end_key,
lsn_range: self.lsn_range.clone(),
is_delta: true,
};
if discard(&layer_key).await {
generated_layers.push(SplitWriterResult::Discarded(layer_key));
} else {
let (desc, path) = inner.finish(end_key, ctx).await?;
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
generated_layers.push(SplitWriterResult::Produced(delta_layer));
}
Ok(generated_layers)
}
/// When split writer fails, the caller should call this function and handle partially generated layers.
#[allow(dead_code)]
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
pub(crate) async fn finish(
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
) -> anyhow::Result<Vec<SplitWriterResult>> {
self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
.await
}
/// When split writer fails, the caller should call this function and handle partially generated layers.
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, DeltaLayerWriter)> {
Ok((self.generated_layers, self.inner))
}
}
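Aside: the layer-cut decision in put_value_with_discard_fn above has three outcomes: keep appending while below the soft target, cut a new layer when the key changes past the target, and bail only if a single key's updates exceed the hard S3 object-size limit. A self-contained sketch of that decision, with the writer state passed in explicitly (simplified, not the real API):

/// Simplified mirror of the layer-cut decision in put_value_with_discard_fn.
#[derive(Debug, PartialEq)]
enum Decision {
    /// Keep appending to the current layer file.
    Append,
    /// Cut the current layer and start a new one at `key`.
    CutLayer,
    /// A single key grew past the hard S3 object-size limit: error out.
    TooLarge,
}

const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;

fn decide(
    num_keys: usize,
    estimated_size: u64,
    addition_size_estimation: u64,
    target_layer_size: u64,
    key: u64,
    last_key_written: u64,
) -> Decision {
    if num_keys >= 1 && estimated_size + addition_size_estimation >= target_layer_size {
        if key != last_key_written {
            Decision::CutLayer
        } else if estimated_size >= S3_UPLOAD_LIMIT {
            Decision::TooLarge
        } else {
            // Same key: keep all of its updates in one file, even past the soft target.
            Decision::Append
        }
    } else {
        Decision::Append
    }
}

fn main() {
    let target = 256 * 1024 * 1024;
    // Below the soft target: always append.
    assert_eq!(decide(10, 1024, 100, target, 2, 1), Decision::Append);
    // Past the soft target and the key changed: cut a new layer.
    assert_eq!(decide(10, target, 100, target, 2, 1), Decision::CutLayer);
    // Past the soft target but still the same key: keep appending.
    assert_eq!(decide(10, target, 100, target, 1, 1), Decision::Append);
    // Same key ballooned past the hard S3 limit: bail.
    assert_eq!(decide(10, S3_UPLOAD_LIMIT, 100, target, 1, 1), Decision::TooLarge);
}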
#[cfg(test)]
mod tests {
use itertools::Itertools;
use rand::{RngCore, SeedableRng};
use crate::{
@@ -302,9 +467,16 @@ mod tests {
#[tokio::test]
async fn write_split() {
let harness = TenantHarness::create("split_writer_write_split")
.await
.unwrap();
write_split_helper("split_writer_write_split", false).await;
}
#[tokio::test]
async fn write_split_discard() {
write_split_helper("split_writer_write_split_discard", false).await;
}
async fn write_split_helper(harness_name: &'static str, discard: bool) {
let harness = TenantHarness::create(harness_name).await.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -338,16 +510,19 @@ mod tests {
for i in 0..N {
let i = i as u32;
image_writer
.put_image(get_key(i), get_large_img(), &tline, &ctx)
.put_image_with_discard_fn(get_key(i), get_large_img(), &tline, &ctx, |_| async {
discard
})
.await
.unwrap();
delta_writer
.put_value(
.put_value_with_discard_fn(
get_key(i),
Lsn(0x20),
Value::Image(get_large_img()),
&tline,
&ctx,
|_| async { discard },
)
.await
.unwrap();
@@ -360,22 +535,39 @@ mod tests {
.finish(&tline, &ctx, get_key(N as u32))
.await
.unwrap();
assert_eq!(image_layers.len(), N / 512 + 1);
assert_eq!(delta_layers.len(), N / 512 + 1);
for idx in 0..image_layers.len() {
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
if idx > 0 {
assert_eq!(
image_layers[idx - 1].layer_desc().key_range.end,
image_layers[idx].layer_desc().key_range.start
);
assert_eq!(
delta_layers[idx - 1].layer_desc().key_range.end,
delta_layers[idx].layer_desc().key_range.start
);
if discard {
for layer in image_layers {
layer.into_discarded_layer();
}
for layer in delta_layers {
layer.into_discarded_layer();
}
} else {
let image_layers = image_layers
.into_iter()
.map(|x| x.into_resident_layer())
.collect_vec();
let delta_layers = delta_layers
.into_iter()
.map(|x| x.into_resident_layer())
.collect_vec();
assert_eq!(image_layers.len(), N / 512 + 1);
assert_eq!(delta_layers.len(), N / 512 + 1);
for idx in 0..image_layers.len() {
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
if idx > 0 {
assert_eq!(
image_layers[idx - 1].layer_desc().key_range.end,
image_layers[idx].layer_desc().key_range.start
);
assert_eq!(
delta_layers[idx - 1].layer_desc().key_range.end,
delta_layers[idx].layer_desc().key_range.start
);
}
}
}
}
@@ -456,4 +648,49 @@ mod tests {
.unwrap();
assert_eq!(layers.len(), 2);
}
#[tokio::test]
async fn write_split_single_key() {
let harness = TenantHarness::create("split_writer_write_split_single_key")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
const N: usize = 2000;
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
4 * 1024 * 1024,
&ctx,
)
.await
.unwrap();
for i in 0..N {
let i = i as u32;
delta_writer
.put_value(
get_key(0),
Lsn(i as u64 * 16 + 0x10),
Value::Image(get_large_img()),
&tline,
&ctx,
)
.await
.unwrap();
}
let delta_layers = delta_writer
.finish(&tline, &ctx, get_key(N as u32))
.await
.unwrap();
assert_eq!(delta_layers.len(), 1);
}
}

View File

@@ -61,21 +61,12 @@ impl BackgroundLoopKind {
}
}
static PERMIT_GAUGES: once_cell::sync::Lazy<
enum_map::EnumMap<BackgroundLoopKind, metrics::IntCounterPair>,
> = once_cell::sync::Lazy::new(|| {
enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()])
}))
});
/// Cancellation safe.
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
loop_kind: BackgroundLoopKind,
_ctx: &RequestContext,
) -> tokio::sync::SemaphorePermit<'static> {
let _guard = PERMIT_GAUGES[loop_kind].guard();
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);
pausable_failpoint!(
"initial-size-calculation-permit-pause",
@@ -98,7 +89,7 @@ pub fn start_background_loops(
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Compaction,
Some(tenant_shard_id),
tenant_shard_id,
None,
&format!("compactor for tenant {tenant_shard_id}"),
{
@@ -121,7 +112,7 @@ pub fn start_background_loops(
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::GarbageCollector,
Some(tenant_shard_id),
tenant_shard_id,
None,
&format!("garbage collector for tenant {tenant_shard_id}"),
{
@@ -144,7 +135,7 @@ pub fn start_background_loops(
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::IngestHousekeeping,
Some(tenant_shard_id),
tenant_shard_id,
None,
&format!("ingest housekeeping for tenant {tenant_shard_id}"),
{

View File

@@ -22,8 +22,8 @@ use handle::ShardTimelineId;
use once_cell::sync::Lazy;
use pageserver_api::{
key::{
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
NON_INHERITED_SPARSE_RANGE,
CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
},
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
models::{
@@ -44,10 +44,8 @@ use tokio::{
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{
bin_ser::BeSer,
fs_ext, pausable_failpoint,
sync::gate::{Gate, GateGuard},
vec_map::VecMap,
};
use std::pin::pin;
@@ -137,7 +135,10 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized};
use super::{
config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint,
upload_queue::NotInitialized,
};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{
@@ -2233,6 +2234,11 @@ impl Timeline {
handles: Default::default(),
};
if aux_file_policy == Some(AuxFilePolicy::V1) {
warn!("this timeline is using deprecated aux file policy V1");
}
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2281,7 +2287,7 @@ impl Timeline {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::LayerFlushTask,
Some(self.tenant_shard_id),
self.tenant_shard_id,
Some(self.timeline_id),
"layer flush task",
async move {
@@ -2635,7 +2641,7 @@ impl Timeline {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::InitialLogicalSizeCalculation,
Some(self.tenant_shard_id),
self.tenant_shard_id,
Some(self.timeline_id),
"initial size calculation",
// NB: don't log errors here, task_mgr will do that.
@@ -2803,7 +2809,7 @@ impl Timeline {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::OndemandLogicalSizeCalculation,
Some(self.tenant_shard_id),
self.tenant_shard_id,
Some(self.timeline_id),
"ondemand logical size calculation",
async move {
@@ -2996,7 +3002,10 @@ impl Timeline {
// - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
// the layer is likely to be covered by an image layer during compaction.
layers.sort_by_key(|(desc, _meta, _atime)| {
std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
std::cmp::Reverse((
!LayerMap::is_l0(&desc.key_range, desc.is_delta),
desc.lsn_range.end,
))
});
let layers = layers
@@ -3589,34 +3598,6 @@ impl Timeline {
return Err(FlushLayerError::Cancelled);
}
// FIXME(auxfilesv2): support multiple metadata key partitions might need initdb support as well?
// This code path will not be hit during regression tests. After #7099 we have a single partition
// with two key ranges. If someone wants to fix initdb optimization in the future, this might need
// to be fixed.
// For metadata, always create delta layers.
let delta_layer = if !metadata_partition.parts.is_empty() {
assert_eq!(
metadata_partition.parts.len(),
1,
"currently sparse keyspace should only contain a single metadata keyspace"
);
let metadata_keyspace = &metadata_partition.parts[0];
self.create_delta_layer(
&frozen_layer,
Some(
metadata_keyspace.0.ranges.first().unwrap().start
..metadata_keyspace.0.ranges.last().unwrap().end,
),
ctx,
)
.await
.map_err(|e| FlushLayerError::from_anyhow(self, e))?
} else {
None
};
// For image layers, we add them immediately into the layer map.
let mut layers_to_upload = Vec::new();
layers_to_upload.extend(
self.create_image_layers(
@@ -3627,13 +3608,27 @@ impl Timeline {
)
.await?,
);
if let Some(delta_layer) = delta_layer {
layers_to_upload.push(delta_layer.clone());
(layers_to_upload, Some(delta_layer))
} else {
(layers_to_upload, None)
if !metadata_partition.parts.is_empty() {
assert_eq!(
metadata_partition.parts.len(),
1,
"currently sparse keyspace should only contain a single metadata keyspace"
);
layers_to_upload.extend(
self.create_image_layers(
// Safety: create_image_layers treats sparse keyspaces differently in that it does not scan
// every single key within the keyspace, and therefore it's safe to force-convert it
// into a dense keyspace before calling this function.
&metadata_partition.into_dense(),
self.initdb_lsn,
ImageLayerCreationMode::Initial,
ctx,
)
.await?,
);
}
(layers_to_upload, None)
} else {
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
@@ -4043,8 +4038,6 @@ impl Timeline {
mode: ImageLayerCreationMode,
start: Key,
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
assert!(!matches!(mode, ImageLayerCreationMode::Initial));
// Metadata keys image layer creation.
let mut reconstruct_state = ValuesReconstructState::default();
let data = self
@@ -4210,15 +4203,13 @@ impl Timeline {
"metadata keys must be partitioned separately"
);
}
if mode == ImageLayerCreationMode::Initial {
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
}
if mode == ImageLayerCreationMode::Try && !check_for_image_layers {
// Skip compaction if there are not enough updates. Metadata compaction will do a scan and
// might mess up with evictions.
start = img_range.end;
continue;
}
// For initial and force modes, we always generate image layers for metadata keys.
} else if let ImageLayerCreationMode::Try = mode {
// check_for_image_layers = false -> skip
// check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
@@ -4226,7 +4217,8 @@ impl Timeline {
start = img_range.end;
continue;
}
} else if let ImageLayerCreationMode::Force = mode {
}
if let ImageLayerCreationMode::Force = mode {
// When forced to create image layers, we might try and create them where they already
// exist. This mode is only used in tests/debug.
let layers = self.layers.read().await;
@@ -4240,6 +4232,7 @@ impl Timeline {
img_range.start,
img_range.end
);
start = img_range.end;
continue;
}
}
@@ -4595,7 +4588,7 @@ impl Timeline {
// for compact_level0_phase1 creating an L0, which does not happen in practice
// because we have not implemented L0 => L0 compaction.
duplicated_layers.insert(l.layer_desc().key());
} else if LayerMap::is_l0(&l.layer_desc().key_range) {
} else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) {
return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
} else {
insert_layers.push(l.clone());
@@ -5162,7 +5155,7 @@ impl Timeline {
let task_id = task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::DownloadAllRemoteLayers,
Some(self.tenant_shard_id),
self.tenant_shard_id,
Some(self.timeline_id),
"download all remote layers task",
async move {
@@ -5451,12 +5444,17 @@ impl Timeline {
!(a.end <= b.start || b.end <= a.start)
}
let guard = self.layers.read().await;
for layer in guard.layer_map()?.iter_historic_layers() {
if layer.is_delta()
&& overlaps_with(&layer.lsn_range, &deltas.lsn_range)
&& layer.lsn_range != deltas.lsn_range
{
if deltas.key_range.start.next() != deltas.key_range.end {
let guard = self.layers.read().await;
let mut invalid_layers =
guard.layer_map()?.iter_historic_layers().filter(|layer| {
layer.is_delta()
&& overlaps_with(&layer.lsn_range, &deltas.lsn_range)
&& layer.lsn_range != deltas.lsn_range
// skip single-key layer files
&& layer.key_range.start.next() != layer.key_range.end
});
if let Some(layer) = invalid_layers.next() {
// If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
panic!(
"inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
@@ -5590,44 +5588,6 @@ enum OpenLayerAction {
}
impl<'a> TimelineWriter<'a> {
/// Put a new page version that can be constructed from a WAL record
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
pub(crate) async fn put(
&mut self,
key: Key,
lsn: Lsn,
value: &Value,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// Avoid doing allocations for "small" values.
// In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
// https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
value.ser_into(&mut buf)?;
let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
let action = self.get_open_layer_action(lsn, buf_size);
let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await;
if res.is_ok() {
// Update the current size only when the entire write was ok.
// In case of failures, we may have had partial writes which
// render the size tracking out of sync. That's ok because
// the checkpoint distance should be significantly smaller
// than the S3 single shot upload limit of 5GiB.
let state = self.write_guard.as_mut().unwrap();
state.current_size += buf_size;
state.prev_lsn = Some(lsn);
state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
}
res
}
async fn handle_open_layer_action(
&mut self,
at: Lsn,
@@ -5733,18 +5693,58 @@ impl<'a> TimelineWriter<'a> {
}
/// Put a batch of keys at the specified Lsns.
///
/// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]).
pub(crate) async fn put_batch(
&mut self,
batch: VecMap<Lsn, (Key, Value)>,
batch: Vec<(CompactKey, Lsn, usize, Value)>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
for (lsn, (key, val)) in batch {
self.put(key, lsn, &val, ctx).await?
if batch.is_empty() {
return Ok(());
}
Ok(())
let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch);
let batch_max_lsn = serialized_batch.max_lsn;
let buf_size: u64 = serialized_batch.raw.len() as u64;
let action = self.get_open_layer_action(batch_max_lsn, buf_size);
let layer = self
.handle_open_layer_action(batch_max_lsn, action, ctx)
.await?;
let res = layer.put_batch(serialized_batch, ctx).await;
if res.is_ok() {
// Update the current size only when the entire write was ok.
// In case of failures, we may have had partial writes which
// render the size tracking out of sync. That's ok because
// the checkpoint distance should be significantly smaller
// than the S3 single shot upload limit of 5GiB.
let state = self.write_guard.as_mut().unwrap();
state.current_size += buf_size;
state.prev_lsn = Some(batch_max_lsn);
state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn));
}
res
}
#[cfg(test)]
/// Test helper, for tests that would like to poke individual values without composing a batch
pub(crate) async fn put(
&mut self,
key: Key,
lsn: Lsn,
value: &Value,
ctx: &RequestContext,
) -> anyhow::Result<()> {
use utils::bin_ser::BeSer;
let val_ser_size = value.serialized_size().unwrap() as usize;
self.put_batch(
vec![(key.to_compact(), lsn, val_ser_size, value.clone())],
ctx,
)
.await
}
pub(crate) async fn delete_batch(
@@ -5885,7 +5885,7 @@ mod tests {
};
// Apart from L0s, newest Layers should come first
if !LayerMap::is_l0(layer.name.key_range()) {
if !LayerMap::is_l0(layer.name.key_range(), layer.name.is_delta()) {
assert!(layer_lsn <= last_lsn);
last_lsn = layer_lsn;
}

View File

@@ -14,7 +14,7 @@ use super::{
RecordedDuration, Timeline,
};
use anyhow::{anyhow, Context};
use anyhow::{anyhow, bail, Context};
use bytes::Bytes;
use enumset::EnumSet;
use fail::fail_point;
@@ -32,6 +32,9 @@ use crate::page_cache;
use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
use crate::tenant::storage_layer::split_writer::{
SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
};
use crate::tenant::storage_layer::{
AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
};
@@ -71,15 +74,60 @@ pub(crate) struct KeyHistoryRetention {
}
impl KeyHistoryRetention {
/// Hack: skip the delta layer if we need to produce a layer with the same key/LSN range.
///
/// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
/// For example, consider the case where a single delta with range [0x10,0x50) exists.
/// And we have branches at LSN 0x10, 0x20, 0x30.
/// Then we delete branch @ 0x20.
/// Bottom-most compaction may now delete the delta [0x20,0x30).
/// And that wouldn't change the shape of the layer.
///
/// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
///
/// `discard_key` will only be called when the writer reaches its target (instead of for every key), so it's fine to grab a lock inside.
async fn discard_key(key: &PersistentLayerKey, tline: &Arc<Timeline>, dry_run: bool) -> bool {
if dry_run {
return true;
}
let guard = tline.layers.read().await;
if !guard.contains_key(key) {
return false;
}
let layer_generation = guard.get_from_key(key).metadata().generation;
drop(guard);
if layer_generation == tline.generation {
info!(
key=%key,
?layer_generation,
"discard layer due to duplicated layer key in the same generation",
);
true
} else {
false
}
}
/// Pipe a history of a single key to the writers.
///
/// If `image_writer` is none, the images will be placed into the delta layers.
/// The delta writer will contain all images and deltas (below and above the horizon) except the bottom-most images.
#[allow(clippy::too_many_arguments)]
async fn pipe_to(
self,
key: Key,
delta_writer: &mut Vec<(Key, Lsn, Value)>,
mut image_writer: Option<&mut ImageLayerWriter>,
tline: &Arc<Timeline>,
delta_writer: &mut SplitDeltaLayerWriter,
mut image_writer: Option<&mut SplitImageLayerWriter>,
stat: &mut CompactionStatistics,
dry_run: bool,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut first_batch = true;
let discard = |key: &PersistentLayerKey| {
let key = key.clone();
async move { Self::discard_key(&key, tline, dry_run).await }
};
for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
if first_batch {
if logs.len() == 1 && logs[0].1.is_image() {
@@ -88,28 +136,45 @@ impl KeyHistoryRetention {
};
stat.produce_image_key(img);
if let Some(image_writer) = image_writer.as_mut() {
image_writer.put_image(key, img.clone(), ctx).await?;
image_writer
.put_image_with_discard_fn(key, img.clone(), tline, ctx, discard)
.await?;
} else {
delta_writer.push((key, cutoff_lsn, Value::Image(img.clone())));
delta_writer
.put_value_with_discard_fn(
key,
cutoff_lsn,
Value::Image(img.clone()),
tline,
ctx,
discard,
)
.await?;
}
} else {
for (lsn, val) in logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val));
delta_writer
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
.await?;
}
}
first_batch = false;
} else {
for (lsn, val) in logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val));
delta_writer
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
.await?;
}
}
}
let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
for (lsn, val) in above_horizon_logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val));
delta_writer
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
.await?;
}
Ok(())
}
@@ -1814,11 +1879,27 @@ impl Timeline {
}
let mut selected_layers = Vec::new();
drop(gc_info);
// Pick all the layers that intersect or are below the gc_cutoff, and get the largest LSN among the selected layers.
let Some(max_layer_lsn) = layers
.iter_historic_layers()
.filter(|desc| desc.get_lsn_range().start <= gc_cutoff)
.map(|desc| desc.get_lsn_range().end)
.max()
else {
info!("no layers to compact with gc");
return Ok(());
};
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
// layers to compact.
for desc in layers.iter_historic_layers() {
if desc.get_lsn_range().start <= gc_cutoff {
if desc.get_lsn_range().end <= max_layer_lsn {
selected_layers.push(guard.get_from_desc(&desc));
}
}
if selected_layers.is_empty() {
info!("no layers to compact with gc");
return Ok(());
}
retain_lsns_below_horizon.sort();
(selected_layers, gc_cutoff, retain_lsns_below_horizon)
};
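Aside: the selection above is two passes: first find max_layer_lsn, the largest end-LSN among layers whose LSN range starts at or below the gc_cutoff, then take every layer whose LSN range ends at or below that maximum. A self-contained sketch over plain (start, end) LSN pairs:

/// Two-pass layer selection used by gc-compaction, over simplified LSN ranges.
fn select_layers(layers: &[(u64, u64)], gc_cutoff: u64) -> Vec<(u64, u64)> {
    // Pass 1: the largest end-LSN among layers that intersect or are below the cutoff.
    let Some(max_layer_lsn) = layers
        .iter()
        .filter(|(start, _)| *start <= gc_cutoff)
        .map(|(_, end)| *end)
        .max()
    else {
        return Vec::new();
    };
    // Pass 2: everything that ends at or below that LSN, so the selection also
    // covers single-key layers stacked below max_layer_lsn.
    layers
        .iter()
        .copied()
        .filter(|(_, end)| *end <= max_layer_lsn)
        .collect()
}

fn main() {
    // Layers as (lsn_start, lsn_end), with gc_cutoff = 0x40.
    let layers = [(0x10, 0x30), (0x30, 0x50), (0x50, 0x60), (0x20, 0x48)];
    // (0x30, 0x50) starts below the cutoff, so max_layer_lsn = 0x50, and the
    // selection picks every layer ending at or below 0x50.
    assert_eq!(
        select_layers(&layers, 0x40),
        vec![(0x10, 0x30), (0x30, 0x50), (0x20, 0x48)]
    );
}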
@@ -1848,27 +1929,53 @@ impl Timeline {
lowest_retain_lsn
);
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
// Also, collect the layer information to decide when to split the new delta layers.
let mut downloaded_layers = Vec::new();
let mut delta_split_points = BTreeSet::new();
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
for layer in &layer_selection {
let resident_layer = layer.download_and_keep_resident().await?;
downloaded_layers.push(resident_layer);
let desc = layer.layer_desc();
if desc.is_delta() {
// TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon)
// so that we can avoid having too many small delta layers.
let key_range = desc.get_key_range();
delta_split_points.insert(key_range.start);
delta_split_points.insert(key_range.end);
// ignore single-key layer files
if desc.key_range.start.next() != desc.key_range.end {
let lsn_range = &desc.lsn_range;
lsn_split_point.insert(lsn_range.start);
lsn_split_point.insert(lsn_range.end);
}
stat.visit_delta_layer(desc.file_size());
} else {
stat.visit_image_layer(desc.file_size());
}
}
for layer in &layer_selection {
let desc = layer.layer_desc();
let key_range = &desc.key_range;
if desc.is_delta() && key_range.start.next() != key_range.end {
let lsn_range = desc.lsn_range.clone();
let intersects = lsn_split_point.range(lsn_range).collect_vec();
if intersects.len() > 1 {
bail!(
"cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
desc.key(),
intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
);
}
}
}
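Aside: the pre-flight check above collects every multi-key delta layer's LSN start and end into a set, then requires that no other boundary fall inside a layer's own LSN range; since a layer's own start is always in the set, more than one hit in [start, end) means the layer map cannot be split by horizontal lines and gc-compaction bails. A simplified, self-contained version of that check (the single-key filtering is omitted here):

use std::collections::BTreeSet;

/// Check the "layer map LSN split" assumption the way the gc-compaction
/// pre-flight pass does, over simplified (start, end) LSN ranges.
fn lsn_split_assumption_holds(delta_lsn_ranges: &[(u64, u64)]) -> bool {
    let mut lsn_split_point = BTreeSet::new();
    for &(start, end) in delta_lsn_ranges {
        lsn_split_point.insert(start);
        lsn_split_point.insert(end);
    }
    delta_lsn_ranges.iter().all(|&(start, end)| {
        // `start` itself is always in the set, so more than one hit in
        // [start, end) means another layer's boundary cuts through this one.
        lsn_split_point.range(start..end).count() <= 1
    })
}

fn main() {
    // Clean horizontal split: every boundary lines up.
    assert!(lsn_split_assumption_holds(&[(0x10, 0x30), (0x30, 0x50), (0x10, 0x30)]));
    // 0x20 is another layer's boundary inside (0x10, 0x50): violation.
    assert!(!lsn_split_assumption_holds(&[(0x10, 0x50), (0x10, 0x20), (0x20, 0x50)]));
}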
// The maximum LSN we are processing in this compaction loop
let end_lsn = layer_selection
.iter()
.map(|l| l.layer_desc().lsn_range.end)
.max()
.unwrap();
// We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
// as an L0 layer.
let hack_end_key = Key::NON_L0_MAX;
let mut delta_layers = Vec::new();
let mut image_layers = Vec::new();
let mut downloaded_layers = Vec::new();
for layer in &layer_selection {
let resident_layer = layer.download_and_keep_resident().await?;
downloaded_layers.push(resident_layer);
}
for resident_layer in &downloaded_layers {
if resident_layer.layer_desc().is_delta() {
let layer = resident_layer.get_as_delta(ctx).await?;
@@ -1884,138 +1991,17 @@ impl Timeline {
let mut accumulated_values = Vec::new();
let mut last_key: Option<Key> = None;
enum FlushDeltaResult {
/// Create a new resident layer
CreateResidentLayer(ResidentLayer),
/// Keep an original delta layer
KeepLayer(PersistentLayerKey),
}
#[allow(clippy::too_many_arguments)]
async fn flush_deltas(
deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
last_key: Key,
delta_split_points: &[Key],
current_delta_split_point: &mut usize,
tline: &Arc<Timeline>,
lowest_retain_lsn: Lsn,
ctx: &RequestContext,
stats: &mut CompactionStatistics,
dry_run: bool,
last_batch: bool,
) -> anyhow::Result<Option<FlushDeltaResult>> {
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
// overlapping layers.
//
// If we have a structure like this:
//
// | Delta 1 | | Delta 4 |
// |---------| Delta 2 |---------|
// | Delta 3 | | Delta 5 |
//
// And we choose to compact delta 2+3+5. We will get an overlapping delta layer with delta 1+4.
// A simple solution here is to split the delta layers using the original boundary, while this
// might produce a lot of small layers. This should be improved and fixed in the future.
let mut need_split = false;
while *current_delta_split_point < delta_split_points.len()
&& last_key >= delta_split_points[*current_delta_split_point]
{
*current_delta_split_point += 1;
need_split = true;
}
if !need_split && !last_batch {
return Ok(None);
}
let deltas: Vec<(Key, Lsn, Value)> = std::mem::take(deltas);
if deltas.is_empty() {
return Ok(None);
}
let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1;
let delta_key = PersistentLayerKey {
key_range: {
let key_start = deltas.first().unwrap().0;
let key_end = deltas.last().unwrap().0.next();
key_start..key_end
},
lsn_range: lowest_retain_lsn..end_lsn,
is_delta: true,
};
{
// Hack: skip the delta layer if we need to produce a layer with the same key/LSN range.
//
// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
// For example, consider the case where a single delta with range [0x10,0x50) exists.
// And we have branches at LSN 0x10, 0x20, 0x30.
// Then we delete branch @ 0x20.
// Bottom-most compaction may now delete the delta [0x20,0x30).
// And that wouldn't change the shape of the layer.
//
// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
// That's why it's safe to skip.
let guard = tline.layers.read().await;
if guard.contains_key(&delta_key) {
let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
drop(guard);
if layer_generation == tline.generation {
stats.discard_delta_layer();
// TODO: depending on whether we design this compaction process to run along with
// other compactions, there could be layer map modifications after we drop the
// layer guard, and in case it creates duplicated layer key, we will still error
// in the end.
info!(
key=%delta_key,
?layer_generation,
"discard delta layer due to duplicated layer in the same generation"
);
return Ok(Some(FlushDeltaResult::KeepLayer(delta_key)));
}
}
}
let mut delta_layer_writer = DeltaLayerWriter::new(
tline.conf,
tline.timeline_id,
tline.tenant_shard_id,
delta_key.key_range.start,
lowest_retain_lsn..end_lsn,
ctx,
)
.await?;
for (key, lsn, val) in deltas {
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
}
stats.produce_delta_layer(delta_layer_writer.size());
if dry_run {
return Ok(None);
}
let (desc, path) = delta_layer_writer
.finish(delta_key.key_range.end, ctx)
.await?;
let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?;
Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer)))
}
// Hack the key range to be min..(max-1). Otherwise, the image layer will be
// interpreted as an L0 delta layer.
let hack_image_layer_range = {
let mut end_key = Key::MAX;
end_key.field6 -= 1;
Key::MIN..end_key
};
// Only create image layers when there is no ancestor branches. TODO: create covering image layer
// when some condition meet.
let mut image_layer_writer = if self.ancestor_timeline.is_none() {
Some(
ImageLayerWriter::new(
SplitImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
&hack_image_layer_range, // covers the full key range
Key::MIN,
lowest_retain_lsn,
self.get_compaction_target_size(),
ctx,
)
.await?,
@@ -2024,6 +2010,17 @@ impl Timeline {
None
};
let mut delta_layer_writer = SplitDeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
Key::MIN,
lowest_retain_lsn..end_lsn,
self.get_compaction_target_size(),
ctx,
)
.await?;
/// Returns None if there is no ancestor branch. Throw an error when the key is not found.
///
/// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
@@ -2044,47 +2041,11 @@ impl Timeline {
let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
Ok(Some((key, tline.ancestor_lsn, img)))
}
let image_layer_key = PersistentLayerKey {
key_range: hack_image_layer_range,
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lowest_retain_lsn),
is_delta: false,
};
// Like with delta layers, it can happen that we re-produce an already existing image layer.
// This could happen when a user triggers force compaction and image generation. In this case,
// it's always safe to rewrite the layer.
let discard_image_layer = {
let guard = self.layers.read().await;
if guard.contains_key(&image_layer_key) {
let layer_generation = guard.get_from_key(&image_layer_key).metadata().generation;
drop(guard);
if layer_generation == self.generation {
// TODO: depending on whether we design this compaction process to run along with
// other compactions, there could be layer map modifications after we drop the
// layer guard, and in case it creates duplicated layer key, we will still error
// in the end.
info!(
key=%image_layer_key,
?layer_generation,
"discard image layer due to duplicated layer key in the same generation",
);
true
} else {
false
}
} else {
false
}
};
// Strictly speaking, we could skip writing the image layer entirely at this point, because
// the key and LSN range are already determined. However, to keep things simple, we still
// create the writer here and discard it at the end.
let mut delta_values = Vec::new();
let delta_split_points = delta_split_points.into_iter().collect_vec();
let mut current_delta_split_point = 0;
let mut delta_layers = Vec::new();
while let Some((key, lsn, val)) = merge_iter.next().await? {
if cancel.is_cancelled() {
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
@@ -2115,27 +2076,14 @@ impl Timeline {
retention
.pipe_to(
*last_key,
&mut delta_values,
self,
&mut delta_layer_writer,
image_layer_writer.as_mut(),
&mut stat,
dry_run,
ctx,
)
.await?;
delta_layers.extend(
flush_deltas(
&mut delta_values,
*last_key,
&delta_split_points,
&mut current_delta_split_point,
self,
lowest_retain_lsn,
ctx,
&mut stat,
dry_run,
false,
)
.await?,
);
accumulated_values.clear();
*last_key = key;
accumulated_values.push((key, lsn, val));
@@ -2159,43 +2107,75 @@ impl Timeline {
retention
.pipe_to(
last_key,
&mut delta_values,
self,
&mut delta_layer_writer,
image_layer_writer.as_mut(),
&mut stat,
dry_run,
ctx,
)
.await?;
delta_layers.extend(
flush_deltas(
&mut delta_values,
last_key,
&delta_split_points,
&mut current_delta_split_point,
self,
lowest_retain_lsn,
ctx,
&mut stat,
dry_run,
true,
)
.await?,
);
assert!(delta_values.is_empty(), "unprocessed keys");
let image_layer = if discard_image_layer {
stat.discard_image_layer();
None
} else if let Some(writer) = image_layer_writer {
stat.produce_image_layer(writer.size());
let discard = |key: &PersistentLayerKey| {
let key = key.clone();
async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await }
};
let produced_image_layers = if let Some(writer) = image_layer_writer {
if !dry_run {
Some(writer.finish(self, ctx).await?)
writer
.finish_with_discard_fn(self, ctx, hack_end_key, discard)
.await?
} else {
None
let (layers, _) = writer.take()?;
assert!(layers.is_empty(), "image layers produced in dry run mode?");
Vec::new()
}
} else {
None
Vec::new()
};
let produced_delta_layers = if !dry_run {
delta_layer_writer
.finish_with_discard_fn(self, ctx, hack_end_key, discard)
.await?
} else {
let (layers, _) = delta_layer_writer.take()?;
assert!(layers.is_empty(), "delta layers produced in dry run mode?");
Vec::new()
};
let mut compact_to = Vec::new();
let mut keep_layers = HashSet::new();
let produced_delta_layers_len = produced_delta_layers.len();
let produced_image_layers_len = produced_image_layers.len();
for action in produced_delta_layers {
match action {
SplitWriterResult::Produced(layer) => {
stat.produce_delta_layer(layer.layer_desc().file_size());
compact_to.push(layer);
}
SplitWriterResult::Discarded(l) => {
keep_layers.insert(l);
stat.discard_delta_layer();
}
}
}
for action in produced_image_layers {
match action {
SplitWriterResult::Produced(layer) => {
stat.produce_image_layer(layer.layer_desc().file_size());
compact_to.push(layer);
}
SplitWriterResult::Discarded(l) => {
keep_layers.insert(l);
stat.discard_image_layer();
}
}
}
let mut layer_selection = layer_selection;
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
info!(
"gc-compaction statistics: {}",
serde_json::to_string(&stat)?
@@ -2206,28 +2186,11 @@ impl Timeline {
}
info!(
"produced {} delta layers and {} image layers",
delta_layers.len(),
if image_layer.is_some() { 1 } else { 0 }
"produced {} delta layers and {} image layers, {} layers are kept",
produced_delta_layers_len,
produced_image_layers_len,
layer_selection.len()
);
let mut compact_to = Vec::new();
let mut keep_layers = HashSet::new();
for action in delta_layers {
match action {
FlushDeltaResult::CreateResidentLayer(layer) => {
compact_to.push(layer);
}
FlushDeltaResult::KeepLayer(l) => {
keep_layers.insert(l);
}
}
}
if discard_image_layer {
keep_layers.insert(image_layer_key);
}
let mut layer_selection = layer_selection;
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
compact_to.extend(image_layer);
// Step 3: Place back to the layer map.
{

View File

@@ -395,7 +395,7 @@ impl DeleteTimelineFlow {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_shard_id),
tenant_shard_id,
Some(timeline_id),
"timeline_delete",
async move {

View File

@@ -60,7 +60,7 @@ impl Timeline {
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Eviction,
Some(self.tenant_shard_id),
self.tenant_shard_id,
Some(self.timeline_id),
&format!(
"layer eviction for {}/{}",

View File

@@ -27,8 +27,8 @@ use super::TaskStateUpdate;
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
pgdatadir_mapping::DatadirModification,
task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
walingest::WalIngest,
walrecord::DecodedWALRecord,
@@ -345,7 +345,10 @@ pub(super) async fn handle_walreceiver_connection(
// Commit every ingest_batch_size records. Even if we filtered out
// all records, we still need to call commit to advance the LSN.
uncommitted_records += 1;
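// Flush the batch early when the pending modifications grow too large in bytes, not just in record count.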
if uncommitted_records >= ingest_batch_size {
if uncommitted_records >= ingest_batch_size
|| modification.approx_pending_bytes()
> DatadirModification::MAX_PENDING_BYTES
{
WAL_INGEST
.records_committed
.inc_by(uncommitted_records - filtered_records);

View File

@@ -9,7 +9,7 @@ use utils::serde_percent::Percent;
use pageserver_api::models::PageserverUtilization;
use crate::{config::PageServerConf, tenant::mgr::TenantManager};
use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager};
pub(crate) fn regenerate(
conf: &PageServerConf,
@@ -58,13 +58,13 @@ pub(crate) fn regenerate(
disk_usable_pct,
shard_count,
max_shard_count: MAX_SHARDS,
utilization_score: 0,
utilization_score: None,
captured_at: utils::serde_system_time::SystemTime(captured_at),
};
doc.refresh_score();
// TODO: make utilization_score into a metric
// Initialize `PageserverUtilization::utilization_score`
let score = doc.cached_score();
NODE_UTILIZATION_SCORE.set(score);
Ok(doc)
}

View File

@@ -756,11 +756,23 @@ impl VirtualFile {
})
}
/// The function aborts the process if the error is fatal.
async fn write_at<B: IoBuf + Send>(
&self,
buf: FullSlice<B>,
offset: u64,
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
) -> (FullSlice<B>, Result<usize, Error>) {
let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
let result = result.maybe_fatal_err("write_at");
(slice, result)
}
async fn write_at_inner<B: IoBuf + Send>(
&self,
buf: FullSlice<B>,
offset: u64,
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
) -> (FullSlice<B>, Result<usize, Error>) {
let file_guard = match self.lock_file().await {
Ok(file_guard) => file_guard,

View File

@@ -110,7 +110,8 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
tag.rinfo = rinfo;
tag.forknum = forknum;
LWLockAcquire(relsize_lock, LW_SHARED);
/* We need exclusive lock here because of LRU list manipulation */
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
if (entry != NULL)
{

View File

@@ -2,6 +2,7 @@
import argparse
import enum
import os
import subprocess
import sys
from typing import List
@@ -93,7 +94,7 @@ if __name__ == "__main__":
"--no-color",
action="store_true",
help="disable colored output",
default=not sys.stdout.isatty(),
default=not sys.stdout.isatty() or os.getenv("TERM") == "dumb",
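# TERM=dumb indicates a terminal without color support (e.g. some editor-embedded shells)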
)
args = parser.parse_args()

View File

@@ -36,7 +36,7 @@ To play with it locally one may start proxy over a local postgres installation
```
If both postgres and proxy are running, you may send a SQL query:
```json
```console
curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
-H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \
-H 'Content-Type: application/json' \
@@ -44,7 +44,8 @@ curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
"query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num",
"params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}]
}' | jq
```
```json
{
"command": "SELECT",
"fields": [

View File

@@ -113,38 +113,36 @@ impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
impl UserFacingError for AuthError {
fn to_string_client(&self) -> String {
use AuthErrorImpl::*;
match self.0.as_ref() {
Link(e) => e.to_string_client(),
GetAuthInfo(e) => e.to_string_client(),
Sasl(e) => e.to_string_client(),
AuthFailed(_) => self.to_string(),
BadAuthMethod(_) => self.to_string(),
MalformedPassword(_) => self.to_string(),
MissingEndpointName => self.to_string(),
Io(_) => "Internal error".to_string(),
IpAddressNotAllowed(_) => self.to_string(),
TooManyConnections => self.to_string(),
UserTimeout(_) => self.to_string(),
AuthErrorImpl::Link(e) => e.to_string_client(),
AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(),
AuthErrorImpl::Sasl(e) => e.to_string_client(),
AuthErrorImpl::AuthFailed(_) => self.to_string(),
AuthErrorImpl::BadAuthMethod(_) => self.to_string(),
AuthErrorImpl::MalformedPassword(_) => self.to_string(),
AuthErrorImpl::MissingEndpointName => self.to_string(),
AuthErrorImpl::Io(_) => "Internal error".to_string(),
AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(),
AuthErrorImpl::TooManyConnections => self.to_string(),
AuthErrorImpl::UserTimeout(_) => self.to_string(),
}
}
}
impl ReportableError for AuthError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
use AuthErrorImpl::*;
match self.0.as_ref() {
Link(e) => e.get_error_kind(),
GetAuthInfo(e) => e.get_error_kind(),
Sasl(e) => e.get_error_kind(),
AuthFailed(_) => crate::error::ErrorKind::User,
BadAuthMethod(_) => crate::error::ErrorKind::User,
MalformedPassword(_) => crate::error::ErrorKind::User,
MissingEndpointName => crate::error::ErrorKind::User,
Io(_) => crate::error::ErrorKind::ClientDisconnect,
IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
TooManyConnections => crate::error::ErrorKind::RateLimit,
UserTimeout(_) => crate::error::ErrorKind::User,
AuthErrorImpl::Link(e) => e.get_error_kind(),
AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(),
AuthErrorImpl::Sasl(e) => e.get_error_kind(),
AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User,
AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User,
AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User,
AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User,
AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect,
AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit,
AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User,
}
}
}

View File

@@ -2,6 +2,7 @@ mod classic;
mod hacks;
pub mod jwt;
mod link;
pub mod local;
use std::net::IpAddr;
use std::sync::Arc;
@@ -9,6 +10,7 @@ use std::time::Duration;
use ipnet::{Ipv4Net, Ipv6Net};
pub use link::LinkAuthError;
use local::LocalBackend;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_postgres::config::AuthKeys;
use tracing::{info, warn};
@@ -68,6 +70,8 @@ pub enum BackendType<'a, T, D> {
Console(MaybeOwned<'a, ConsoleBackend>, T),
/// Authentication via a web browser.
Link(MaybeOwned<'a, url::ApiUrl>, D),
/// Local proxy uses configured auth credentials and does not wake compute
Local(MaybeOwned<'a, LocalBackend>),
}
pub trait TestBackend: Send + Sync + 'static {
@@ -80,9 +84,8 @@ pub trait TestBackend: Send + Sync + 'static {
impl std::fmt::Display for BackendType<'_, (), ()> {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use BackendType::*;
match self {
Console(api, _) => match &**api {
Self::Console(api, ()) => match &**api {
ConsoleBackend::Console(endpoint) => {
fmt.debug_tuple("Console").field(&endpoint.url()).finish()
}
@@ -93,7 +96,8 @@ impl std::fmt::Display for BackendType<'_, (), ()> {
#[cfg(test)]
ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
},
Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
Self::Link(url, ()) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
Self::Local(_) => fmt.debug_tuple("Local").finish(),
}
}
}
@@ -102,10 +106,10 @@ impl<T, D> BackendType<'_, T, D> {
/// Very similar to [`std::option::Option::as_ref`].
/// This helps us pass structured config to async tasks.
pub fn as_ref(&self) -> BackendType<'_, &T, &D> {
use BackendType::*;
match self {
Console(c, x) => Console(MaybeOwned::Borrowed(c), x),
Link(c, x) => Link(MaybeOwned::Borrowed(c), x),
Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x),
Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x),
Self::Local(l) => BackendType::Local(MaybeOwned::Borrowed(l)),
}
}
}
@@ -115,10 +119,10 @@ impl<'a, T, D> BackendType<'a, T, D> {
/// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
/// a function to a contained value.
pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> {
use BackendType::*;
match self {
Console(c, x) => Console(c, f(x)),
Link(c, x) => Link(c, x),
Self::Console(c, x) => BackendType::Console(c, f(x)),
Self::Link(c, x) => BackendType::Link(c, x),
Self::Local(l) => BackendType::Local(l),
}
}
}
@@ -126,10 +130,10 @@ impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
/// Very similar to [`std::option::Option::transpose`].
/// This is most useful for error handling.
pub fn transpose(self) -> Result<BackendType<'a, T, D>, E> {
use BackendType::*;
match self {
Console(c, x) => x.map(|x| Console(c, x)),
Link(c, x) => Ok(Link(c, x)),
Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)),
Self::Link(c, x) => Ok(BackendType::Link(c, x)),
Self::Local(l) => Ok(BackendType::Local(l)),
}
}
}
@@ -161,6 +165,7 @@ impl ComputeUserInfo {
pub enum ComputeCredentialKeys {
Password(Vec<u8>),
AuthKeys(AuthKeys),
None,
}
impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
@@ -293,7 +298,9 @@ async fn auth_quirks(
ctx.set_endpoint_id(res.info.endpoint.clone());
let password = match res.keys {
ComputeCredentialKeys::Password(p) => p,
_ => unreachable!("password hack should return a password"),
ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => {
unreachable!("password hack should return a password")
}
};
(res.info, Some(password))
}
@@ -317,21 +324,20 @@ async fn auth_quirks(
};
let (cached_entry, secret) = cached_secret.take_value();
let secret = match secret {
Some(secret) => config.check_rate_limit(
let secret = if let Some(secret) = secret {
config.check_rate_limit(
ctx,
config,
secret,
&info.endpoint,
unauthenticated_password.is_some() || allow_cleartext,
)?,
None => {
// If we don't have an authentication secret, we mock one to
// prevent malicious probing (possible due to missing protocol steps).
// This mocked secret will never lead to successful authentication.
info!("authentication info not found, mocking it");
AuthSecret::Scram(scram::ServerSecret::mock(rand::random()))
}
)?
} else {
// If we don't have an authentication secret, we mock one to
// prevent malicious probing (possible due to missing protocol steps).
// This mocked secret will never lead to successful authentication.
info!("authentication info not found, mocking it");
AuthSecret::Scram(scram::ServerSecret::mock(rand::random()))
};
match authenticate_with_secret(
@@ -400,21 +406,19 @@ async fn authenticate_with_secret(
impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
/// Get compute endpoint name from the credentials.
pub fn get_endpoint(&self) -> Option<EndpointId> {
use BackendType::*;
match self {
Console(_, user_info) => user_info.endpoint_id.clone(),
Link(_, _) => Some("link".into()),
Self::Console(_, user_info) => user_info.endpoint_id.clone(),
Self::Link(_, ()) => Some("link".into()),
Self::Local(_) => Some("local".into()),
}
}
/// Get username from the credentials.
pub fn get_user(&self) -> &str {
use BackendType::*;
match self {
Console(_, user_info) => &user_info.user,
Link(_, _) => "link",
Self::Console(_, user_info) => &user_info.user,
Self::Link(_, ()) => "link",
Self::Local(_) => "local",
}
}
@@ -428,10 +432,8 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
config: &'static AuthenticationConfig,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
use BackendType::*;
let res = match self {
Console(api, user_info) => {
Self::Console(api, user_info) => {
info!(
user = &*user_info.user,
project = user_info.endpoint(),
@@ -451,13 +453,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
BackendType::Console(api, credentials)
}
// NOTE: this auth backend doesn't use client credentials.
Link(url, _) => {
Self::Link(url, ()) => {
info!("performing link authentication");
let info = link::authenticate(ctx, &url, client).await?;
BackendType::Link(url, info)
}
Self::Local(_) => {
return Err(auth::AuthError::bad_auth_method("invalid for local proxy"))
}
};
info!("user successfully authenticated");
@@ -470,10 +475,10 @@ impl BackendType<'_, ComputeUserInfo, &()> {
&self,
ctx: &RequestMonitoring,
) -> Result<CachedRoleSecret, GetAuthInfoError> {
use BackendType::*;
match self {
Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
Link(_, _) => Ok(Cached::new_uncached(None)),
Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
Self::Link(_, ()) => Ok(Cached::new_uncached(None)),
Self::Local(_) => Ok(Cached::new_uncached(None)),
}
}
@@ -481,10 +486,10 @@ impl BackendType<'_, ComputeUserInfo, &()> {
&self,
ctx: &RequestMonitoring,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
use BackendType::*;
match self {
Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
Self::Link(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
}
}
}
@@ -495,18 +500,18 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
&self,
ctx: &RequestMonitoring,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
use BackendType::*;
match self {
Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
Link(_, info) => Ok(Cached::new_uncached(info.clone())),
Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())),
Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
}
}
fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
fn get_keys(&self) -> &ComputeCredentialKeys {
match self {
BackendType::Console(_, creds) => Some(&creds.keys),
BackendType::Link(_, _) => None,
Self::Console(_, creds) => &creds.keys,
Self::Link(_, _) => &ComputeCredentialKeys::None,
Self::Local(_) => &ComputeCredentialKeys::None,
}
}
}
@@ -517,18 +522,18 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
&self,
ctx: &RequestMonitoring,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
use BackendType::*;
match self {
Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
Self::Link(_, ()) => unreachable!("link auth flow doesn't support waking the compute"),
Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
}
}
fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
fn get_keys(&self) -> &ComputeCredentialKeys {
match self {
BackendType::Console(_, creds) => Some(&creds.keys),
BackendType::Link(_, _) => None,
Self::Console(_, creds) => &creds.keys,
Self::Link(_, ()) => &ComputeCredentialKeys::None,
Self::Local(_) => &ComputeCredentialKeys::None,
}
}
}

View File

@@ -1,15 +1,21 @@
use std::{future::Future, sync::Arc, time::Duration};
use std::{
future::Future,
sync::Arc,
time::{Duration, SystemTime},
};
use anyhow::{bail, ensure, Context};
use arc_swap::ArcSwapOption;
use dashmap::DashMap;
use jose_jwk::crypto::KeyInfo;
use serde::{Deserialize, Deserializer};
use signature::Verifier;
use tokio::time::Instant;
use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt};
use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName};
// TODO(conrad): make these configurable.
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
const MIN_RENEW: Duration = Duration::from_secs(30);
const AUTO_RENEW: Duration = Duration::from_secs(300);
const MAX_RENEW: Duration = Duration::from_secs(3600);
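// Renewal policy used below: an unknown key id forces a refetch at most once per MIN_RENEW,
// a background refresh is spawned roughly every AUTO_RENEW, and a blocking refresh happens
// once the cached keys are older than MAX_RENEW.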
@@ -17,30 +23,56 @@ const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
/// How to get the JWT auth rules
pub trait FetchAuthRules: Clone + Send + Sync + 'static {
fn fetch_auth_rules(&self) -> impl Future<Output = anyhow::Result<AuthRules>> + Send;
fn fetch_auth_rules(
&self,
role_name: RoleName,
) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
}
#[derive(Clone)]
struct FetchAuthRulesFromCplane {
#[allow(dead_code)]
endpoint: EndpointIdInt,
}
impl FetchAuthRules for FetchAuthRulesFromCplane {
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
Err(anyhow::anyhow!("not yet implemented"))
}
}
pub struct AuthRules {
jwks_urls: Vec<url::Url>,
pub struct AuthRule {
pub id: String,
pub jwks_url: url::Url,
pub audience: Option<String>,
}
#[derive(Default)]
pub struct JwkCache {
client: reqwest::Client,
map: DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
}
pub struct JwkCacheEntry {
/// Should refetch at least every hour to verify when old keys have been removed.
/// Should refetch when new key IDs are seen only every 5 minutes or so
last_retrieved: Instant,
/// cplane will return multiple JWKs urls that we need to scrape.
key_sets: ahash::HashMap<String, KeySet>,
}
impl JwkCacheEntry {
fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
self.key_sets.values().find_map(|key_set| {
key_set
.find_key(key_id)
.map(|jwk| (jwk, key_set.audience.as_deref()))
})
}
}
struct KeySet {
jwks: jose_jwk::JwkSet,
audience: Option<String>,
}
impl KeySet {
fn find_key(&self, key_id: &str) -> Option<&jose_jwk::Jwk> {
self.jwks
.keys
.iter()
.find(|jwk| jwk.prm.kid.as_deref() == Some(key_id))
}
}
pub struct JwkCacheEntryLock {
@@ -57,15 +89,6 @@ impl Default for JwkCacheEntryLock {
}
}
pub struct JwkCacheEntry {
/// Should refetch at least every hour to verify when old keys have been removed.
/// Should refetch when new key IDs are seen only every 5 minutes or so
last_retrieved: Instant,
/// cplane will return multiple JWKs urls that we need to scrape.
key_sets: ahash::HashMap<url::Url, jose_jwk::JwkSet>,
}
impl JwkCacheEntryLock {
async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
JwkRenewalPermit::acquire_permit(self).await
@@ -79,6 +102,7 @@ impl JwkCacheEntryLock {
&self,
_permit: JwkRenewalPermit<'_>,
client: &reqwest::Client,
role_name: RoleName,
auth_rules: &F,
) -> anyhow::Result<Arc<JwkCacheEntry>> {
// double check that no one beat us to updating the cache.
@@ -91,20 +115,19 @@ impl JwkCacheEntryLock {
}
}
let rules = auth_rules.fetch_auth_rules().await?;
let mut key_sets = ahash::HashMap::with_capacity_and_hasher(
rules.jwks_urls.len(),
ahash::RandomState::new(),
);
let rules = auth_rules.fetch_auth_rules(role_name).await?;
let mut key_sets =
ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
// TODO(conrad): run concurrently
// TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
for url in rules.jwks_urls {
let req = client.get(url.clone());
for rule in rules {
let req = client.get(rule.jwks_url.clone());
// TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
// TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
match req.send().await.and_then(|r| r.error_for_status()) {
// todo: should we re-insert JWKs if we want to keep this JWKs URL?
// I expect these failures would be quite sparse.
Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"),
Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"),
Ok(r) => {
let resp: http::Response<reqwest::Body> = r.into();
match parse_json_body_with_limit::<jose_jwk::JwkSet>(
@@ -113,9 +136,17 @@ impl JwkCacheEntryLock {
)
.await
{
Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"),
Err(e) => {
tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
}
Ok(jwks) => {
key_sets.insert(url, jwks);
key_sets.insert(
rule.id,
KeySet {
jwks,
audience: rule.audience,
},
);
}
}
}
@@ -133,7 +164,9 @@ impl JwkCacheEntryLock {
async fn get_or_update_jwk_cache<F: FetchAuthRules>(
self: &Arc<Self>,
ctx: &RequestMonitoring,
client: &reqwest::Client,
role_name: RoleName,
fetch: &F,
) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
let now = Instant::now();
@@ -141,18 +174,20 @@ impl JwkCacheEntryLock {
// if we have no cached JWKs, try and get some
let Some(cached) = guard else {
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let permit = self.acquire_permit().await;
return self.renew_jwks(permit, client, fetch).await;
return self.renew_jwks(permit, client, role_name, fetch).await;
};
let last_update = now.duration_since(cached.last_retrieved);
// check if the cached JWKs need updating.
if last_update > MAX_RENEW {
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let permit = self.acquire_permit().await;
// it's been too long since we checked the keys. wait for them to update.
return self.renew_jwks(permit, client, fetch).await;
return self.renew_jwks(permit, client, role_name, fetch).await;
}
// every 5 minutes we should spawn a job to eagerly update the token.
@@ -164,7 +199,7 @@ impl JwkCacheEntryLock {
let client = client.clone();
let fetch = fetch.clone();
tokio::spawn(async move {
if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await {
if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await {
tracing::warn!(error=?e, "could not fetch JWKs in background job");
}
});
@@ -178,8 +213,10 @@ impl JwkCacheEntryLock {
async fn check_jwt<F: FetchAuthRules>(
self: &Arc<Self>,
jwt: String,
ctx: &RequestMonitoring,
jwt: &str,
client: &reqwest::Client,
role_name: RoleName,
fetch: &F,
) -> Result<(), anyhow::Error> {
// JWT compact form is defined to be
@@ -187,38 +224,38 @@ impl JwkCacheEntryLock {
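// <B64(Header)> || . || <B64(Payload)> || . || <B64(Signature)>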
// where Signature = alg(<B64(Header)> || . || <B64(Payload)>);
let (header_payload, signature) = jwt
.rsplit_once(".")
.rsplit_once('.')
.context("Provided authentication token is not a valid JWT encoding")?;
let (header, _payload) = header_payload
.split_once(".")
let (header, payload) = header_payload
.split_once('.')
.context("Provided authentication token is not a valid JWT encoding")?;
let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
.context("Provided authentication token is not a valid JWT encoding")?;
let header = serde_json::from_slice::<JWTHeader>(&header)
let header = serde_json::from_slice::<JwtHeader<'_>>(&header)
.context("Provided authentication token is not a valid JWT encoding")?;
let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
.context("Provided authentication token is not a valid JWT encoding")?;
ensure!(header.typ == "JWT");
let kid = header.kid.context("missing key id")?;
let kid = header.key_id.context("missing key id")?;
let mut guard = self.get_or_update_jwk_cache(client, fetch).await?;
let mut guard = self
.get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch)
.await?;
// get the key from the JWKs if possible. If not, wait for the keys to update.
let jwk = loop {
let jwk = guard
.key_sets
.values()
.flat_map(|jwks| &jwks.keys)
.find(|jwk| jwk.prm.kid.as_deref() == Some(kid));
match jwk {
let (jwk, expected_audience) = loop {
match guard.find_jwk_and_audience(kid) {
Some(jwk) => break jwk,
None if guard.last_retrieved.elapsed() > MIN_RENEW => {
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let permit = self.acquire_permit().await;
guard = self.renew_jwks(permit, client, fetch).await?;
guard = self
.renew_jwks(permit, client, role_name.clone(), fetch)
.await?;
}
_ => {
bail!("jwk not found");
@@ -227,7 +264,7 @@ impl JwkCacheEntryLock {
};
ensure!(
jwk.is_supported(&header.alg),
jwk.is_supported(&header.algorithm),
"signature algorithm not supported"
);
@@ -241,31 +278,57 @@ impl JwkCacheEntryLock {
key => bail!("unsupported key type {key:?}"),
};
// TODO(conrad): verify iss, exp, nbf, etc...
let payload = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)
.context("Provided authentication token is not a valid JWT encoding")?;
let payload = serde_json::from_slice::<JwtPayload<'_>>(&payload)
.context("Provided authentication token is not a valid JWT encoding")?;
tracing::debug!(?payload, "JWT signature valid with claims");
match (expected_audience, payload.audience) {
// check the audience matches
(Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"),
// the audience is expected but is missing
(Some(_), None) => bail!("invalid JWT token audience"),
// we don't care for the audience field
(None, _) => {}
}
let now = SystemTime::now();
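// Both checks below allow CLOCK_SKEW_LEEWAY of clock skew between us and the token issuer.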
if let Some(exp) = payload.expiration {
ensure!(now < exp + CLOCK_SKEW_LEEWAY);
}
if let Some(nbf) = payload.not_before {
ensure!(nbf < now + CLOCK_SKEW_LEEWAY);
}
Ok(())
}
}
impl JwkCache {
pub async fn check_jwt(
pub async fn check_jwt<F: FetchAuthRules>(
&self,
endpoint: EndpointIdInt,
jwt: String,
ctx: &RequestMonitoring,
endpoint: EndpointId,
role_name: RoleName,
fetch: &F,
jwt: &str,
) -> Result<(), anyhow::Error> {
// try with just a read lock first
let entry = self.map.get(&endpoint).as_deref().map(Arc::clone);
let entry = match entry {
Some(entry) => entry,
None => {
// acquire a write lock after to insert.
let entry = self.map.entry(endpoint).or_default();
Arc::clone(&*entry)
}
};
let key = (endpoint, role_name.clone());
let entry = self.map.get(&key).as_deref().map(Arc::clone);
let entry = entry.unwrap_or_else(|| {
// acquire a write lock after to insert.
let entry = self.map.entry(key).or_default();
Arc::clone(&*entry)
});
let fetch = FetchAuthRulesFromCplane { endpoint };
entry.check_jwt(jwt, &self.client, &fetch).await
entry
.check_jwt(ctx, jwt, &self.client, role_name, fetch)
.await
}
}
@@ -315,13 +378,49 @@ fn verify_rsa_signature(
/// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
#[derive(serde::Deserialize, serde::Serialize)]
struct JWTHeader<'a> {
struct JwtHeader<'a> {
/// must be "JWT"
#[serde(rename = "typ")]
typ: &'a str,
/// must be a supported alg
alg: jose_jwa::Algorithm,
#[serde(rename = "alg")]
algorithm: jose_jwa::Algorithm,
/// key id, must be provided for our usecase
kid: Option<&'a str>,
#[serde(rename = "kid")]
key_id: Option<&'a str>,
}
/// <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1>
#[derive(serde::Deserialize, serde::Serialize, Debug)]
struct JwtPayload<'a> {
/// Audience - Recipient for which the JWT is intended
#[serde(rename = "aud")]
audience: Option<&'a str>,
/// Expiration - Time after which the JWT expires
#[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)]
expiration: Option<SystemTime>,
/// Not before - Time before which the JWT must not be accepted
#[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)]
not_before: Option<SystemTime>,
// the following entries are only extracted for the sake of debug logging.
/// Issuer of the JWT
#[serde(rename = "iss")]
issuer: Option<&'a str>,
/// Subject of the JWT (the user)
#[serde(rename = "sub")]
subject: Option<&'a str>,
/// Unique token identifier
#[serde(rename = "jti")]
jwt_id: Option<&'a str>,
/// Unique session identifier
#[serde(rename = "sid")]
session_id: Option<&'a str>,
}
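// JWT NumericDate claims are seconds since the Unix epoch, e.g. an illustrative exp = 1724889600
// maps to UNIX_EPOCH + 1724889600s.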
fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result<Option<SystemTime>, D::Error> {
let d = <Option<u64>>::deserialize(d)?;
Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n)))
}
struct JwkRenewalPermit<'a> {
@@ -340,7 +439,7 @@ impl JwkRenewalPermit<'_> {
}
}
async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit {
async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit<'_> {
match from.lookup.acquire().await {
Ok(permit) => {
permit.forget();
@@ -352,7 +451,7 @@ impl JwkRenewalPermit<'_> {
}
}
fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit> {
fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit<'_>> {
match from.lookup.try_acquire() {
Ok(permit) => {
permit.forget();
@@ -388,6 +487,8 @@ impl Drop for JwkRenewalPermit<'_> {
#[cfg(test)]
mod tests {
use crate::RoleName;
use super::*;
use std::{future::IntoFuture, net::SocketAddr, time::SystemTime};
@@ -431,10 +532,10 @@ mod tests {
}
fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
let header = JWTHeader {
let header = JwtHeader {
typ: "JWT",
alg: jose_jwa::Algorithm::Signing(sig),
kid: Some(&kid),
algorithm: jose_jwa::Algorithm::Signing(sig),
key_id: Some(&kid),
};
let body = typed_json::json! {{
"exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
@@ -524,33 +625,40 @@ mod tests {
struct Fetch(SocketAddr);
impl FetchAuthRules for Fetch {
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
Ok(AuthRules {
jwks_urls: vec![
format!("http://{}/foo", self.0).parse().unwrap(),
format!("http://{}/bar", self.0).parse().unwrap(),
],
})
async fn fetch_auth_rules(
&self,
_role_name: RoleName,
) -> anyhow::Result<Vec<AuthRule>> {
Ok(vec![
AuthRule {
id: "foo".to_owned(),
jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
audience: None,
},
AuthRule {
id: "bar".to_owned(),
jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
audience: None,
},
])
}
}
let role_name = RoleName::from("user");
let jwk_cache = Arc::new(JwkCacheEntryLock::default());
jwk_cache
.check_jwt(jwt1, &client, &Fetch(addr))
.await
.unwrap();
jwk_cache
.check_jwt(jwt2, &client, &Fetch(addr))
.await
.unwrap();
jwk_cache
.check_jwt(jwt3, &client, &Fetch(addr))
.await
.unwrap();
jwk_cache
.check_jwt(jwt4, &client, &Fetch(addr))
.await
.unwrap();
for token in [jwt1, jwt2, jwt3, jwt4] {
jwk_cache
.check_jwt(
&RequestMonitoring::test(),
&token,
&client,
role_name.clone(),
&Fetch(addr),
)
.await
.unwrap();
}
}
}

View File

@@ -0,0 +1,79 @@
use std::{collections::HashMap, net::SocketAddr};
use anyhow::Context;
use arc_swap::ArcSwapOption;
use crate::{
compute::ConnCfg,
console::{
messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo},
NodeInfo,
},
intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
RoleName,
};
use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
pub struct LocalBackend {
pub jwks_cache: JwkCache,
pub postgres_addr: SocketAddr,
pub node_info: NodeInfo,
}
impl LocalBackend {
pub fn new(postgres_addr: SocketAddr) -> Self {
LocalBackend {
jwks_cache: JwkCache::default(),
postgres_addr,
node_info: NodeInfo {
config: {
let mut cfg = ConnCfg::new();
cfg.host(&postgres_addr.ip().to_string());
cfg.port(postgres_addr.port());
cfg
},
// TODO(conrad): make this better reflect compute info rather than endpoint info.
aux: MetricsAuxInfo {
endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"),
project_id: ProjectIdTag::get_interner().get_or_intern("local"),
branch_id: BranchIdTag::get_interner().get_or_intern("local"),
cold_start_info: ColdStartInfo::WarmCached,
},
allow_self_signed_compute: false,
},
}
}
}
#[derive(Clone, Copy)]
pub struct StaticAuthRules;
pub static JWKS_ROLE_MAP: ArcSwapOption<JwksRoleSettings> = ArcSwapOption::const_empty();
#[derive(Debug, Clone)]
pub struct JwksRoleSettings {
pub roles: HashMap<RoleName, EndpointJwksResponse>,
pub project_id: ProjectIdInt,
pub branch_id: BranchIdInt,
}
impl FetchAuthRules for StaticAuthRules {
async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result<Vec<AuthRule>> {
let mappings = JWKS_ROLE_MAP.load();
let role_mappings = mappings
.as_deref()
.and_then(|m| m.roles.get(&role_name))
.context("JWKs settings for this role were not configured")?;
let mut rules = vec![];
for setting in &role_mappings.jwks {
rules.push(AuthRule {
id: setting.id.clone(),
jwks_url: setting.jwks_url.clone(),
audience: setting.jwt_audience.clone(),
});
}
Ok(rules)
}
}

View File

@@ -89,10 +89,12 @@ impl ComputeUserInfoMaybeEndpoint {
sni: Option<&str>,
common_names: Option<&HashSet<String>>,
) -> Result<Self, ComputeUserInfoParseError> {
use ComputeUserInfoParseError::*;
// Some parameters are stored in the startup message.
let get_param = |key| params.get(key).ok_or(MissingKey(key));
let get_param = |key| {
params
.get(key)
.ok_or(ComputeUserInfoParseError::MissingKey(key))
};
let user: RoleName = get_param("user")?.into();
// Project name might be passed via PG's command-line options.
@@ -122,12 +124,18 @@ impl ComputeUserInfoMaybeEndpoint {
let endpoint = match (endpoint_option, endpoint_from_domain) {
// Invariant: if we have both project name variants, they should match.
(Some(option), Some(domain)) if option != domain => {
Some(Err(InconsistentProjectNames { domain, option }))
Some(Err(ComputeUserInfoParseError::InconsistentProjectNames {
domain,
option,
}))
}
// Invariant: project name may not contain certain characters.
(a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
false => Err(MalformedProjectName(name)),
true => Ok(name),
(a, b) => a.or(b).map(|name| {
if project_name_valid(name.as_ref()) {
Ok(name)
} else {
Err(ComputeUserInfoParseError::MalformedProjectName(name))
}
}),
}
.transpose()?;
@@ -186,7 +194,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern {
impl<'de> serde::de::Visitor<'de> for StrVisitor {
type Value = IpPattern;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask")
}

View File

@@ -0,0 +1,316 @@
use std::{
net::SocketAddr,
path::{Path, PathBuf},
pin::pin,
sync::Arc,
time::Duration,
};
use anyhow::{bail, ensure};
use dashmap::DashMap;
use futures::{future::Either, FutureExt};
use proxy::{
auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP},
cancellation::CancellationHandlerMain,
config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig},
console::{locks::ApiLocks, messages::JwksRoleMapping},
http::health_server::AppMetrics,
metrics::{Metrics, ThreadPoolMetrics},
rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo},
scram::threadpool::ThreadPool,
serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions},
};
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
use clap::Parser;
use tokio::{net::TcpListener, task::JoinSet};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn};
use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
/// Neon proxy/router
#[derive(Parser)]
#[command(version = GIT_VERSION, about)]
struct LocalProxyCliArgs {
/// listen for incoming metrics connections on ip:port
#[clap(long, default_value = "127.0.0.1:7001")]
metrics: String,
/// listen for incoming http connections on ip:port
#[clap(long)]
http: String,
/// timeout for the TLS handshake
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
handshake_timeout: tokio::time::Duration,
/// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
connect_compute_lock: String,
#[clap(flatten)]
sql_over_http: SqlOverHttpArgs,
/// User rate limiter max number of requests per second.
///
/// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
/// Can be given multiple times for different bucket sizes.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
user_rps_limit: Vec<RateBucketInfo>,
/// Whether the auth rate limiter actually takes effect (for testing)
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
auth_rate_limit_enabled: bool,
/// Authentication rate limiter max number of hashes per second.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
auth_rate_limit: Vec<RateBucketInfo>,
/// The IP subnet size to use when deciding whether two IP addresses are considered the same.
#[clap(long, default_value_t = 64)]
auth_rate_limit_ip_subnet: u8,
/// Whether to retry the connection to the compute node
#[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
connect_to_compute_retry: String,
/// Address of the postgres server
#[clap(long, default_value = "127.0.0.1:5432")]
compute: SocketAddr,
/// Path to the local proxy config file
#[clap(long, default_value = "./localproxy.json")]
config_path: PathBuf,
}
#[derive(clap::Args, Clone, Copy, Debug)]
struct SqlOverHttpArgs {
/// How many connections to pool for each endpoint. Excess connections are discarded
#[clap(long, default_value_t = 200)]
sql_over_http_pool_max_total_conns: usize,
/// How long pooled connections should remain idle for before closing
#[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
sql_over_http_idle_timeout: tokio::time::Duration,
#[clap(long, default_value_t = 100)]
sql_over_http_client_conn_threshold: u64,
#[clap(long, default_value_t = 16)]
sql_over_http_cancel_set_shards: usize,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let _logging_guard = proxy::logging::init().await?;
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
info!("Version: {GIT_VERSION}");
info!("Build_tag: {BUILD_TAG}");
let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
revision: GIT_VERSION,
build_tag: BUILD_TAG,
});
let jemalloc = match proxy::jemalloc::MetricRecorder::new() {
Ok(t) => Some(t),
Err(e) => {
tracing::error!(error = ?e, "could not start jemalloc metrics loop");
None
}
};
let args = LocalProxyCliArgs::parse();
let config = build_config(&args)?;
let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?;
let http_listener = TcpListener::bind(args.http).await?;
let shutdown = CancellationToken::new();
// todo: should scale with CU
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
LeakyBucketConfig {
rps: 10.0,
max: 100.0,
},
16,
));
refresh_config(args.config_path.clone()).await;
let mut maintenance_tasks = JoinSet::new();
maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || {
refresh_config(args.config_path.clone()).map(Ok)
}));
maintenance_tasks.spawn(proxy::http::health_server::task_main(
metrics_listener,
AppMetrics {
jemalloc,
neon_metrics,
proxy: proxy::metrics::Metrics::get(),
},
));
let task = serverless::task_main(
config,
http_listener,
shutdown.clone(),
Arc::new(CancellationHandlerMain::new(
Arc::new(DashMap::new()),
None,
proxy::metrics::CancellationSource::Local,
)),
endpoint_rate_limiter,
);
match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await {
// exit immediately on maintenance task completion
Either::Left((Some(res), _)) => match proxy::flatten_err(res)? {},
// exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
// exit immediately on client task error
Either::Right((res, _)) => res?,
}
Ok(())
}
/// ProxyConfig is created at proxy startup, and lives forever.
fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let config::ConcurrencyLockOptions {
shards,
limiter,
epoch,
timeout,
} = args.connect_compute_lock.parse()?;
info!(
?limiter,
shards,
?epoch,
"Using NodeLocks (connect_compute)"
);
let connect_compute_locks = ApiLocks::new(
"connect_compute_lock",
limiter,
shards,
timeout,
epoch,
&Metrics::get().proxy.connect_compute_lock,
)?;
let http_config = HttpConfig {
accept_websockets: false,
pool_options: GlobalConnPoolOptions {
gc_epoch: Duration::from_secs(60),
pool_shards: 2,
idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
opt_in: false,
max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns,
max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
},
cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
};
Ok(Box::leak(Box::new(ProxyConfig {
tls_config: None,
auth_backend: proxy::auth::BackendType::Local(proxy::auth::backend::MaybeOwned::Owned(
LocalBackend::new(args.compute),
)),
metric_collection: None,
allow_self_signed_compute: false,
http_config,
authentication_config: AuthenticationConfig {
thread_pool: ThreadPool::new(0),
scram_protocol_timeout: Duration::from_secs(10),
rate_limiter_enabled: false,
rate_limiter: BucketRateLimiter::new(vec![]),
rate_limit_ip_subnet: 64,
},
require_client_ip: false,
handshake_timeout: Duration::from_secs(10),
region: "local".into(),
wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
connect_compute_locks,
connect_to_compute_retry_config: RetryConfig::parse(
RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES,
)?,
})))
}
async fn refresh_config(path: PathBuf) {
match refresh_config_inner(&path).await {
Ok(()) => {}
Err(e) => {
error!(error=?e, ?path, "could not read config file");
}
}
}
async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> {
let bytes = tokio::fs::read(&path).await?;
let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?;
let mut settings = None;
for mapping in data.roles.values_mut() {
for jwks in &mut mapping.jwks {
ensure!(
jwks.jwks_url.has_authority()
&& (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"),
"Invalid JWKS url. Must be HTTP",
);
ensure!(
jwks.jwks_url
.host()
.is_some_and(|h| h != url::Host::Domain("")),
"Invalid JWKS url. No domain listed",
);
// clear username and password (the port is handled separately below)
jwks.jwks_url.set_username("").expect(
"url can be a base and has a valid host and is not a file. should not error",
);
jwks.jwks_url.set_password(None).expect(
"url can be a base and has a valid host and is not a file. should not error",
);
// local testing is hard if we need to have a specific restricted port
if cfg!(not(feature = "testing")) {
jwks.jwks_url.set_port(None).expect(
"url can be a base and has a valid host and is not a file. should not error",
);
}
// clear fragment and query params
jwks.jwks_url.set_fragment(None);
jwks.jwks_url.query_pairs_mut().clear().finish();
if jwks.jwks_url.scheme() != "https" {
// local testing is hard if we need to set up https support.
if cfg!(not(feature = "testing")) {
jwks.jwks_url
.set_scheme("https")
.expect("should not error to set the scheme to https if it was http");
} else {
warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS");
}
}
let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id));
ensure!(
*pr == jwks.project_id,
"inconsistent project IDs configured"
);
ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured");
}
}
if let Some((project_id, branch_id)) = settings {
JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings {
roles: data.roles,
project_id,
branch_id,
})));
}
Ok(())
}
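For reference, a `localproxy.json` that deserializes into the `JwksRoleMapping`/`EndpointJwksResponse`/`JwksSettings` types added further down in this change might look roughly like the sketch below. The role name and all values are purely illustrative, and the exact JSON representation of the interned `project_id`/`branch_id` fields is an assumption here (shown as plain strings).
```json
{
  "roles": {
    "authenticated": {
      "jwks": [
        {
          "id": "foo",
          "project_id": "my-project",
          "branch_id": "my-branch",
          "jwks_url": "https://auth.example.com/.well-known/jwks.json",
          "provider_name": "example",
          "jwt_audience": null
        }
      ]
    }
  }
}
```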

View File

@@ -133,7 +133,9 @@ async fn main() -> anyhow::Result<()> {
proxy_listener,
cancellation_token.clone(),
));
let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token));
let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async {
Ok(())
}));
// the signal task can't ever succeed.
// the main task can error, or can succeed on cancellation.

View File

@@ -148,7 +148,7 @@ struct ProxyCliArgs {
disable_dynamic_rate_limiter: bool,
/// Endpoint rate limiter max number of requests per second.
///
/// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
/// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
/// Can be given multiple times for different bucket sizes.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
endpoint_rps_limit: Vec<RateBucketInfo>,
@@ -173,9 +173,6 @@ struct ProxyCliArgs {
/// cache for `role_secret` (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
role_secret_cache: String,
/// disable ip check for http requests. If it is too time consuming, it could be turned off.
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
disable_ip_check_for_http: bool,
/// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
#[clap(long)]
redis_notifications: Option<String>,
@@ -450,7 +447,10 @@ async fn main() -> anyhow::Result<()> {
// maintenance tasks. these never return unless there's an error
let mut maintenance_tasks = JoinSet::new();
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
maintenance_tasks.spawn(proxy::handle_signals(
cancellation_token.clone(),
|| async { Ok(()) },
));
maintenance_tasks.spawn(http::health_server::task_main(
http_listener,
AppMetrics {
@@ -661,6 +661,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
)?;
let http_config = HttpConfig {
accept_websockets: true,
pool_options: GlobalConnPoolOptions {
max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,

View File

@@ -24,7 +24,7 @@ impl<C: Cache> Cache for &C {
type LookupInfo<Key> = C::LookupInfo<Key>;
fn invalidate(&self, info: &Self::LookupInfo<Self::Key>) {
C::invalidate(self, info)
C::invalidate(self, info);
}
}

View File

@@ -274,13 +274,13 @@ impl ProjectInfoCacheImpl {
let ttl_disabled_since_us = self
.ttl_disabled_since_us
.load(std::sync::atomic::Ordering::Relaxed);
let ignore_cache_since = if ttl_disabled_since_us != u64::MAX {
let ignore_cache_since = if ttl_disabled_since_us == u64::MAX {
None
} else {
let ignore_cache_since = self.start_time + Duration::from_micros(ttl_disabled_since_us);
// We are fine if the entry is not older than the TTL, or was added before we started getting notifications.
valid_since = valid_since.min(ignore_cache_since);
Some(ignore_cache_since)
} else {
None
};
(valid_since, ignore_cache_since)
}
@@ -306,7 +306,7 @@ impl ProjectInfoCacheImpl {
let mut removed = 0;
let shard = self.project2ep.shards()[shard].write();
for (_, endpoints) in shard.iter() {
for endpoint in endpoints.get().iter() {
for endpoint in endpoints.get() {
self.cache.remove(endpoint);
removed += 1;
}

View File

@@ -58,7 +58,7 @@ impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
type LookupInfo<Key> = LookupInfo<Key>;
fn invalidate(&self, info: &Self::LookupInfo<K>) {
self.invalidate_raw(info)
self.invalidate_raw(info);
}
}

View File

@@ -220,7 +220,8 @@ mod tests {
#[tokio::test]
async fn cancel_session_noop_regression() {
let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local);
let handler =
CancellationHandler::<()>::new(CancelMap::default(), CancellationSource::Local);
handler
.cancel_session(
CancelKeyData {

View File

@@ -44,11 +44,10 @@ pub enum ConnectionError {
impl UserFacingError for ConnectionError {
fn to_string_client(&self) -> String {
use ConnectionError::*;
match self {
// This helps us drop irrelevant library-specific prefixes.
// TODO: propagate severity level and other parameters.
Postgres(err) => match err.as_db_error() {
ConnectionError::Postgres(err) => match err.as_db_error() {
Some(err) => {
let msg = err.message();
@@ -62,8 +61,8 @@ impl UserFacingError for ConnectionError {
}
None => err.to_string(),
},
WakeComputeError(err) => err.to_string_client(),
TooManyConnectionAttempts(_) => {
ConnectionError::WakeComputeError(err) => err.to_string_client(),
ConnectionError::TooManyConnectionAttempts(_) => {
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
}
_ => COULD_NOT_CONNECT.to_owned(),
@@ -287,7 +286,7 @@ impl ConnCfg {
let client_config = if allow_self_signed_compute {
// Allow all certificates for creating the connection
let verifier = Arc::new(AcceptEverythingVerifier) as Arc<dyn ServerCertVerifier>;
let verifier = Arc::new(AcceptEverythingVerifier);
rustls::ClientConfig::builder()
.dangerous()
.with_custom_certificate_verifier(verifier)
@@ -366,16 +365,16 @@ static TLS_ROOTS: OnceCell<Arc<rustls::RootCertStore>> = OnceCell::new();
struct AcceptEverythingVerifier;
impl ServerCertVerifier for AcceptEverythingVerifier {
fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
use rustls::SignatureScheme::*;
use rustls::SignatureScheme;
// The schemes for which `SignatureScheme::supported_in_tls13` returns true.
vec![
ECDSA_NISTP521_SHA512,
ECDSA_NISTP384_SHA384,
ECDSA_NISTP256_SHA256,
RSA_PSS_SHA512,
RSA_PSS_SHA384,
RSA_PSS_SHA256,
ED25519,
SignatureScheme::ECDSA_NISTP521_SHA512,
SignatureScheme::ECDSA_NISTP384_SHA384,
SignatureScheme::ECDSA_NISTP256_SHA256,
SignatureScheme::RSA_PSS_SHA512,
SignatureScheme::RSA_PSS_SHA384,
SignatureScheme::RSA_PSS_SHA256,
SignatureScheme::ED25519,
]
}
fn verify_server_cert(

View File

@@ -52,6 +52,7 @@ pub struct TlsConfig {
}
pub struct HttpConfig {
pub accept_websockets: bool,
pub pool_options: GlobalConnPoolOptions,
pub cancel_set: CancelSet,
pub client_conn_threshold: u64,
@@ -155,7 +156,7 @@ pub enum TlsServerEndPoint {
}
impl TlsServerEndPoint {
pub fn new(cert: &CertificateDer) -> anyhow::Result<Self> {
pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result<Self> {
let sha256_oids = [
// I'm explicitly not adding MD5 or SHA1 here... They're bad.
oid_registry::OID_SIG_ECDSA_WITH_SHA256,
@@ -278,7 +279,7 @@ impl CertResolver {
impl rustls::server::ResolvesServerCert for CertResolver {
fn resolve(
&self,
client_hello: rustls::server::ClientHello,
client_hello: rustls::server::ClientHello<'_>,
) -> Option<Arc<rustls::sign::CertifiedKey>> {
self.resolve(client_hello.server_name()).map(|x| x.0)
}
@@ -317,7 +318,7 @@ impl CertResolver {
// a) Instead of multi-cert approach use single cert with extra
// domains listed in Subject Alternative Name (SAN).
// b) Deploy separate proxy instances for extra domains.
self.default.as_ref().cloned()
self.default.clone()
}
}
}
@@ -559,7 +560,7 @@ impl RetryConfig {
match key {
"num_retries" => num_retries = Some(value.parse()?),
"base_retry_wait_duration" => {
base_retry_wait_duration = Some(humantime::parse_duration(value)?)
base_retry_wait_duration = Some(humantime::parse_duration(value)?);
}
"retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?),
unknown => bail!("unknown key: {unknown}"),

View File

@@ -1,11 +1,13 @@
use measured::FixedCardinalityLabel;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt::{self, Display};
use crate::auth::IpPattern;
use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
use crate::proxy::retry::CouldRetry;
use crate::RoleName;
/// Generic error response with human-readable description.
/// Note that we can't always present it to user as is.
@@ -22,16 +24,15 @@ impl ConsoleError {
self.status
.as_ref()
.and_then(|s| s.details.error_info.as_ref())
.map(|e| e.reason)
.unwrap_or(Reason::Unknown)
.map_or(Reason::Unknown, |e| e.reason)
}
pub fn get_user_facing_message(&self) -> String {
use super::provider::errors::REQUEST_FAILED;
self.status
.as_ref()
.and_then(|s| s.details.user_facing_message.as_ref())
.map(|m| m.message.clone().into())
.unwrap_or_else(|| {
.map_or_else(|| {
// Ask @neondatabase/control-plane for review before adding more.
match self.http_status_code {
http::StatusCode::NOT_FOUND => {
@@ -48,19 +49,18 @@ impl ConsoleError {
}
_ => REQUEST_FAILED.to_owned(),
}
})
}, |m| m.message.clone().into())
}
}
impl Display for ConsoleError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let msg = self
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let msg: &str = self
.status
.as_ref()
.and_then(|s| s.details.user_facing_message.as_ref())
.map(|m| m.message.as_ref())
.unwrap_or_else(|| &self.error);
write!(f, "{}", msg)
.map_or_else(|| self.error.as_ref(), |m| m.message.as_ref());
write!(f, "{msg}")
}
}
@@ -286,7 +286,7 @@ pub struct DatabaseInfo {
// Manually implement debug to omit sensitive info.
impl fmt::Debug for DatabaseInfo {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("DatabaseInfo")
.field("host", &self.host)
.field("port", &self.port)
@@ -343,6 +343,26 @@ impl ColdStartInfo {
}
}
#[derive(Debug, Deserialize, Clone)]
pub struct JwksRoleMapping {
pub roles: HashMap<RoleName, EndpointJwksResponse>,
}
#[derive(Debug, Deserialize, Clone)]
pub struct EndpointJwksResponse {
pub jwks: Vec<JwksSettings>,
}
#[derive(Debug, Deserialize, Clone)]
pub struct JwksSettings {
pub id: String,
pub project_id: ProjectIdInt,
pub branch_id: BranchIdInt,
pub jwks_url: url::Url,
pub provider_name: String,
pub jwt_audience: Option<String>,
}
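For orientation only, here is a hypothetical payload these new structs could deserialize; the field values are invented, and it assumes `RoleName`, `ProjectIdInt`, and `BranchIdInt` accept plain strings (as the interned-string deserializer elsewhere in this diff suggests) and that `url::Url` has serde support enabled:

    // Invented example data; the shape follows the struct definitions above.
    let json = serde_json::json!({
        "roles": {
            "authenticated": {
                "jwks": [{
                    "id": "jwks-0001",
                    "project_id": "example-project",
                    "branch_id": "br-example",
                    "jwks_url": "https://auth.example.com/.well-known/jwks.json",
                    "provider_name": "example-idp",
                    "jwt_audience": null
                }]
            }
        }
    });
    let _mapping: JwksRoleMapping = serde_json::from_value(json).expect("should deserialize");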
#[cfg(test)]
mod tests {
use super::*;
@@ -373,7 +393,7 @@ mod tests {
}
}
});
let _: KickSession = serde_json::from_str(&json.to_string())?;
let _: KickSession<'_> = serde_json::from_str(&json.to_string())?;
Ok(())
}


@@ -93,7 +93,8 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for MgmtHandler {
}
fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> {
let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
let resp: KickSession<'_> =
serde_json::from_str(query).context("Failed to parse query as json")?;
let span = info_span!("event", session_id = resp.session_id);
let _enter = span.enter();


@@ -26,7 +26,7 @@ use tracing::info;
pub mod errors {
use crate::{
console::messages::{self, ConsoleError, Reason},
error::{io_error, ReportableError, UserFacingError},
error::{io_error, ErrorKind, ReportableError, UserFacingError},
proxy::retry::CouldRetry,
};
use thiserror::Error;
@@ -51,21 +51,19 @@ pub mod errors {
impl ApiError {
/// Returns HTTP status code if it's the reason for failure.
pub fn get_reason(&self) -> messages::Reason {
use ApiError::*;
match self {
Console(e) => e.get_reason(),
_ => messages::Reason::Unknown,
ApiError::Console(e) => e.get_reason(),
ApiError::Transport(_) => messages::Reason::Unknown,
}
}
}
impl UserFacingError for ApiError {
fn to_string_client(&self) -> String {
use ApiError::*;
match self {
// To minimize risks, only select errors are forwarded to users.
Console(c) => c.get_user_facing_message(),
_ => REQUEST_FAILED.to_owned(),
ApiError::Console(c) => c.get_user_facing_message(),
ApiError::Transport(_) => REQUEST_FAILED.to_owned(),
}
}
}
@@ -73,57 +71,53 @@ pub mod errors {
impl ReportableError for ApiError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiError::Console(e) => {
use crate::error::ErrorKind::*;
match e.get_reason() {
Reason::RoleProtected => User,
Reason::ResourceNotFound => User,
Reason::ProjectNotFound => User,
Reason::EndpointNotFound => User,
Reason::BranchNotFound => User,
Reason::RateLimitExceeded => ServiceRateLimit,
Reason::NonDefaultBranchComputeTimeExceeded => User,
Reason::ActiveTimeQuotaExceeded => User,
Reason::ComputeTimeQuotaExceeded => User,
Reason::WrittenDataQuotaExceeded => User,
Reason::DataTransferQuotaExceeded => User,
Reason::LogicalSizeQuotaExceeded => User,
Reason::ConcurrencyLimitReached => ControlPlane,
Reason::LockAlreadyTaken => ControlPlane,
Reason::RunningOperations => ControlPlane,
Reason::Unknown => match &e {
ConsoleError {
http_status_code:
http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
..
} => crate::error::ErrorKind::User,
ConsoleError {
http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
error,
..
} if error.contains(
"compute time quota of non-primary branches is exceeded",
) =>
{
crate::error::ErrorKind::User
}
ConsoleError {
http_status_code: http::StatusCode::LOCKED,
error,
..
} if error.contains("quota exceeded")
|| error.contains("the limit for current plan reached") =>
{
crate::error::ErrorKind::User
}
ConsoleError {
http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
..
} => crate::error::ErrorKind::ServiceRateLimit,
ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
},
}
}
ApiError::Console(e) => match e.get_reason() {
Reason::RoleProtected => ErrorKind::User,
Reason::ResourceNotFound => ErrorKind::User,
Reason::ProjectNotFound => ErrorKind::User,
Reason::EndpointNotFound => ErrorKind::User,
Reason::BranchNotFound => ErrorKind::User,
Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::User,
Reason::ActiveTimeQuotaExceeded => ErrorKind::User,
Reason::ComputeTimeQuotaExceeded => ErrorKind::User,
Reason::WrittenDataQuotaExceeded => ErrorKind::User,
Reason::DataTransferQuotaExceeded => ErrorKind::User,
Reason::LogicalSizeQuotaExceeded => ErrorKind::User,
Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
Reason::RunningOperations => ErrorKind::ControlPlane,
Reason::Unknown => match &e {
ConsoleError {
http_status_code:
http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
..
} => crate::error::ErrorKind::User,
ConsoleError {
http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
error,
..
} if error
.contains("compute time quota of non-primary branches is exceeded") =>
{
crate::error::ErrorKind::User
}
ConsoleError {
http_status_code: http::StatusCode::LOCKED,
error,
..
} if error.contains("quota exceeded")
|| error.contains("the limit for current plan reached") =>
{
crate::error::ErrorKind::User
}
ConsoleError {
http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
..
} => crate::error::ErrorKind::ServiceRateLimit,
ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
},
},
ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
}
}
@@ -170,12 +164,11 @@ pub mod errors {
impl UserFacingError for GetAuthInfoError {
fn to_string_client(&self) -> String {
use GetAuthInfoError::*;
match self {
// We absolutely should not leak any secrets!
BadSecret => REQUEST_FAILED.to_owned(),
Self::BadSecret => REQUEST_FAILED.to_owned(),
// However, API might return a meaningful error.
ApiError(e) => e.to_string_client(),
Self::ApiError(e) => e.to_string_client(),
}
}
}
@@ -183,8 +176,8 @@ pub mod errors {
impl ReportableError for GetAuthInfoError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
GetAuthInfoError::BadSecret => crate::error::ErrorKind::ControlPlane,
GetAuthInfoError::ApiError(_) => crate::error::ErrorKind::ControlPlane,
Self::BadSecret => crate::error::ErrorKind::ControlPlane,
Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
}
}
}
@@ -213,17 +206,16 @@ pub mod errors {
impl UserFacingError for WakeComputeError {
fn to_string_client(&self) -> String {
use WakeComputeError::*;
match self {
// We shouldn't show user the address even if it's broken.
// Besides, user is unlikely to care about this detail.
BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
// However, API might return a meaningful error.
ApiError(e) => e.to_string_client(),
Self::ApiError(e) => e.to_string_client(),
TooManyConnections => self.to_string(),
Self::TooManyConnections => self.to_string(),
TooManyConnectionAttempts(_) => {
Self::TooManyConnectionAttempts(_) => {
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
}
}
@@ -233,10 +225,10 @@ pub mod errors {
impl ReportableError for WakeComputeError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
WakeComputeError::ApiError(e) => e.get_error_kind(),
WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit,
WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(),
Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
Self::ApiError(e) => e.get_error_kind(),
Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
Self::TooManyConnectionAttempts(e) => e.get_error_kind(),
}
}
}
@@ -244,10 +236,10 @@ pub mod errors {
impl CouldRetry for WakeComputeError {
fn could_retry(&self) -> bool {
match self {
WakeComputeError::BadComputeAddress(_) => false,
WakeComputeError::ApiError(e) => e.could_retry(),
WakeComputeError::TooManyConnections => false,
WakeComputeError::TooManyConnectionAttempts(_) => false,
Self::BadComputeAddress(_) => false,
Self::ApiError(e) => e.could_retry(),
Self::TooManyConnections => false,
Self::TooManyConnectionAttempts(_) => false,
}
}
}
@@ -313,6 +305,7 @@ impl NodeInfo {
match keys {
ComputeCredentialKeys::Password(password) => self.config.password(password),
ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
ComputeCredentialKeys::None => &mut self.config,
};
}
}
@@ -366,13 +359,14 @@ impl Api for ConsoleBackend {
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
use ConsoleBackend::*;
match self {
Console(api) => api.get_role_secret(ctx, user_info).await,
Self::Console(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Postgres(api) => api.get_role_secret(ctx, user_info).await,
Self::Postgres(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(test)]
Test(_) => unreachable!("this function should never be called in the test backend"),
Self::Test(_) => {
unreachable!("this function should never be called in the test backend")
}
}
}
@@ -381,13 +375,12 @@ impl Api for ConsoleBackend {
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
use ConsoleBackend::*;
match self {
Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
Self::Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
Self::Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(test)]
Test(api) => api.get_allowed_ips_and_secret(),
Self::Test(api) => api.get_allowed_ips_and_secret(),
}
}
@@ -396,14 +389,12 @@ impl Api for ConsoleBackend {
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
use ConsoleBackend::*;
match self {
Console(api) => api.wake_compute(ctx, user_info).await,
Self::Console(api) => api.wake_compute(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Postgres(api) => api.wake_compute(ctx, user_info).await,
Self::Postgres(api) => api.wake_compute(ctx, user_info).await,
#[cfg(test)]
Test(api) => api.wake_compute(),
Self::Test(api) => api.wake_compute(),
}
}
}
@@ -549,7 +540,7 @@ impl WakeComputePermit {
!self.permit.is_disabled()
}
pub fn release(self, outcome: Outcome) {
self.permit.release(outcome)
self.permit.release(outcome);
}
pub fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
match res {


@@ -64,7 +64,7 @@ impl Api {
tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
tokio::spawn(connection);
let secret = match get_execute_postgres_query(
let secret = if let Some(entry) = get_execute_postgres_query(
&client,
"select rolpassword from pg_catalog.pg_authid where rolname = $1",
&[&&*user_info.user],
@@ -72,15 +72,12 @@ impl Api {
)
.await?
{
Some(entry) => {
info!("got a secret: {entry}"); // safe since it's not a prod scenario
let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram);
secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5))
}
None => {
warn!("user '{}' does not exist", user_info.user);
None
}
info!("got a secret: {entry}"); // safe since it's not a prod scenario
let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram);
secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5))
} else {
warn!("user '{}' does not exist", user_info.user);
None
};
let allowed_ips = match get_execute_postgres_query(
&client,
@@ -142,12 +139,11 @@ async fn get_execute_postgres_query(
let rows = client.query(query, params).await?;
// We can get at most one row, because `rolname` is unique.
let row = match rows.first() {
Some(row) => row,
let Some(row) = rows.first() else {
// This means that the user doesn't exist, so there can be no secret.
// However, this is still a *valid* outcome which is very similar
// to getting `404 Not found` from the Neon console.
None => return Ok(None),
return Ok(None);
};
let entry = row.try_get(idx).map_err(MockApiError::PasswordNotSet)?;


@@ -38,9 +38,9 @@ impl Api {
locks: &'static ApiLocks<EndpointCacheKey>,
wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
) -> Self {
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
let jwt = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
Ok(v) => v,
Err(_) => "".to_string(),
Err(_) => String::new(),
};
Self {
endpoint,
@@ -96,10 +96,10 @@ impl Api {
// Error 404 is special: it's ok not to have a secret.
// TODO(anna): retry
Err(e) => {
if e.get_reason().is_not_found() {
return Ok(AuthInfo::default());
return if e.get_reason().is_not_found() {
Ok(AuthInfo::default())
} else {
return Err(e.into());
Err(e.into())
}
}
};


@@ -166,7 +166,7 @@ impl RequestMonitoring {
pub fn set_project(&self, x: MetricsAuxInfo) {
let mut this = self.0.try_lock().expect("should not deadlock");
if this.endpoint_id.is_none() {
this.set_endpoint_id(x.endpoint_id.as_str().into())
this.set_endpoint_id(x.endpoint_id.as_str().into());
}
this.branch = Some(x.branch_id);
this.project = Some(x.project_id);
@@ -260,7 +260,7 @@ impl RequestMonitoring {
.cold_start_info
}
pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause {
pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
LatencyTimerPause {
ctx: self,
start: tokio::time::Instant::now(),
@@ -273,7 +273,7 @@ impl RequestMonitoring {
.try_lock()
.expect("should not deadlock")
.latency_timer
.success()
.success();
}
}
@@ -328,7 +328,7 @@ impl RequestMonitoringInner {
fn has_private_peer_addr(&self) -> bool {
match self.peer_addr {
IpAddr::V4(ip) => ip.is_private(),
_ => false,
IpAddr::V6(_) => false,
}
}


@@ -736,7 +736,7 @@ mod tests {
while let Some(r) = s.next().await {
tx.send(r).unwrap();
}
time::sleep(time::Duration::from_secs(70)).await
time::sleep(time::Duration::from_secs(70)).await;
}
});


@@ -56,7 +56,7 @@ impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString<Id> {
impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor<Id> {
type Value = InternedString<Id>;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
formatter.write_str("a string")
}


@@ -1,6 +1,83 @@
#![deny(clippy::undocumented_unsafe_blocks)]
// rustc lints/lint groups
// https://doc.rust-lang.org/rustc/lints/groups.html
#![deny(
deprecated,
future_incompatible,
// TODO: consider let_underscore
nonstandard_style,
rust_2024_compatibility
)]
#![warn(clippy::all, clippy::pedantic, clippy::cargo)]
// List of denied lints from the clippy::restriction group.
// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction
#![warn(
clippy::undocumented_unsafe_blocks,
// TODO: Enable once all individual checks are enabled.
//clippy::as_conversions,
clippy::dbg_macro,
clippy::empty_enum_variants_with_brackets,
clippy::exit,
clippy::float_cmp_const,
clippy::lossy_float_literal,
clippy::macro_use_imports,
clippy::manual_ok_or,
// TODO: consider clippy::map_err_ignore
// TODO: consider clippy::mem_forget
clippy::rc_mutex,
clippy::rest_pat_in_fully_bound_structs,
clippy::string_add,
clippy::string_to_string,
clippy::todo,
// TODO: consider clippy::unimplemented
// TODO: consider clippy::unwrap_used
)]
// List of permanently allowed lints.
#![allow(
// It's ok to cast bool to u8, etc.
clippy::cast_lossless,
// Seems unavoidable.
clippy::multiple_crate_versions,
// While #[must_use] is a great feature, this check is too noisy.
clippy::must_use_candidate,
// Inline consts, structs, fns, imports, etc. are ok if they're used by
// the following statement(s).
clippy::items_after_statements,
)]
// List of temporarily allowed lints.
// TODO: Switch to expect() once stable with 1.81.
// TODO: fix code and reduce list or move to permanent list above.
#![allow(
clippy::cargo_common_metadata,
clippy::cast_possible_truncation,
clippy::cast_possible_wrap,
clippy::cast_precision_loss,
clippy::cast_sign_loss,
clippy::doc_markdown,
clippy::implicit_hasher,
clippy::inline_always,
clippy::match_same_arms,
clippy::match_wild_err_arm,
clippy::missing_errors_doc,
clippy::missing_panics_doc,
clippy::module_name_repetitions,
clippy::needless_pass_by_value,
clippy::needless_raw_string_hashes,
clippy::redundant_closure_for_method_calls,
clippy::return_self_not_must_use,
clippy::similar_names,
clippy::single_match_else,
clippy::struct_excessive_bools,
clippy::struct_field_names,
clippy::too_many_lines,
clippy::unreadable_literal,
clippy::unused_async,
clippy::unused_self,
clippy::wildcard_imports
)]
// List of temporarily allowed lints to unblock beta/nightly.
#![allow(unknown_lints, clippy::manual_inspect)]
use std::convert::Infallible;
use std::{convert::Infallible, future::Future};
use anyhow::{bail, Context};
use intern::{EndpointIdInt, EndpointIdTag, InternId};
@@ -35,7 +112,14 @@ pub mod usage_metrics;
pub mod waiters;
/// Handle unix signals appropriately.
pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallible> {
pub async fn handle_signals<F, Fut>(
token: CancellationToken,
mut refresh_config: F,
) -> anyhow::Result<Infallible>
where
F: FnMut() -> Fut,
Fut: Future<Output = anyhow::Result<()>>,
{
use tokio::signal::unix::{signal, SignalKind};
let mut hangup = signal(SignalKind::hangup())?;
@@ -46,7 +130,8 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallib
tokio::select! {
// Hangup is commonly used for config reload.
_ = hangup.recv() => {
warn!("received SIGHUP; config reload is not supported");
warn!("received SIGHUP");
refresh_config().await?;
}
// Shut down the whole application.
_ = interrupt.recv() => {
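A minimal sketch of how a caller might adopt the new signature, with a placeholder `refresh_config` closure; the setup around it is assumed, not taken from this diff:

    use tokio_util::sync::CancellationToken;

    let token = CancellationToken::new();
    // The closure is invoked on every SIGHUP; real callers would re-read
    // certificates, rate limits, etc. here instead of the placeholder body.
    let signals_task = tokio::spawn(handle_signals(token.clone(), || async {
        anyhow::Ok(())
    }));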

Some files were not shown because too many files have changed in this diff.