Merge pull request #6973 from neondatabase/rc/2024-02-29-manual

Release 2024-02-29
libs: fix expired token in auth decode test (#6963 )
2026-05-21 23:20:40 +00:00 · 2024-02-29 17:26:33 +00:00 · 2024-02-29 17:23:25 +00:00 · 2024-02-29 16:39:52 +00:00 · 2024-02-28 15:24:35 +00:00 · 2024-02-28 14:53:35 +00:00
106 changed files with 6479 additions and 1419 deletions
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -62,7 +62,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
@@ -214,7 +214,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    # Increase timeout to 8h, default timeout is 6h
@@ -362,7 +362,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
@@ -461,7 +461,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
@@ -558,7 +558,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -0,0 +1,105 @@
+name: Build build-tools image
+
+on:
+  workflow_call:
+    inputs:
+      image-tag:
+        description: "build-tools image tag"
+        required: true
+        type: string
+    outputs:
+      image-tag:
+        description: "build-tools tag"
+        value: ${{ inputs.image-tag }}
+      image:
+        description: "build-tools image"
+        value: neondatabase/build-tools:${{ inputs.image-tag }}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+concurrency:
+  group: build-build-tools-image-${{ inputs.image-tag }}
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+jobs:
+  check-image:
+    uses: ./.github/workflows/check-build-tools-image.yml
+
+  # This job uses older versions of the actions because it runs on gen2 runners, which don't support Node 20 (needed by the newer versions)
+  build-image:
+    needs: [ check-image ]
+    if: needs.check-image.outputs.found == 'false'
+
+    strategy:
+      matrix:
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
+
+    env:
+      IMAGE_TAG: ${{ inputs.image-tag }}
+
+    steps:
+      - name: Check `inputs.image-tag` is correct  # must equal the tag reported by the check-image job
+        env:
+          INPUTS_IMAGE_TAG: ${{ inputs.image-tag }}
+          CHECK_IMAGE_TAG: ${{ needs.check-image.outputs.image-tag }}
+        run: |
+          if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then
+            echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the tag of the latest build-tools image 'needs.check-image.outputs.image-tag' (${CHECK_IMAGE_TAG})"
+            exit 1
+          fi
+
+      - uses: actions/checkout@v3
+
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p /tmp/.docker-custom
+          echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
+
+      - uses: docker/setup-buildx-action@v2
+
+      - uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - uses: docker/build-push-action@v4
+        with:
+          context: .
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.build-tools
+          cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }}
+          cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max
+          tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
+
+      - name: Remove custom docker config directory
+        if: always()  # run even after a failed step so the custom DOCKER_CONFIG dir is not left on the self-hosted runner
+        run: rm -rf /tmp/.docker-custom
+
+  merge-images:
+    needs: [ build-image ]
+    runs-on: ubuntu-latest
+
+    env:
+      IMAGE_TAG: ${{ inputs.image-tag }}
+
+    steps:
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Create multi-arch image
+        run: |
+          docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \
+                                             neondatabase/build-tools:${IMAGE_TAG}-x64 \
+                                             neondatabase/build-tools:${IMAGE_TAG}-arm64
--- a/.github/workflows/build_and_push_docker_image.yml
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -1,124 +0,0 @@
-name: Build and Push Docker Image
-
-on:
-  workflow_call:
-    inputs:
-      dockerfile-path:
-        required: true
-        type: string
-      image-name:
-        required: true
-        type: string
-    outputs:
-      build-tools-tag:
-        description: "tag generated for build tools"
-        value: ${{ jobs.tag.outputs.build-tools-tag }}
-
-jobs:
-  check-if-build-tools-dockerfile-changed:
-    runs-on: ubuntu-latest
-    outputs:
-      docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
-    steps:
-      - name: Check if Dockerfile.buildtools has changed
-        id: dockerfile
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
-            echo "docker_file_changed=false" >> $GITHUB_OUTPUT
-            exit
-          fi
-          updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
-          if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
-            echo "docker_file_changed=true" >> $GITHUB_OUTPUT
-          fi
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-  tag:
-    runs-on: ubuntu-latest
-    needs: [ check-if-build-tools-dockerfile-changed ]
-    outputs:
-      build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
-
-    steps:
-      - name: Get buildtools tag
-        env:
-          DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
-            IMAGE_TAG=$GITHUB_RUN_ID
-          else
-            IMAGE_TAG=pinned
-          fi
-
-          echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
-        shell: bash
-        id: buildtools-tag
-
-  kaniko:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    needs: [ tag, check-if-build-tools-dockerfile-changed ]
-    runs-on: [ self-hosted, dev, x64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: |
-          /kaniko/executor \
-            --reproducible \
-            --snapshotMode=redo \
-            --skip-unused-stages \
-            --dockerfile ${{ inputs.dockerfile-path }} \
-            --cache=true \
-            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
-
-  kaniko-arm:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    needs: [ tag, check-if-build-tools-dockerfile-changed ]
-    runs-on: [ self-hosted, dev, arm64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: |
-          /kaniko/executor \
-            --reproducible \
-            --snapshotMode=redo \
-            --skip-unused-stages \
-            --dockerfile ${{ inputs.dockerfile-path }} \
-            --cache=true \
-            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
-
-  manifest:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    name: 'manifest'
-    runs-on: [ self-hosted, dev, x64 ]
-    needs:
-      - tag
-      - kaniko
-      - kaniko-arm
-      - check-if-build-tools-dockerfile-changed
-
-    steps:
-      - name: Create manifest
-        run: |
-          docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
-                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
-                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
-
-      - name: Push manifest
-        run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -77,19 +77,25 @@ jobs:
        shell: bash
        id: build-tag

-  build-buildtools-image:
+  check-build-tools-image:
    needs: [ check-permissions ]
-    uses: ./.github/workflows/build_and_push_docker_image.yml
+    uses: ./.github/workflows/check-build-tools-image.yml
+
+  build-build-tools-image:
+    needs: [ check-build-tools-image ]
+    uses: ./.github/workflows/build-build-tools-image.yml
    with:
-      dockerfile-path: Dockerfile.buildtools
-      image-name: build-tools
+      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
    secrets: inherit

  check-codestyle-python:
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions, build-build-tools-image ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
@@ -118,10 +124,13 @@ jobs:
        run: poetry run mypy .

  check-codestyle-rust:
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions, build-build-tools-image ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
@@ -185,10 +194,13 @@ jobs:
        run: cargo deny check --hide-inclusion-graph

  build-neon:
-    needs: [ check-permissions, tag, build-buildtools-image ]
+    needs: [ check-permissions, tag, build-build-tools-image ]
    runs-on: [ self-hosted, gen3, large ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      # Raise locked memory limit for tokio-epoll-uring.
      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
      # io_uring will account the memory of the CQ and SQ as locked.
@@ -426,10 +438,13 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
+    needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      # for changed limits, see comments on `options:` earlier in this file
      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
    strategy:
@@ -473,10 +488,13 @@ jobs:
  get-benchmarks-durations:
    outputs:
      json: ${{ steps.get-benchmark-durations.outputs.json }}
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions, build-build-tools-image ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    steps:
@@ -503,10 +521,13 @@ jobs:
          echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT

  benchmarks:
-    needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
+    needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      # for changed limits, see comments on `options:` earlier in this file
      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -538,12 +559,15 @@ jobs:
      # while coverage is currently collected for the debug ones

  create-test-report:
-    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}

    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
@@ -584,10 +608,13 @@ jobs:
            })

  coverage-report:
-    needs: [ check-permissions, regress-tests, build-buildtools-image ]
+    needs: [ check-permissions, regress-tests, build-build-tools-image ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init
    strategy:
      fail-fast: false
@@ -691,7 +718,7 @@ jobs:
    secrets: inherit

  neon-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, build-build-tools-image, tag ]
    runs-on: [ self-hosted, gen3, large ]

    steps:
@@ -726,8 +753,7 @@ jobs:
          build-args: |
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
          provenance: false
          push: true
          pull: true
@@ -743,61 +769,8 @@ jobs:
        run: |
          rm -rf .docker-custom

-  compute-tools-image:
-    runs-on: [ self-hosted, gen3, large ]
-    needs: [ check-permissions, build-buildtools-image, tag ]
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-          fetch-depth: 0
-
-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
-
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
-      - uses: docker/login-action@v3
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-      - uses: docker/build-push-action@v5
-        with:
-          context: .
-          build-args: |
-            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-            BUILD_TAG=${{needs.tag.outputs.build-tag}}
-            TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
-            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          provenance: false
-          push: true
-          pull: true
-          file: Dockerfile.compute-tools
-          cache-from: type=registry,ref=neondatabase/compute-tools:cache
-          cache-to: type=registry,ref=neondatabase/compute-tools:cache,mode=max
-          tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
-            neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
-
-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
-
  compute-node-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, build-build-tools-image, tag ]
    runs-on: [ self-hosted, gen3, large ]

    strategy:
@@ -837,15 +810,15 @@ jobs:
          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
          password: ${{ secrets.AWS_SECRET_KEY_DEV }}

-      - uses: docker/build-push-action@v5
+      - name: Build compute-node image
+        uses: docker/build-push-action@v5
        with:
          context: .
          build-args: |
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            PG_VERSION=${{ matrix.version }}
-            BUILD_TAG=${{needs.tag.outputs.build-tag}}
-            TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
-            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
          provenance: false
          push: true
          pull: true
@@ -856,6 +829,25 @@ jobs:
            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
            neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

+      - name: Build compute-tools image
+        # compute-tools is Postgres-independent, so build it only once
+        if: ${{ matrix.version == 'v16' }}
+        uses: docker/build-push-action@v5
+        with:
+          target: compute-tools-image
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.compute-node
+          tags: |
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
+            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
+
      - name: Remove custom docker config directory
        if: always()
        run: |
@@ -903,7 +895,7 @@ jobs:
          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

  test-images:
-    needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
+    needs: [ check-permissions, tag, neon-image, compute-node-image ]
    runs-on: [ self-hosted, gen3, small ]

    steps:
@@ -937,7 +929,8 @@ jobs:
          fi

      - name: Verify docker-compose example
-        run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
+        timeout-minutes: 20
+        run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh

      - name: Print logs and clean up
        if: always()
@@ -1217,3 +1210,11 @@ jobs:

            time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
          done
+
+  pin-build-tools-image:
+    needs: [ build-build-tools-image, promote-images, regress-tests ]
+    if: github.ref_name == 'main'
+    uses: ./.github/workflows/pin-build-tools-image.yml
+    with:
+      from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
+    secrets: inherit
--- a/.github/workflows/check-build-tools-image.yml
+++ b/.github/workflows/check-build-tools-image.yml
@@ -0,0 +1,58 @@
+name: Check build-tools image
+
+on:
+  workflow_call:
+    outputs:
+      image-tag:
+        description: "build-tools image tag"
+        value: ${{ jobs.check-image.outputs.tag }}
+      found:
+        description: "Whether the image is found in the registry"
+        value: ${{ jobs.check-image.outputs.found }}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+jobs:
+  check-image:
+    runs-on: ubuntu-latest
+    outputs:
+      tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
+      found: ${{ steps.check-image.outputs.found }}
+
+    steps:
+      - name: Get build-tools image tag for the current commit
+        id: get-build-tools-tag
+        env:
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          LAST_BUILD_TOOLS_SHA=$(
+            gh api \
+              -H "Accept: application/vnd.github+json" \
+              -H "X-GitHub-Api-Version: 2022-11-28" \
+              --method GET \
+              --field path=Dockerfile.build-tools \
+              --field sha=${COMMIT_SHA} \
+              --field per_page=1 \
+              --jq ".[0].sha" \
+              "/repos/${GITHUB_REPOSITORY}/commits"
+          )
+          echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT
+
+      - name: Check if such tag found in the registry
+        id: check-image
+        env:
+          IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}
+        run: |
+          if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then
+            found=true
+          else
+            found=false
+          fi
+
+          echo "found=${found}" | tee -a $GITHUB_OUTPUT
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -26,6 +26,17 @@ jobs:
    with:
      github-event-name: ${{ github.event_name}}

+  check-build-tools-image:
+    needs: [ check-permissions ]
+    uses: ./.github/workflows/check-build-tools-image.yml
+
+  build-build-tools-image:
+    needs: [ check-build-tools-image ]
+    uses: ./.github/workflows/build-build-tools-image.yml
+    with:
+      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
+    secrets: inherit
+
  check-macos-build:
    needs: [ check-permissions ]
    if: |
@@ -123,7 +134,7 @@ jobs:
        run: ./run_clippy.sh

  check-linux-arm-build:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, build-build-tools-image ]
    timeout-minutes: 90
    runs-on: [ self-hosted, dev, arm64 ]

@@ -137,7 +148,10 @@ jobs:
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
@@ -244,12 +258,15 @@ jobs:
          cargo nextest run --package remote_storage --test test_real_azure

  check-codestyle-rust-arm:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, build-build-tools-image ]
    timeout-minutes: 90
    runs-on: [ self-hosted, dev, arm64 ]

    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
@@ -316,14 +333,17 @@ jobs:
        run: cargo deny check

  gather-rust-build-stats:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, build-build-tools-image ]
    if: |
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
      github.ref_name == 'main'
    runs-on: [ self-hosted, gen3, large ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    env:
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -0,0 +1,72 @@
+name: 'Pin build-tools image'
+
+on:
+  workflow_dispatch:
+    inputs:
+      from-tag:
+        description: 'Source tag'
+        required: true
+        type: string
+  workflow_call:
+    inputs:
+      from-tag:
+        description: 'Source tag'
+        required: true
+        type: string
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+concurrency:
+  group: pin-build-tools-image-${{ inputs.from-tag }}
+
+permissions: {}
+
+jobs:
+  tag-image:
+    runs-on: ubuntu-latest
+
+    env:
+      FROM_TAG: ${{ inputs.from-tag }}
+      TO_TAG: pinned
+
+    steps:
+      - name: Check if we really need to pin the image
+        id: check-manifests
+        run: |
+          docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json
+          docker manifest inspect neondatabase/build-tools:${TO_TAG}   > ${TO_TAG}.json
+
+          if diff ${FROM_TAG}.json ${TO_TAG}.json; then
+            skip=true
+          else
+            skip=false
+          fi
+
+          echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
+
+      - uses: docker/login-action@v3
+        if: steps.check-manifests.outputs.skip == 'false'
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
+        if: steps.check-manifests.outputs.skip == 'false'
+        run: |
+          docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
+                                             neondatabase/build-tools:${FROM_TAG}
+
+      - uses: docker/login-action@v3
+        if: steps.check-manifests.outputs.skip == 'false'
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
+        if: steps.check-manifests.outputs.skip == 'false'
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
+                                             neondatabase/build-tools:${FROM_TAG}
--- a/.github/workflows/update_build_tools_image.yml
+++ b/.github/workflows/update_build_tools_image.yml
@@ -1,70 +0,0 @@
-name: 'Update build tools image tag'
-
-# This workflow it used to update tag of build tools in ECR.
-# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
-
-on:
-  workflow_dispatch:
-    inputs:
-      from-tag:
-        description: 'Source tag'
-        required: true
-        type: string
-      to-tag:
-        description: 'Destination tag'
-        required: true
-        type: string
-        default: 'pinned'
-
-defaults:
-  run:
-    shell: bash -euo pipefail {0}
-
-permissions: {}
-
-jobs:
-  tag-image:
-    runs-on: [ self-hosted, gen3, small ]
-
-    env:
-      ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-
-    steps:
-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-
-      - uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
-      - uses: docker/login-action@v2
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-      - uses: actions/setup-go@v5
-        with:
-          go-version: '1.21'
-
-      - name: Install crane
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
-
-      - name: Copy images
-        run: |
-          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
-          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
-
-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ test_output/
 neon.iml
 /.neon
 /integration_tests/.neon
+compaction-suite-results.*

 # Coverage
 *.profraw
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -74,16 +74,11 @@ We're using the following approach to make it work:

 For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)

-## How do I add the "pinned" tag to an buildtools image?
-We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
+## How do I make the build-tools image "pinned"?

-You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
-or using GitHub CLI:
+It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow.

 ```bash
-gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-            -f from-tag=6254913013 \
-            -f to-tag=pinned \
-
-# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
-```
+gh workflow -R neondatabase/neon run pin-build-tools-image.yml \
+            -f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e
+```
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3498,6 +3498,7 @@ dependencies = [
 "num_cpus",
 "once_cell",
 "pageserver_api",
+ "pageserver_compaction",
 "pin-project-lite",
 "postgres",
 "postgres-protocol",
@@ -3588,6 +3589,53 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "pageserver_compaction"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-compression",
+ "async-stream",
+ "async-trait",
+ "byteorder",
+ "bytes",
+ "chrono",
+ "clap",
+ "const_format",
+ "consumption_metrics",
+ "criterion",
+ "crossbeam-utils",
+ "either",
+ "fail",
+ "flate2",
+ "futures",
+ "git-version",
+ "hex",
+ "hex-literal",
+ "humantime",
+ "humantime-serde",
+ "itertools",
+ "metrics",
+ "once_cell",
+ "pageserver_api",
+ "pin-project-lite",
+ "rand 0.8.5",
+ "smallvec",
+ "svg_fmt",
+ "sync_wrapper",
+ "thiserror",
+ "tokio",
+ "tokio-io-timeout",
+ "tokio-util",
+ "tracing",
+ "tracing-error",
+ "tracing-subscriber",
+ "url",
+ "utils",
+ "walkdir",
+ "workspace_hack",
+]
+
 [[package]]
 name = "parking"
 version = "2.1.1"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,6 +5,7 @@ members = [
    "control_plane",
    "control_plane/attachment_service",
    "pageserver",
+    "pageserver/compaction",
    "pageserver/ctl",
    "pageserver/client",
    "pageserver/pagebench",
@@ -199,6 +200,7 @@ consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
+pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -891,7 +891,17 @@ ENV BUILD_TAG=$BUILD_TAG
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
-RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
+RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
+
+#########################################################################################
+#
+# Final compute-tools image
+#
+#########################################################################################
+
+FROM debian:bullseye-slim AS compute-tools-image
+
+COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

 #########################################################################################
 #
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -1,32 +0,0 @@
-# First transient image to build compute_tools binaries
-# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
-ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
-ARG TAG=pinned
-ARG BUILD_TAG
-
-FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
-WORKDIR /home/nonroot
-
-# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
-# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
-# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
-ARG RUSTC_WRAPPER=cachepot
-ENV AWS_REGION=eu-central-1
-ENV CACHEPOT_S3_KEY_PREFIX=cachepot
-ARG CACHEPOT_BUCKET=neon-github-dev
-#ARG AWS_ACCESS_KEY_ID
-#ARG AWS_SECRET_ACCESS_KEY
-ARG BUILD_TAG
-ENV BUILD_TAG=$BUILD_TAG
-
-COPY . .
-
-RUN set -e \
-    && mold -run cargo build -p compute_tools --locked --release \
-    && cachepot -s
-
-# Final image that only has one binary
-FROM debian:bullseye-slim
-
-COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.

 ## Quick start
-Try the [Neon Free Tier](https://neon.tech) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
+Try the [Neon Free Tier](https://neon.tech/github) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.

 Alternatively, compile and run the project [locally](#running-local-installation).

@@ -267,7 +267,7 @@ You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or th

 For cleaning up the source tree from build artifacts, run `make clean` in the source directory.

-For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directorz will remove your database, with all data in it. You have been warned!
+For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned!

 ## Documentation

--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -676,8 +676,15 @@ pub fn handle_grants(
                            GRANT CREATE ON SCHEMA public TO web_access;\n\
                        END IF;\n\
                    END IF;\n\
-                    ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
-                    ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
+                    IF EXISTS(\n\
+                        SELECT nspname\n\
+                        FROM pg_catalog.pg_namespace\n\
+                        WHERE nspname = 'public'\n\
+                    )\n\
+                    THEN\n\
+                        ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
+                        ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
+                    END IF;\n\
                END\n\
            $$;"
        .to_string();
--- a/control_plane/attachment_service/src/auth.rs
+++ b/control_plane/attachment_service/src/auth.rs
@@ -0,0 +1,9 @@
+use utils::auth::{AuthError, Claims, Scope};
+
+/// Grants access only when the token's scope equals `required_scope` exactly.
+pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> {
+    if claims.scope == required_scope {
+        return Ok(());
+    }
+    Err(AuthError("Scope mismatch. Permission denied".into()))
+}
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -10,8 +10,8 @@ use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
-use utils::auth::SwappableJwtAuth;
-use utils::http::endpoint::{auth_middleware, request_span};
+use utils::auth::{Scope, SwappableJwtAuth};
+use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
 use utils::http::request::{must_get_query_param, parse_request_param};
 use utils::id::{TenantId, TimelineId};

@@ -25,12 +25,12 @@ use utils::{
    id::NodeId,
 };

-use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};
-
-use control_plane::attachment_service::{
-    AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
-    TenantShardMigrateRequest,
+use pageserver_api::controller_api::{
+    NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
 };
+use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
+
+use control_plane::attachment_service::{AttachHookRequest, InspectRequest};

 /// State available to HTTP request handlers
 #[derive(Clone)]
@@ -64,6 +64,8 @@ fn get_state(request: &Request<Body>) -> &HttpState {

 /// Pageserver calls into this on startup, to learn which tenants it should attach
 async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::GenerationsApi)?;
+
    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
@@ -72,6 +74,8 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
 /// Pageserver calls into this before doing deletions, to confirm that it still
 /// holds the latest generation for the tenants with deletions enqueued
 async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::GenerationsApi)?;
+
    let validate_req = json_request::<ValidateRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(StatusCode::OK, state.service.validate(validate_req))
@@ -81,6 +85,8 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
 /// (in the real control plane this is unnecessary, because the same program is managing
 ///  generation numbers and doing attachments).
 async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
    let state = get_state(&req);

@@ -95,6 +101,8 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
 }

 async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let inspect_req = json_request::<InspectRequest>(&mut req).await?;

    let state = get_state(&req);
@@ -106,6 +114,8 @@ async fn handle_tenant_create(
    service: Arc<Service>,
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::PageServerApi)?;
+
    let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
    json_response(
        StatusCode::CREATED,
@@ -164,6 +174,8 @@ async fn handle_tenant_location_config(
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
    let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
    json_response(
        StatusCode::OK,
@@ -178,6 +190,8 @@ async fn handle_tenant_time_travel_remote_storage(
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
    let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;

    let timestamp_raw = must_get_query_param(&req, "travel_to")?;
@@ -211,6 +225,7 @@ async fn handle_tenant_delete(
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;

    deletion_wrapper(service, move |service| async move {
        service.tenant_delete(tenant_id).await
@@ -223,6 +238,8 @@ async fn handle_tenant_timeline_create(
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
    let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
    json_response(
        StatusCode::CREATED,
@@ -237,6 +254,8 @@ async fn handle_tenant_timeline_delete(
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;

    deletion_wrapper(service, move |service| async move {
@@ -250,6 +269,7 @@ async fn handle_tenant_timeline_passthrough(
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;

    let Some(path) = req.uri().path_and_query() else {
        // This should never happen, our request router only calls us if there is a path
@@ -293,11 +313,15 @@ async fn handle_tenant_locate(
    service: Arc<Service>,
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }

 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
    let state = get_state(&req);
    state.service.node_register(register_req).await?;
@@ -305,17 +329,23 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
 }

 async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let state = get_state(&req);
    json_response(StatusCode::OK, state.service.node_list().await?)
 }

 async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let state = get_state(&req);
    let node_id: NodeId = parse_request_param(&req, "node_id")?;
    json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
 }

 async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let node_id: NodeId = parse_request_param(&req, "node_id")?;
    let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
    if node_id != config_req.node_id {
@@ -335,6 +365,8 @@ async fn handle_tenant_shard_split(
    service: Arc<Service>,
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;

@@ -348,6 +380,8 @@ async fn handle_tenant_shard_migrate(
    service: Arc<Service>,
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
    let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
    json_response(
@@ -360,22 +394,30 @@ async fn handle_tenant_shard_migrate(

 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
    let state = get_state(&req);

    json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
 }

 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let state = get_state(&req);
    state.service.tenants_dump()
 }

 async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let state = get_state(&req);
    state.service.scheduler_dump()
 }

 async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let state = get_state(&req);

    json_response(StatusCode::OK, state.service.consistency_check().await?)
@@ -432,6 +474,12 @@ where
    .await
 }

+/// Runs `crate::auth::check_permission` for `required_scope` against the claims of `request`.
+fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
+    let verify_scope = |claims: &_| crate::auth::check_permission(claims, required_scope);
+    check_permission_with(request, verify_scope)
+}
+
 pub fn make_router(
    service: Arc<Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -1,6 +1,7 @@
 use serde::{Deserialize, Serialize};
 use utils::seqwait::MonotonicCounter;

+mod auth;
 mod compute_hook;
 pub mod http;
 pub mod metrics;
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -1,4 +1,4 @@
-use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
 use serde::Serialize;
 use utils::id::NodeId;

--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -6,10 +6,10 @@ use std::time::Duration;
 use self::split_state::SplitState;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
-use control_plane::attachment_service::NodeSchedulingPolicy;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
+use pageserver_api::controller_api::NodeSchedulingPolicy;
 use pageserver_api::models::TenantConfig;
 use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
 use serde::{Deserialize, Serialize};
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,6 +1,6 @@
 use crate::persistence::Persistence;
 use crate::service;
-use control_plane::attachment_service::NodeAvailability;
+use pageserver_api::controller_api::NodeAvailability;
 use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -255,7 +255,7 @@ impl Scheduler {
 pub(crate) mod test_utils {

    use crate::node::Node;
-    use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+    use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
    use std::collections::HashMap;
    use utils::id::NodeId;
    /// Test helper: synthesize the requested number of nodes, all in active state.
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -9,19 +9,17 @@ use std::{

 use anyhow::Context;
 use control_plane::attachment_service::{
-    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, NodeAvailability,
-    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, TenantCreateResponse,
-    TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
-    TenantShardMigrateRequest, TenantShardMigrateResponse,
+    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
 };
 use diesel::result::DatabaseErrorKind;
 use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
+use pageserver_api::controller_api::{
+    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
+    TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
+    TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
+};
 use pageserver_api::{
-    control_api::{
-        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
-        ValidateResponse, ValidateResponseTenant,
-    },
    models::{
        self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters,
        TenantConfig, TenantCreateRequest, TenantLocationConfigRequest,
@@ -29,6 +27,10 @@ use pageserver_api::{
        TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo,
    },
    shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
+    upcall_api::{
+        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
+        ValidateResponse, ValidateResponseTenant,
+    },
 };
 use pageserver_client::mgmt_api;
 use tokio_util::sync::CancellationToken;
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -1,7 +1,7 @@
 use std::{collections::HashMap, sync::Arc, time::Duration};

 use crate::{metrics, persistence::TenantShardPersistence};
-use control_plane::attachment_service::NodeAvailability;
+use pageserver_api::controller_api::NodeAvailability;
 use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
    shard::{ShardIdentity, TenantShardId},
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -2,8 +2,12 @@ use crate::{background_process, local_env::LocalEnv};
 use camino::{Utf8Path, Utf8PathBuf};
 use hyper::Method;
 use pageserver_api::{
+    controller_api::{
+        NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
+        TenantShardMigrateRequest, TenantShardMigrateResponse,
+    },
    models::{
-        ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
+        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
        TimelineCreateRequest, TimelineInfo,
    },
    shard::TenantShardId,
@@ -11,12 +15,12 @@ use pageserver_api::{
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::str::FromStr;
+use std::{fs, str::FromStr};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
 use utils::{
-    auth::{Claims, Scope},
+    auth::{encode_from_key_file, Claims, Scope},
    id::{NodeId, TenantId},
 };

@@ -24,7 +28,7 @@ pub struct AttachmentService {
    env: LocalEnv,
    listen: String,
    path: Utf8PathBuf,
-    jwt_token: Option<String>,
+    private_key: Option<Vec<u8>>,
    public_key: Option<String>,
    postgres_port: u16,
    client: reqwest::Client,
@@ -55,126 +59,6 @@ pub struct InspectResponse {
    pub attachment: Option<(u32, NodeId)>,
 }

-#[derive(Serialize, Deserialize)]
-pub struct TenantCreateResponseShard {
-    pub shard_id: TenantShardId,
-    pub node_id: NodeId,
-    pub generation: u32,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct TenantCreateResponse {
-    pub shards: Vec<TenantCreateResponseShard>,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct NodeRegisterRequest {
-    pub node_id: NodeId,
-
-    pub listen_pg_addr: String,
-    pub listen_pg_port: u16,
-
-    pub listen_http_addr: String,
-    pub listen_http_port: u16,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct NodeConfigureRequest {
-    pub node_id: NodeId,
-
-    pub availability: Option<NodeAvailability>,
-    pub scheduling: Option<NodeSchedulingPolicy>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantLocateResponseShard {
-    pub shard_id: TenantShardId,
-    pub node_id: NodeId,
-
-    pub listen_pg_addr: String,
-    pub listen_pg_port: u16,
-
-    pub listen_http_addr: String,
-    pub listen_http_port: u16,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct TenantLocateResponse {
-    pub shards: Vec<TenantLocateResponseShard>,
-    pub shard_params: ShardParameters,
-}
-
-/// Explicitly migrating a particular shard is a low level operation
-/// TODO: higher level "Reschedule tenant" operation where the request
-/// specifies some constraints, e.g. asking it to get off particular node(s)
-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantShardMigrateRequest {
-    pub tenant_shard_id: TenantShardId,
-    pub node_id: NodeId,
-}
-
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
-pub enum NodeAvailability {
-    // Normal, happy state
-    Active,
-    // Offline: Tenants shouldn't try to attach here, but they may assume that their
-    // secondary locations on this node still exist.  Newly added nodes are in this
-    // state until we successfully contact them.
-    Offline,
-}
-
-impl FromStr for NodeAvailability {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self::Active),
-            "offline" => Ok(Self::Offline),
-            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
-        }
-    }
-}
-
-/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
-/// type needs to be defined with diesel traits in there.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
-pub enum NodeSchedulingPolicy {
-    Active,
-    Filling,
-    Pause,
-    Draining,
-}
-
-impl FromStr for NodeSchedulingPolicy {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self::Active),
-            "filling" => Ok(Self::Filling),
-            "pause" => Ok(Self::Pause),
-            "draining" => Ok(Self::Draining),
-            _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
-        }
-    }
-}
-
-impl From<NodeSchedulingPolicy> for String {
-    fn from(value: NodeSchedulingPolicy) -> String {
-        use NodeSchedulingPolicy::*;
-        match value {
-            Active => "active",
-            Filling => "filling",
-            Pause => "pause",
-            Draining => "draining",
-        }
-        .to_string()
-    }
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantShardMigrateResponse {}
-
 impl AttachmentService {
    pub fn from_env(env: &LocalEnv) -> Self {
        let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
@@ -204,12 +88,11 @@ impl AttachmentService {
            .pageservers
            .first()
            .expect("Config is validated to contain at least one pageserver");
-        let (jwt_token, public_key) = match ps_conf.http_auth_type {
+        let (private_key, public_key) = match ps_conf.http_auth_type {
            AuthType::Trust => (None, None),
            AuthType::NeonJWT => {
-                let jwt_token = env
-                    .generate_auth_token(&Claims::new(None, Scope::PageServerApi))
-                    .unwrap();
+                let private_key_path = env.get_private_key_path();
+                let private_key = fs::read(private_key_path).expect("failed to read private key");

                // If pageserver auth is enabled, this implicitly enables auth for this service,
                // using the same credentials.
@@ -235,7 +118,7 @@ impl AttachmentService {
                } else {
                    std::fs::read_to_string(&public_key_path).expect("Can't read public key")
                };
-                (Some(jwt_token), Some(public_key))
+                (Some(private_key), Some(public_key))
            }
        };

@@ -243,7 +126,7 @@ impl AttachmentService {
            env: env.clone(),
            path,
            listen,
-            jwt_token,
+            private_key,
            public_key,
            postgres_port,
            client: reqwest::ClientBuilder::new()
@@ -397,7 +280,10 @@ impl AttachmentService {
        .into_iter()
        .map(|s| s.to_string())
        .collect::<Vec<_>>();
-        if let Some(jwt_token) = &self.jwt_token {
+        if let Some(private_key) = &self.private_key {
+            let claims = Claims::new(None, Scope::PageServerApi);
+            let jwt_token =
+                encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
            args.push(format!("--jwt-token={jwt_token}"));
        }

@@ -422,7 +308,7 @@ impl AttachmentService {
            )],
            background_process::InitialPidFile::Create(self.pid_file()),
            || async {
-                match self.status().await {
+                match self.ready().await {
                    Ok(_) => Ok(true),
                    Err(_) => Ok(false),
                }
@@ -468,6 +354,20 @@ impl AttachmentService {
        Ok(())
    }

+    /// Chooses the claims for a request from the first segment of `path`:
+    /// none for `status`/`ready`, `Admin` for `control`/`debug`,
+    /// `PageServerApi` for `v1`; any other prefix is an error.
+    fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
+        let category = path.split('/').next().unwrap_or(path);
+        let scope = match category {
+            "status" | "ready" => return Ok(None),
+            "control" | "debug" => Scope::Admin,
+            "v1" => Scope::PageServerApi,
+            _ => anyhow::bail!("Failed to determine claims for {}", path),
+        };
+        Ok(Some(Claims::new(None, scope)))
+    }
+
    /// Simple HTTP request wrapper for calling into attachment service
    async fn dispatch<RQ, RS>(
        &self,
@@ -493,11 +393,16 @@ impl AttachmentService {
        if let Some(body) = body {
            builder = builder.json(&body)
        }
-        if let Some(jwt_token) = &self.jwt_token {
-            builder = builder.header(
-                reqwest::header::AUTHORIZATION,
-                format!("Bearer {jwt_token}"),
-            );
+        if let Some(private_key) = &self.private_key {
+            // Sign a fresh token per request, scoped to what this endpoint requires.
+            if let Some(required_claims) = Self::get_claims_for_path(&path)? {
+                tracing::debug!("Got claims {:?} for path {}", required_claims, path);
+                let jwt_token = encode_from_key_file(&required_claims, private_key)?;
+                builder = builder.header(
+                    reqwest::header::AUTHORIZATION,
+                    format!("Bearer {jwt_token}"),
+                );
+            }
+        }

        let response = builder.send().await?;
@@ -617,8 +522,8 @@ impl AttachmentService {
    }

    #[instrument(skip(self))]
-    pub async fn status(&self) -> anyhow::Result<()> {
-        self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
+    pub async fn ready(&self) -> anyhow::Result<()> {
+        self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None)
            .await
    }

--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,14 +8,15 @@
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
 use compute_api::spec::ComputeMode;
-use control_plane::attachment_service::{
-    AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
-};
+use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::{InitForceMode, LocalEnv};
 use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::{broker, local_env};
+use pageserver_api::controller_api::{
+    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
+};
 use pageserver_api::models::{
    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -412,14 +412,17 @@ impl LocalEnv {

    // this function is used only for testing purposes in CLI e g generate tokens during init
    pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
-        let private_key_path = if self.private_key_path.is_absolute() {
+        let private_key_path = self.get_private_key_path();
+        let key_data = fs::read(private_key_path)?;
+        encode_from_key_file(claims, &key_data)
+    }
+
+    pub fn get_private_key_path(&self) -> PathBuf {
+        if self.private_key_path.is_absolute() {
            self.private_key_path.to_path_buf()
        } else {
            self.base_data_dir.join(&self.private_key_path)
-        };
-
-        let key_data = fs::read(private_key_path)?;
-        encode_from_key_file(claims, &key_data)
+        }
    }

    //
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -17,6 +17,7 @@ use std::time::Duration;
 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
 use futures::SinkExt;
+use pageserver_api::controller_api::NodeRegisterRequest;
 use pageserver_api::models::{
    self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
 };
@@ -30,7 +31,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
+use crate::attachment_service::AttachmentService;
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

@@ -115,7 +116,7 @@ impl PageServerNode {
            if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
                let jwt_token = self
                    .env
-                    .generate_auth_token(&Claims::new(None, Scope::PageServerApi))
+                    .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
                    .unwrap();
                overrides.push(format!("control_plane_api_token='{}'", jwt_token));
            }
@@ -352,6 +353,11 @@ impl PageServerNode {
                .remove("compaction_threshold")
                .map(|x| x.parse::<usize>())
                .transpose()?,
+            compaction_algorithm: settings
+                .remove("compaction_algorithm")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("Failed to parse 'compaction_algorithm' json")?,
            gc_horizon: settings
                .remove("gc_horizon")
                .map(|x| x.parse::<u64>())
@@ -455,6 +461,11 @@ impl PageServerNode {
                    .map(|x| x.parse::<usize>())
                    .transpose()
                    .context("Failed to parse 'compaction_threshold' as an integer")?,
+                compaction_algorithm: settings
+                    .remove("compaction_algorithm")
+                    .map(serde_json::from_str)
+                    .transpose()
+                    .context("Failed to parse 'compaction_algorithm' json")?,
                gc_horizon: settings
                    .remove("gc_horizon")
                    .map(|x| x.parse::<u64>())
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -70,6 +70,9 @@ Should only be used e.g. for status check/tenant creation/list.
 Should only be used e.g. for status check.
 Currently also used for connection from any pageserver to any safekeeper.

+"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
+
+"admin": Provides access to the control plane and admin APIs of the attachment service.

 ### CLI
 CLI generates a key pair during call to `neon_local init` with the following commands:
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -0,0 +1,129 @@
+use std::str::FromStr;
+
+/// Request/response types for the storage controller
+/// API (`/control/v1` prefix).  Implemented by the server
+/// in [`attachment_service::http`]
+use serde::{Deserialize, Serialize};
+use utils::id::NodeId;
+
+use crate::{models::ShardParameters, shard::TenantShardId};
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantCreateResponseShard {
+    pub shard_id: TenantShardId,
+    pub node_id: NodeId,
+    pub generation: u32,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantCreateResponse {
+    pub shards: Vec<TenantCreateResponseShard>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct NodeRegisterRequest {
+    pub node_id: NodeId,
+
+    pub listen_pg_addr: String,
+    pub listen_pg_port: u16,
+
+    pub listen_http_addr: String,
+    pub listen_http_port: u16,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct NodeConfigureRequest {
+    pub node_id: NodeId,
+
+    pub availability: Option<NodeAvailability>,
+    pub scheduling: Option<NodeSchedulingPolicy>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantLocateResponseShard {
+    pub shard_id: TenantShardId,
+    pub node_id: NodeId,
+
+    pub listen_pg_addr: String,
+    pub listen_pg_port: u16,
+
+    pub listen_http_addr: String,
+    pub listen_http_port: u16,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantLocateResponse {
+    pub shards: Vec<TenantLocateResponseShard>,
+    pub shard_params: ShardParameters,
+}
+
+/// Explicitly migrating a particular shard is a low level operation
+/// TODO: higher level "Reschedule tenant" operation where the request
+/// specifies some constraints, e.g. asking it to get off particular node(s)
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantShardMigrateRequest {
+    pub tenant_shard_id: TenantShardId,
+    pub node_id: NodeId,
+}
+
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+pub enum NodeAvailability {
+    // Normal, happy state
+    Active,
+    // Offline: Tenants shouldn't try to attach here, but they may assume that their
+    // secondary locations on this node still exist.  Newly added nodes are in this
+    // state until we successfully contact them.
+    Offline,
+}
+
+impl FromStr for NodeAvailability {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "active" => Ok(Self::Active),
+            "offline" => Ok(Self::Offline),
+            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
+        }
+    }
+}
+
+/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
+/// type needs to be defined with diesel traits in there.
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+pub enum NodeSchedulingPolicy {
+    Active,
+    Filling,
+    Pause,
+    Draining,
+}
+
+impl FromStr for NodeSchedulingPolicy {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "active" => Ok(Self::Active),
+            "filling" => Ok(Self::Filling),
+            "pause" => Ok(Self::Pause),
+            "draining" => Ok(Self::Draining),
+            _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
+        }
+    }
+}
+
+impl From<NodeSchedulingPolicy> for String {
+    fn from(value: NodeSchedulingPolicy) -> String {
+        use NodeSchedulingPolicy::*;
+        match value {
+            Active => "active",
+            Filling => "filling",
+            Pause => "pause",
+            Draining => "draining",
+        }
+        .to_string()
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantShardMigrateResponse {}
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -307,6 +307,7 @@ impl KeySpaceRandomAccum {
    }
 }

+#[inline(always)]
 pub fn key_range_size(key_range: &Range<Key>) -> u32 {
    let start = key_range.start;
    let end = key_range.end;
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -2,13 +2,14 @@
 #![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

-/// Public API types
-pub mod control_api;
+pub mod controller_api;
 pub mod key;
 pub mod keyspace;
 pub mod models;
 pub mod reltag;
 pub mod shard;
+/// Public API types
+pub mod upcall_api;

 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -272,6 +272,8 @@ pub struct TenantConfig {
    pub compaction_target_size: Option<u64>,
    pub compaction_period: Option<String>,
    pub compaction_threshold: Option<usize>,
+    // defer parsing compaction_algorithm, like eviction_policy
+    pub compaction_algorithm: Option<CompactionAlgorithm>,
    pub gc_horizon: Option<u64>,
    pub gc_period: Option<String>,
    pub image_creation_threshold: Option<usize>,
@@ -306,6 +308,13 @@ impl EvictionPolicy {
    }
 }

+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(tag = "kind")]
+pub enum CompactionAlgorithm {
+    Legacy,
+    Tiered,
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
    #[serde(with = "humantime_serde")]
--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -32,6 +32,8 @@ pub enum Scope {
    // The scope used by pageservers in upcalls to storage controller and cloud control plane
    #[serde(rename = "generations_api")]
    GenerationsApi,
+    // Allows access to control plane management API and some storage controller endpoints.
+    Admin,
 }

 /// JWT payload. See docs/authentication.md for the format
@@ -204,12 +206,11 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        //   "scope": "tenant",
        //   "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
        //   "iss": "neon.controlplane",
-        //   "exp": 1709200879,
        //   "iat": 1678442479
        // }
        // ```
        //
-        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
+        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw";

        // Check it can be validated with the public key
        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -73,6 +73,7 @@ url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
 pageserver_api.workspace = true
+pageserver_compaction.workspace = true
 postgres_connection.workspace = true
 postgres_ffi.workspace = true
 pq_proto.workspace = true
--- a/pageserver/compaction/Cargo.toml
+++ b/pageserver/compaction/Cargo.toml
@@ -0,0 +1,54 @@
+[package]
+name = "pageserver_compaction"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[features]
+default = []
+
+[dependencies]
+anyhow.workspace = true
+async-compression.workspace = true
+async-stream.workspace = true
+async-trait.workspace = true
+byteorder.workspace = true
+bytes.workspace = true
+chrono = { workspace = true, features = ["serde"] }
+clap = { workspace = true, features = ["string"] }
+const_format.workspace = true
+consumption_metrics.workspace = true
+crossbeam-utils.workspace = true
+either.workspace = true
+flate2.workspace = true
+fail.workspace = true
+futures.workspace = true
+git-version.workspace = true
+hex.workspace = true
+humantime.workspace = true
+humantime-serde.workspace = true
+itertools.workspace = true
+once_cell.workspace = true
+pageserver_api.workspace = true
+pin-project-lite.workspace = true
+rand.workspace = true
+smallvec = { workspace = true, features = ["write"] }
+svg_fmt.workspace = true
+sync_wrapper.workspace = true
+thiserror.workspace = true
+tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
+tokio-io-timeout.workspace = true
+tokio-util.workspace = true
+tracing.workspace = true
+tracing-error.workspace = true
+tracing-subscriber.workspace = true
+url.workspace = true
+walkdir.workspace = true
+metrics.workspace = true
+utils.workspace = true
+workspace_hack.workspace = true
+
+[dev-dependencies]
+criterion.workspace = true
+hex-literal.workspace = true
+tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
--- a/pageserver/compaction/TODO.md
+++ b/pageserver/compaction/TODO.md
@@ -0,0 +1,51 @@
+# TODO
+
+- If the key space can be perfectly partitioned at some key, perform planning on each
+  partition separately. For example, if we are compacting a level with layers like this:
+
+  ```
+              :
+  +--+ +----+ :  +------+
+  |  | |    | :  |      |
+  +--+ +----+ :  +------+
+              :
+  +-----+ +-+ : +--------+
+  |     | | | : |        |
+  +-----+ +-+ : +--------+
+              :
+  ```
+
+  At the dotted line, there is a natural split in the key space, such that all
+  layers are either on the left or the right of it. We can compact the
+  partitions separately.  We could choose to create image layers for one
+  partition but not the other one, for example.
+
+- All the layers don't have to be exactly the same size, we can choose to cut a
+  layer short or stretch it a little larger than the target size, if it helps
+  the overall system. We can help perfect partitions (see previous bullet point)
+  to happen more frequently, by choosing the cut points wisely. For example, try
+  to cut layers at boundaries of underlying image layers. And "snap to grid",
+  i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0.
+
+- Avoid rewriting layers when we'd just create an identical layer to an input
+  layer.
+
+- Parallelism. The code is already split up into planning and execution, so that
+  we first split up the compaction work into "Jobs", and then execute them.
+  It would be straightforward to execute multiple jobs in parallel.
+
+- Materialize extra pages in delta layers during compaction. This would reduce
+  read amplification. There has been the idea of partial image layers. Materializing
+  extra pages in the delta layers achieves the same goal, without introducing a new
+  concept.
+
+## Simulator
+
+- Expand the simulator for more workloads
+- Automate a test suite that runs the simulator with different workloads and
+  spits out a table of results
+- Model read amplification
+- More sanity checking. One idea is to keep a reference count of each
+  MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if
+  a MockRecord that is newer than PITR horizon is completely dropped. That would
+  indicate that the record was lost.
--- a/pageserver/compaction/src/bin/compaction-simulator.rs
+++ b/pageserver/compaction/src/bin/compaction-simulator.rs
@@ -0,0 +1,214 @@
+use clap::{Parser, Subcommand};
+use pageserver_compaction::simulator::MockTimeline;
+use rand::Rng;
+use std::io::Write;
+use std::path::{Path, PathBuf};
+use std::sync::OnceLock;
+
+use utils::project_git_version;
+
+project_git_version!(GIT_VERSION);
+
+#[derive(Parser)]
+#[command(
+    version = GIT_VERSION,
+    about = "Neon Pageserver compaction simulator",
+    long_about = "A developer tool to visualize and test compaction"
+)]
+#[command(propagate_version = true)]
+struct CliOpts {
+    #[command(subcommand)]
+    command: Commands,
+}
+
+#[derive(Subcommand)]
+enum Commands {
+    RunSuite,
+    Simulate(SimulateCmd),
+}
+
+#[derive(Clone, clap::ValueEnum)]
+enum Distribution {
+    Uniform,
+    HotCold,
+}
+
+/// Run a single simulated workload and write the results to disk
+#[derive(Parser)]
+struct SimulateCmd {
+    distribution: Distribution,
+
+    /// Number of records to digest
+    num_records: u64,
+    /// Record length in bytes
+    record_len: u64,
+
+    /// Logical database size in MB
+    logical_size: u64,
+}
+
+async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> {
+    let mut executor = MockTimeline::new();
+
+    // Convert the logical size in MB into a key range.
+    let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
+    //let key_range = u64::MIN..u64::MAX;
+    println!(
+        "starting simulation with key range {:016X}-{:016X}",
+        key_range.start, key_range.end
+    );
+
+    // Progress indicator: printed for the first record, every 10000th, and the last one.
+    let print_progress = |i| -> anyhow::Result<()> {
+        if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 {
+            print!(
+                "\ringested {} / {} records, {} MiB / {} MiB...",
+                i + 1,
+                cmd.num_records,
+                (i + 1) * cmd.record_len / (1024 * 1024),
+                cmd.num_records * cmd.record_len / (1024 * 1024),
+            );
+            std::io::stdout().flush()?;
+        }
+        Ok(())
+    };
+
+    match cmd.distribution {
+        Distribution::Uniform => {
+            for i in 0..cmd.num_records {
+                executor.ingest_uniform(1, cmd.record_len, &key_range)?;
+                executor.compact_if_needed().await?;
+
+                print_progress(i)?;
+            }
+        }
+        Distribution::HotCold => {
+            let splitpoint = key_range.start + (key_range.end - key_range.start) / 10;
+            let hot_key_range = 0..splitpoint;
+            let cold_key_range = splitpoint..key_range.end;
+
+            for i in 0..cmd.num_records {
+                let chosen_range = if rand::thread_rng().gen_bool(0.9) {
+                    &hot_key_range
+                } else {
+                    &cold_key_range
+                };
+                executor.ingest_uniform(1, cmd.record_len, chosen_range)?;
+                executor.compact_if_needed().await?;
+
+                print_progress(i)?;
+            }
+        }
+    }
+    println!("done!");
+    executor.flush_l0();
+    executor.compact_if_needed().await?;
+    let stats = executor.stats()?;
+
+    // Print the stats to stdout, and also to a file
+    print!("{stats}");
+    std::fs::write(results_path.join("stats.txt"), stats)?;
+
+    let animation_path = results_path.join("compaction-animation.html");
+    executor.draw_history(std::fs::File::create(&animation_path)?)?;
+    println!(
+        "animation: file://{}",
+        animation_path.canonicalize()?.display()
+    );
+
+    Ok(())
+}
+
+async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> {
+    std::fs::create_dir(results_path)?;
+
+    set_log_file(File::create(results_path.join("log"))?);
+    let result = simulate(workload, results_path).await;
+    set_log_stdout();
+    result
+}
+
+async fn run_suite() -> anyhow::Result<()> {
+    let top_results_path = PathBuf::from(format!(
+        "compaction-suite-results.{}",
+        std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs()
+    ));
+    std::fs::create_dir(&top_results_path)?;
+
+    let workload = SimulateCmd {
+        distribution: Distribution::Uniform,
+        // Generate 20 GB of WAL
+        record_len: 1_000,
+        num_records: 20_000_000,
+        // Logical size 5 GB
+        logical_size: 5_000,
+    };
+
+    run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?;
+
+    println!(
+        "All tests finished. Results in {}",
+        top_results_path.display()
+    );
+    Ok(())
+}
+
+use std::fs::File;
+use std::io::Stdout;
+use std::sync::Mutex;
+use tracing_subscriber::fmt::writer::EitherWriter;
+use tracing_subscriber::fmt::MakeWriter;
+
+static LOG_FILE: OnceLock<Mutex<EitherWriter<File, Stdout>>> = OnceLock::new();
+fn get_log_output() -> &'static Mutex<EitherWriter<File, Stdout>> {
+    LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout())))
+}
+
+fn set_log_file(f: File) {
+    *get_log_output().lock().unwrap() = EitherWriter::A(f);
+}
+
+fn set_log_stdout() {
+    *get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout());
+}
+
+fn init_logging() -> anyhow::Result<()> {
+    // We fall back to printing all spans at info-level or above if
+    // the RUST_LOG environment variable is not set.
+    let rust_log_env_filter = || {
+        tracing_subscriber::EnvFilter::try_from_default_env()
+            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
+    };
+
+    // NB: the order of the with() calls does not matter.
+    // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
+    use tracing_subscriber::prelude::*;
+    tracing_subscriber::registry()
+        .with({
+            let log_layer = tracing_subscriber::fmt::layer()
+                .with_target(false)
+                .with_ansi(false)
+                .with_writer(|| get_log_output().make_writer());
+            log_layer.with_filter(rust_log_env_filter())
+        })
+        .init();
+
+    Ok(())
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let cli = CliOpts::parse();
+    init_logging()?;
+
+    match cli.command {
+        Commands::Simulate(cmd) => {
+            // `simulate` writes its output files *into* this path, so it must be a directory.
+            let results_dir = PathBuf::from("/tmp/compactions.html");
+            std::fs::create_dir_all(&results_dir)?;
+            simulate(&cmd, &results_dir).await?;
+        }
+        Commands::RunSuite => run_suite().await?,
+    };
+    Ok(())
+}
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -0,0 +1,866 @@
+//! # Tiered compaction algorithm.
+//!
+//! Read all the input delta files, and write a new set of delta files that
+//! include all the input WAL records. See retile_deltas().
+//!
+//! In a "normal" LSM tree, you get to remove any values that are overwritten by
+//! later values, but in our system, we keep all the history. So the reshuffling
+//! doesn't remove any garbage, it just reshuffles the records to reduce read
+//! amplification, i.e. the number of files that you need to access to find the
+//! WAL records for a given key.
+//!
+//! If the new delta files would be very "narrow", i.e. each file would cover
+//! only a narrow key range, then we create a new set of image files
+//! instead. The current threshold is that if the estimated total size of the
+//! image layers is smaller than the size of the deltas, then we create image
+//! layers. That amounts to 2x storage amplification, and it means that the
+//! distance of image layers in LSN dimension is roughly equal to the logical
+//! database size. For example, if the logical database size is 10 GB, we would
+//! generate new image layers every 10 GB of WAL.
+use futures::StreamExt;
+use tracing::{debug, info};
+
+use std::collections::{HashSet, VecDeque};
+use std::ops::Range;
+
+use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
+use crate::interface::*;
+use utils::lsn::Lsn;
+
+use crate::identify_levels::identify_level;
+
+/// Main entry point to compaction.
+///
+/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on
+/// everything below that point, that needs compaction. The cutoff LSN must
+/// partition the layers so that there are no layers that span across that
+/// LSN. To start compaction at the top of the tree, pass the end LSN of the
+/// last written L0 layer.
+pub async fn compact_tiered<E: CompactionJobExecutor>(
+    executor: &mut E,
+    end_lsn: Lsn,
+    target_file_size: u64,
+    fanout: u64,
+    ctx: &E::RequestContext,
+) -> anyhow::Result<()> {
+    assert!(fanout >= 2);
+    // Start at L0
+    let mut current_level_no = 0;
+    let mut current_level_target_height = target_file_size;
+    loop {
+        // end LSN +1 to include possible image layers exactly at 'end_lsn'.
+        let all_layers = executor
+            .get_layers(
+                &(E::Key::MIN..E::Key::MAX),
+                &(Lsn(u64::MIN)..end_lsn + 1),
+                ctx,
+            )
+            .await?;
+        info!(
+            "Compacting L{}, total # of layers: {}",
+            current_level_no,
+            all_layers.len()
+        );
+
+        // Identify the range of LSNs that belong to this level. We assume that
+        // each file in this level spans an LSN range up to 1.75x target file
+        // size. That should give us enough slop that if we created a slightly
+        // oversized L0 layer, e.g. because flushing the in-memory layer was
+        // delayed for some reason, we don't consider the oversized layer to
+        // belong to L1. But not too much slop, that we don't accidentally
+        // "skip" levels.
+        let max_height = (current_level_target_height as f64 * 1.75) as u64;
+        let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else {
+            break;
+        };
+
+        // Calculate the height of this level. If the # of tiers exceeds the
+        // fanout parameter, it's time to compact it.
+        let depth = level.depth();
+        info!(
+            "Level {} identified as LSN range {}-{}: depth {}",
+            current_level_no, level.lsn_range.start, level.lsn_range.end, depth
+        );
+        for l in &level.layers {
+            debug!("LEVEL {} layer: {}", current_level_no, l.short_id());
+        }
+        if depth < fanout {
+            debug!(
+                level = current_level_no,
+                depth = depth,
+                fanout,
+                "too few deltas to compact"
+            );
+            break;
+        }
+
+        compact_level(
+            &level.lsn_range,
+            &level.layers,
+            executor,
+            target_file_size,
+            ctx,
+        )
+        .await?;
+        if current_level_target_height == u64::MAX {
+            break; // height has saturated, so it cannot grow for another level
+        }
+        current_level_no += 1;
+        current_level_target_height = current_level_target_height.saturating_mul(fanout);
+    }
+    Ok(())
+}
+
+/// Compact a single level: wrap the input layers, seed the job queue with one
+/// Divide job covering the whole level, and run the queue to completion.
+/// Returns the `next_level` flag that the executed jobs left in the state.
+async fn compact_level<E: CompactionJobExecutor>(
+    lsn_range: &Range<Lsn>,
+    layers: &[E::Layer],
+    executor: &mut E,
+    target_file_size: u64,
+    ctx: &E::RequestContext,
+) -> anyhow::Result<bool> {
+    // Wrap every input layer in a fragment so its deletion can be tracked.
+    let layer_fragments: Vec<LayerFragment<E>> = layers
+        .iter()
+        .map(|layer| LayerFragment::new(layer.clone()))
+        .collect();
+    let num_layers = layer_fragments.len();
+
+    // The level starts out as a single Divide job spanning the whole key space
+    // and every layer of the level.
+    let whole_level_job: CompactionJob<E> = CompactionJob {
+        key_range: E::Key::MIN..E::Key::MAX,
+        lsn_range: lsn_range.clone(),
+        strategy: CompactionStrategy::Divide,
+        input_layers: (0..num_layers).map(LayerId).collect(),
+        completed: false,
+    };
+
+    let mut state = LevelCompactionState {
+        target_file_size,
+        _lsn_range: lsn_range.clone(),
+        layers: layer_fragments,
+        jobs: vec![whole_level_job],
+        job_queue: vec![JobId(0)],
+        next_level: false,
+        executor,
+    };
+    state.execute(ctx).await?;
+
+    let need_next_level = state.next_level;
+    info!(
+        "compaction completed! Need to process next level: {}",
+        need_next_level
+    );
+
+    Ok(need_next_level)
+}
+
+/// Blackboard that keeps track of the state of all the jobs and work remaining
+struct LevelCompactionState<'a, E>
+where
+    E: CompactionJobExecutor,
+{
+    // parameters
+    target_file_size: u64,
+
+    _lsn_range: Range<Lsn>,
+    layers: Vec<LayerFragment<E>>,
+
+    // job queue
+    jobs: Vec<CompactionJob<E>>,
+    job_queue: Vec<JobId>,
+
+    /// If false, no need to compact levels below this
+    next_level: bool,
+
+    /// Interface to the outside world
+    executor: &'a mut E,
+}
+
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
+struct LayerId(usize);
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
+struct JobId(usize);
+
+struct PendingJobSet {
+    pending: HashSet<JobId>,
+    completed: HashSet<JobId>,
+}
+
+impl PendingJobSet {
+    fn new() -> Self {
+        PendingJobSet {
+            pending: HashSet::new(),
+            completed: HashSet::new(),
+        }
+    }
+
+    fn complete_job(&mut self, job_id: JobId) {
+        self.pending.remove(&job_id);
+        self.completed.insert(job_id);
+    }
+
+    fn all_completed(&self) -> bool {
+        self.pending.is_empty()
+    }
+}
+
+// When we decide to rewrite a set of layers, LayerFragment is used to keep
+// track which new layers supersede an old layer. When all the stakeholder jobs
+// have completed, this layer can be deleted.
+struct LayerFragment<E>
+where
+    E: CompactionJobExecutor,
+{
+    layer: E::Layer,
+
+    // If we will write new layers to replace this one, this keeps track of the
+    // jobs that need to complete before this layer can be deleted. As the jobs
+    // complete, they are moved from 'pending' to 'completed' set. Once the
+    // 'pending' set becomes empty, the layer can be deleted.
+    //
+    // If None, this layer is not rewritten and must not be deleted.
+    deletable_after: Option<PendingJobSet>,
+
+    deleted: bool,
+}
+
+impl<E> LayerFragment<E>
+where
+    E: CompactionJobExecutor,
+{
+    fn new(layer: E::Layer) -> Self {
+        LayerFragment {
+            layer,
+            deletable_after: None,
+            deleted: false,
+        }
+    }
+}
+
+#[derive(PartialEq)]
+enum CompactionStrategy {
+    Divide,
+    CreateDelta,
+    CreateImage,
+}
+
+#[allow(dead_code)] // Todo
+struct CompactionJob<E: CompactionJobExecutor> {
+    key_range: Range<E::Key>,
+    lsn_range: Range<Lsn>,
+
+    strategy: CompactionStrategy,
+
+    input_layers: Vec<LayerId>,
+
+    completed: bool,
+}
+
+impl<'a, E> LevelCompactionState<'a, E>
+where
+    E: CompactionJobExecutor,
+{
+    /// Main loop of the executor.
+    ///
+    /// In each iteration, we take the next job from the queue, and execute it.
+    /// The execution might add new jobs to the queue. Keep going until the
+    /// queue is empty.
+    ///
+    /// Initially, the job queue consists of one Divide job over the whole
+    /// level. On first call, it is divided into smaller jobs.
+    async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> {
+        // TODO: this would be pretty straightforward to parallelize with FuturesUnordered
+        while let Some(next_job_id) = self.job_queue.pop() {
+            info!("executing job {}", next_job_id.0);
+            self.execute_job(next_job_id, ctx).await?;
+        }
+
+        // all done!
+        Ok(())
+    }
+
+    /// Executes the job identified by `job_id` according to its strategy.
+    ///
+    /// - `Divide`: delegates to `divide_job`.
+    /// - `CreateDelta`: asks the executor to write a new delta layer from those
+    ///   input layers that are delta layers, marks the job completed, and
+    ///   deletes every such input layer whose pending job set is now complete.
+    /// - `CreateImage`: asks the executor to write an image layer at the end
+    ///   of the job's LSN range and marks the job completed.
+    ///
+    /// Any error returned by the executor is propagated to the caller.
+    async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
+        let job = &self.jobs[job_id.0];
+        match job.strategy {
+            CompactionStrategy::Divide => {
+                self.divide_job(job_id, ctx).await?;
+                Ok(())
+            }
+            CompactionStrategy::CreateDelta => {
+                // Collect the inputs that really are delta layers; remember
+                // their ids so that the deletion tracking below only touches
+                // layers that were fed into `create_delta`.
+                let mut deltas: Vec<E::DeltaLayer> = Vec::new();
+                let mut layer_ids: Vec<LayerId> = Vec::new();
+                for layer_id in &job.input_layers {
+                    let layer = &self.layers[layer_id.0].layer;
+                    if let Some(dl) = self.executor.downcast_delta_layer(layer).await? {
+                        deltas.push(dl.clone());
+                        layer_ids.push(*layer_id);
+                    }
+                }
+
+                self.executor
+                    .create_delta(&job.lsn_range, &job.key_range, &deltas, ctx)
+                    .await?;
+                self.jobs[job_id.0].completed = true;
+
+                // did we complete any fragments?
+                for layer_id in layer_ids {
+                    let l = &mut self.layers[layer_id.0];
+                    // Layers without `deletable_after` are not being rewritten
+                    // and must be kept.
+                    if let Some(deletable_after) = l.deletable_after.as_mut() {
+                        deletable_after.complete_job(job_id);
+                        if deletable_after.all_completed() {
+                            self.executor.delete_layer(&l.layer, ctx).await?;
+                            l.deleted = true;
+                        }
+                    }
+                }
+
+                // NOTE(review): presumably tells the caller that layers were
+                // written for the next level -- confirm against the code that
+                // reads `next_level`.
+                self.next_level = true;
+
+                Ok(())
+            }
+            CompactionStrategy::CreateImage => {
+                self.executor
+                    .create_image(job.lsn_range.end, &job.key_range, ctx)
+                    .await?;
+                self.jobs[job_id.0].completed = true;
+
+                // TODO: we could check if any layers < PITR horizon became deletable
+                Ok(())
+            }
+        }
+    }
+
+    /// Registers `job` and schedules it for execution.
+    ///
+    /// The returned id is the job's index in `self.jobs`; the same id is
+    /// appended to `self.job_queue`.
+    fn push_job(&mut self, job: CompactionJob<E>) -> JobId {
+        let next_index = self.jobs.len();
+        self.jobs.push(job);
+
+        let id = JobId(next_index);
+        self.job_queue.push(id);
+        id
+    }
+
+    /// Take a partition of the key space, and decide how to compact it.
+    ///
+    /// The decision compares the size of the keyspace covered by the job with
+    /// the total file size of the job's delta input layers: if the keyspace
+    /// is smaller, the job is covered with image layers
+    /// (`cover_with_images`), otherwise the deltas are rewritten
+    /// (`retile_deltas`). A job without input layers is a no-op.
+    ///
+    /// TODO: Currently, this is called exactly once for the level, and we
+    /// decide whether to create new image layers to cover the whole level, or
+    /// write a new set of delta. In the future, this should try to partition
+    /// the key space, and make the decision separately for each partition.
+    async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
+        let job = &self.jobs[job_id.0];
+        assert!(job.strategy == CompactionStrategy::Divide);
+
+        // Check for dummy cases
+        if job.input_layers.is_empty() {
+            return Ok(());
+        }
+
+        // Would it be better to create images for this partition?
+        // Decide based on the average density of the level
+        //
+        // `keyspace_total_size` counts keys; the factor 8192 turns that into
+        // bytes (presumably the page size -- TODO confirm).
+        let keyspace_size = keyspace_total_size(
+            &self
+                .executor
+                .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
+                .await?,
+        ) * 8192;
+
+        // Only delta layers count towards the amount of WAL to rewrite.
+        let wal_size = job
+            .input_layers
+            .iter()
+            .filter(|layer_id| self.layers[layer_id.0].layer.is_delta())
+            .map(|layer_id| self.layers[layer_id.0].layer.file_size())
+            .sum::<u64>();
+        if keyspace_size < wal_size {
+            // seems worth it
+            info!(
+                "covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}",
+                keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size
+            );
+            self.cover_with_images(job_id, ctx).await
+        } else {
+            // do deltas
+            info!(
+                "coverage not worth it, keyspace_size {}, wal_size {}",
+                keyspace_size, wal_size
+            );
+            self.retile_deltas(job_id, ctx).await
+        }
+    }
+
+    // LSN
+    //  ^
+    //  |
+    //  |                          ###|###|#####
+    //  | +--+-----+--+            +--+-----+--+
+    //  | |  |     |  |            |  |     |  |
+    //  | +--+--+--+--+            +--+--+--+--+
+    //  | |     |     |            |     |     |
+    //  | +---+-+-+---+     ==>    +---+-+-+---+
+    //  | |   |   |   |            |   |   |   |
+    //  | +---+-+-++--+            +---+-+-++--+
+    //  | |     |  |  |            |     |  |  |
+    //  | +-----+--+--+            +-----+--+--+
+    //  |
+    //  +--------------> key
+    //
+    /// Queues `CreateImage` jobs for the given Divide job.
+    ///
+    /// The keyspace at the end of the job's LSN range is cut into consecutive
+    /// key ranges by a `KeyspaceWindow`, and one image job is queued per
+    /// range. No input layers are recorded for the image jobs.
+    async fn cover_with_images(
+        &mut self,
+        job_id: JobId,
+        ctx: &E::RequestContext,
+    ) -> anyhow::Result<()> {
+        let job = &self.jobs[job_id.0];
+        assert!(job.strategy == CompactionStrategy::Divide);
+
+        // XXX: do we still need the "holes" stuff?
+
+        // Slide a window through the keyspace
+        let keyspace = self
+            .executor
+            .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
+            .await?;
+        let mut window = KeyspaceWindow::new(
+            E::Key::MIN..E::Key::MAX,
+            keyspace,
+            self.target_file_size / 8192,
+        );
+
+        // Build all the jobs first and push them afterwards: `job` borrows
+        // `self.jobs`, which `push_job` needs to mutate.
+        let image_jobs: Vec<CompactionJob<E>> = std::iter::from_fn(|| window.choose_next_image())
+            .map(|key_range| CompactionJob::<E> {
+                key_range,
+                lsn_range: job.lsn_range.clone(),
+                strategy: CompactionStrategy::CreateImage,
+                input_layers: Vec::new(), // XXX: Is it OK for  this to be empty for image layer?
+                completed: false,
+            })
+            .collect();
+
+        // Pushed in reverse; presumably so that `job_queue.pop()` hands them
+        // out in ascending key order.
+        for image_job in image_jobs.into_iter().rev() {
+            let _job_id = self.push_job(image_job);
+
+            // TODO: image layers don't let us delete anything. unless < PITR horizon
+            //let j = &self.jobs[job_id.0];
+            // for layer_id in j.input_layers.iter() {
+            //    self.layers[layer_id.0].pending_stakeholders.insert(job_id);
+            //}
+        }
+
+        Ok(())
+    }
+
+    // Merge the contents of all the input delta layers into a new set
+    // of delta layers, based on the current partitioning.
+    //
+    // We split the new delta layers on the key dimension. We iterate through
+    // the key space, and for each key, check if including the next key to the
+    // current output layer we're building would cause the layer to become too
+    // large. If so, dump the current output layer and start new one.  It's
+    // possible that there is a single key with so many page versions that
+    // storing all of them in a single layer file would be too large. In that
+    // case, we also split on the LSN dimension.
+    //
+    // LSN
+    //  ^
+    //  |
+    //  | +-----------+            +--+--+--+--+
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            |  |  |  |  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+     ==>    |  |  |  |  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            |  |  |  |  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            +--+--+--+--+
+    //  |
+    //  +--------------> key
+    //
+    //
+    // If one key (X) has a lot of page versions:
+    //
+    // LSN
+    //  ^
+    //  |                                 (X)
+    //  | +-----------+            +--+--+--+--+
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            |  |  +--+  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+     ==>    |  |  |  |  |
+    //  | |           |            |  |  +--+  |
+    //  | +-----------+            |  |  |  |  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            +--+--+--+--+
+    //  |
+    //  +--------------> key
+    //
+    // TODO: this actually divides the layers into fixed-size chunks, not
+    // based on the partitioning.
+    //
+    // TODO: we should also opportunistically materialize and
+    // garbage collect what we can.
+    /// Queues `CreateDelta` jobs that rewrite the delta input layers of the
+    /// given Divide job, and sets up the tracking that lets each input layer
+    /// be deleted once all jobs reading from it have completed.
+    async fn retile_deltas(
+        &mut self,
+        job_id: JobId,
+        ctx: &E::RequestContext,
+    ) -> anyhow::Result<()> {
+        let job = &self.jobs[job_id.0];
+        assert!(job.strategy == CompactionStrategy::Divide);
+
+        // Sweep the key space left to right, running an estimate of how much
+        // disk size and keyspace we have accumulated
+        //
+        // Once the disk size reaches the target threshold, stop and think.
+        // If we have accumulated only a narrow band of keyspace, create an
+        // image layer. Otherwise write a delta layer.
+        //
+        // NOTE(review): the loop below only ever queues CreateDelta jobs; the
+        // image-layer alternative described above is not implemented here.
+
+        // FIXME: deal with the case of lots of values for same key
+
+        // FIXME: we are ignoring images here. Did we already divide the work
+        // so that we won't encounter them here?
+
+        // Only inputs that downcast to delta layers feed the key stream.
+        let mut deltas: Vec<E::DeltaLayer> = Vec::new();
+        for layer_id in &job.input_layers {
+            let l = &self.layers[layer_id.0];
+            if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? {
+                deltas.push(dl.clone());
+            }
+        }
+        // Open stream
+        let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
+        let mut new_jobs = Vec::new();
+
+        // Slide a window through the keyspace
+        let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
+        // Becomes true once the key stream is exhausted, i.e. everything that
+        // is left to process is already in the window.
+        let mut all_in_window: bool = false;
+        let mut window = Window::new();
+        loop {
+            if all_in_window && window.elems.is_empty() {
+                // All done!
+                break;
+            }
+            if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
+            {
+                // The new delta reads from every input layer whose key range
+                // overlaps the chosen range.
+                let batch_layers: Vec<LayerId> = job
+                    .input_layers
+                    .iter()
+                    .filter(|layer_id| {
+                        overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
+                    })
+                    .cloned()
+                    .collect();
+                assert!(!batch_layers.is_empty());
+                new_jobs.push(CompactionJob {
+                    key_range,
+                    lsn_range: job.lsn_range.clone(),
+                    strategy: CompactionStrategy::CreateDelta,
+                    input_layers: batch_layers,
+                    completed: false,
+                });
+            } else {
+                // The window asked for more input; that must not happen once
+                // the stream has been drained.
+                assert!(!all_in_window);
+                if let Some(next_key) = key_accum.next().await.transpose()? {
+                    window.feed(next_key.key, next_key.size);
+                } else {
+                    all_in_window = true;
+                }
+            }
+        }
+
+        // All the input files are rewritten. Set up the tracking for when they can
+        // be deleted.
+        for layer_id in job.input_layers.iter() {
+            let l = &mut self.layers[layer_id.0];
+            assert!(l.deletable_after.is_none());
+            l.deletable_after = Some(PendingJobSet::new());
+        }
+        // Queue the jobs in reverse order and register each one as pending on
+        // all of its input layers.
+        for j in new_jobs.into_iter().rev() {
+            let job_id = self.push_job(j);
+            let j = &self.jobs[job_id.0];
+            for layer_id in j.input_layers.iter() {
+                self.layers[layer_id.0]
+                    .deletable_after
+                    .as_mut()
+                    .unwrap()
+                    .pending
+                    .insert(job_id);
+            }
+        }
+
+        Ok(())
+    }
+}
+
+// Sliding window through keyspace and values
+// This is used by cover_with_images to decide on good split points
+struct KeyspaceWindow<K> {
+    // Immutable description of what is being covered.
+    head: KeyspaceWindowHead<K>,
+
+    // Position where the next image range starts, i.e. the end of the range
+    // returned by the previous `choose_next_image` call.
+    start_pos: KeyspaceWindowPos<K>,
+}
+// The fixed inputs of a `KeyspaceWindow`; positions are advanced against it.
+struct KeyspaceWindowHead<K> {
+    // overall key range to cover
+    key_range: Range<K>,
+
+    // The key ranges that are in use, walked front to back by index.
+    keyspace: Vec<Range<K>>,
+    // Target size of one output range, in the units of `K::key_range_size`.
+    target_keysize: u64,
+}
+
+// A cursor into the keyspace of a `KeyspaceWindowHead`.
+#[derive(Clone)]
+struct KeyspaceWindowPos<K> {
+    // Key the cursor has advanced to (exclusive end of what has been consumed).
+    end_key: K,
+
+    // Index of the keyspace range the cursor is currently in;
+    // equal to `keyspace.len()` once the end has been reached.
+    keyspace_idx: usize,
+
+    // Total key size consumed so far, accumulated from the very beginning.
+    accum_keysize: u64,
+}
+impl<K: CompactionKey> KeyspaceWindowPos<K> {
+    /// True once the cursor has moved past the last range of the keyspace.
+    fn reached_end(&self, w: &KeyspaceWindowHead<K>) -> bool {
+        w.keyspace.len() == self.keyspace_idx
+    }
+
+    /// Advance the cursor until the accumulated key size reaches `max_size`,
+    /// or the end of the keyspace is reached, whichever comes first.
+    fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
+        loop {
+            if self.accum_keysize >= max_size || self.reached_end(w) {
+                return;
+            }
+
+            let range = &w.keyspace[self.keyspace_idx];
+            // skip over any unused space
+            if self.end_key < range.start {
+                self.end_key = range.start;
+            }
+
+            // We're now within 'range'. Can we advance past it completely?
+            let rest_of_range = K::key_range_size(&(self.end_key..range.end)) as u64;
+            if self.accum_keysize + rest_of_range < max_size {
+                // oh yeah, it fits
+                self.end_key = range.end;
+                self.keyspace_idx += 1;
+                self.accum_keysize += rest_of_range;
+                continue;
+            }
+
+            // advance within the range: take a bigger jump if it still fits,
+            // otherwise move on by a single key
+            let jump_target = self.end_key.skip_some();
+            let jump_size = K::key_range_size(&(self.end_key..jump_target)) as u64;
+            if self.accum_keysize + jump_size < max_size {
+                self.end_key = jump_target;
+                self.accum_keysize += jump_size;
+            } else {
+                self.end_key = self.end_key.next();
+                self.accum_keysize += 1;
+            }
+        }
+    }
+}
+
+impl<K> KeyspaceWindow<K>
+where
+    K: CompactionKey,
+{
+    /// Creates a window over `keyspace`, positioned at the start of
+    /// `key_range`.
+    ///
+    /// `target_keysize` is the desired size of each range returned by
+    /// `choose_next_image`. An empty `keyspace` is accepted; in that case
+    /// `choose_next_image` returns `None` on the first call.
+    ///
+    /// Panics if the first keyspace range starts before `key_range.start`.
+    fn new(key_range: Range<K>, keyspace: CompactionKeySpace<K>, target_keysize: u64) -> Self {
+        // Only check the lower bound when there is a range to check; an
+        // empty keyspace must not panic here.
+        if let Some(first) = keyspace.first() {
+            assert!(first.start >= key_range.start);
+        }
+
+        let start_key = key_range.start;
+        let start_pos = KeyspaceWindowPos::<K> {
+            end_key: start_key,
+            keyspace_idx: 0,
+            accum_keysize: 0,
+        };
+        Self {
+            head: KeyspaceWindowHead::<K> {
+                key_range,
+                keyspace,
+                target_keysize,
+            },
+            start_pos,
+        }
+    }
+
+    /// Returns the key range of the next image layer, or `None` when the
+    /// whole keyspace has been handed out.
+    ///
+    /// Each returned range starts where the previous one ended and covers
+    /// about `target_keysize` worth of keys. If stretching the range to 1.25x
+    /// the target would reach the end of the keyspace, the range is extended
+    /// all the way to `key_range.end` instead, to avoid a small tail range.
+    fn choose_next_image(&mut self) -> Option<Range<K>> {
+        if self.start_pos.keyspace_idx == self.head.keyspace.len() {
+            // we've reached the end
+            return None;
+        }
+
+        let mut next_pos = self.start_pos.clone();
+        next_pos.advance_until_size(
+            &self.head,
+            self.start_pos.accum_keysize + self.head.target_keysize,
+        );
+
+        // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
+        // 1.25x target size
+        let mut end_pos = next_pos.clone();
+        end_pos.advance_until_size(
+            &self.head,
+            self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
+        );
+        if end_pos.reached_end(&self.head) {
+            // gobble up any unused keyspace between the last used key and end of the range
+            assert!(end_pos.end_key <= self.head.key_range.end);
+            end_pos.end_key = self.head.key_range.end;
+            next_pos = end_pos;
+        }
+
+        let start_key = self.start_pos.end_key;
+        self.start_pos = next_pos;
+        Some(start_key..self.start_pos.end_key)
+    }
+}
+
+// Sliding window through keyspace and values
+//
+// This is used to decide what layer to write next, from the beginning of the window.
+//
+// Candidates:
+//
+// 1. Create an image layer, snapping to previous images
+// 2. Create a delta layer, snapping to previous images
+// 3. Create an image layer, snapping to
+//
+//
+
+// Take previous partitioning, based on the image layers below.
+//
+// Candidate is at the front:
+//
+// Consider stretching an image layer to next divider? If it's close enough,
+// that's the image candidate
+//
+// If it's too far, consider splitting at a reasonable point
+//
+// Is the image candidate smaller than the equivalent delta? If so,
+// split off the image. Otherwise, split off one delta.
+// Try to snap off the delta at a reasonable point
+
+// One entry of a `Window`: a run of keys together with the running total of
+// the value sizes fed into the window up to and including this run.
+struct WindowElement<K> {
+    start_key: K, // inclusive
+    last_key: K,  // inclusive
+    // Cumulative size since the window was created (not just this element);
+    // subtract `Window::splitoff_size` to get the size still in the window.
+    accum_size: u64,
+}
+// Sliding window over (key, size) pairs, used by `retile_deltas` to pick the
+// key ranges of the new delta layers.
+struct Window<K> {
+    // Elements currently in the window, in ascending key order.
+    elems: VecDeque<WindowElement<K>>,
+
+    // last key that was split off, inclusive
+    splitoff_key: Option<K>,
+    // `accum_size` of the last element that was split off; 0 initially.
+    splitoff_size: u64,
+}
+
+impl<K> Window<K>
+where
+    K: CompactionKey,
+{
+    // Creates an empty window with nothing split off yet.
+    fn new() -> Self {
+        Self {
+            elems: VecDeque::new(),
+            splitoff_key: None,
+            splitoff_size: 0,
+        }
+    }
+
+    // Adds `size` bytes for `key` at the back of the window.
+    //
+    // Keys must be fed in non-decreasing order (asserted). Feeding the same
+    // key as the last one just grows the last element; a new key appends a
+    // single-key element that continues the running size total.
+    fn feed(&mut self, key: K, size: u64) {
+        let last_size;
+        if let Some(last) = self.elems.back_mut() {
+            assert!(last.last_key <= key);
+            if key == last.last_key {
+                last.accum_size += size;
+                return;
+            }
+            last_size = last.accum_size;
+        } else {
+            last_size = 0;
+        }
+        // This is a new key.
+        let elem = WindowElement {
+            start_key: key,
+            last_key: key,
+            accum_size: last_size + size,
+        };
+        self.elems.push_back(elem);
+    }
+
+    // Total size of everything still in the window.
+    // Panics if the window is empty.
+    fn remain_size(&self) -> u64 {
+        self.elems.back().unwrap().accum_size - self.splitoff_size
+    }
+
+    // Size of the front element, i.e. of the current candidate.
+    // Panics if the window is empty.
+    fn peek_size(&self) -> u64 {
+        self.elems.front().unwrap().accum_size - self.splitoff_size
+    }
+
+    // Merges the first `upto` elements into a single front element: each
+    // popped element passes its start key on to the new front, so the front
+    // ends up spanning all of them. `upto` <= 1 is a no-op.
+    fn commit_upto(&mut self, mut upto: usize) {
+        while upto > 1 {
+            let popped = self.elems.pop_front().unwrap();
+            self.elems.front_mut().unwrap().start_key = popped.start_key;
+            upto -= 1;
+        }
+    }
+
+    // Returns the index of the first element at which the size remaining in
+    // the window reaches `target_size` (or `elems.len()` if none does).
+    fn find_size_split(&self, target_size: u64) -> usize {
+        self.elems
+            .partition_point(|elem| elem.accum_size - self.splitoff_size < target_size)
+    }
+
+    // Removes the front element and records it as the latest split-off point.
+    fn pop(&mut self) {
+        let first = self.elems.pop_front().unwrap();
+        self.splitoff_size = first.accum_size;
+
+        self.splitoff_key = Some(first.last_key);
+    }
+
+    // the difference between delta and image is that an image covers
+    // any unused keyspace before and after, while a delta tries to
+    // minimize that. TODO: difference not implemented
+    //
+    // Pops the front element and returns its key range, with the inclusive
+    // `last_key` turned into an exclusive end via `next()`.
+    fn pop_delta(&mut self) -> Range<K> {
+        let first = self.elems.front().unwrap();
+        let key_range = first.start_key..first.last_key.next();
+
+        self.pop();
+        key_range
+    }
+
+    // Prerequisite: we have enough input in the window
+    //
+    // On return None, the caller should feed more data and call again
+    //
+    // `has_more` tells whether the caller still has input to feed. With
+    // `has_more == false` the window must not be empty: `peek_size` would
+    // panic. (`retile_deltas` checks for that before calling.)
+    fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option<Range<K>> {
+        if has_more && self.elems.is_empty() {
+            // Starting up
+            return None;
+        }
+
+        // If we still have an undersized candidate, just keep going
+        while self.peek_size() < target_size {
+            if self.elems.len() > 1 {
+                self.commit_upto(2);
+            } else if has_more {
+                return None;
+            } else {
+                break;
+            }
+        }
+
+        // Ensure we have enough input in the window to make a good decision
+        if has_more && self.remain_size() < target_size * 5 / 4 {
+            return None;
+        }
+
+        // The candidate on the front is now large enough, for a delta.
+        // And we have enough data in the window to decide.
+
+        // If we're willing to stretch it up to 1.25 target size, could we
+        // gobble up the rest of the work? This avoids creating very small
+        // "tail" layers at the end of the keyspace
+        //
+        // NOTE(review): the comment says 1.25x, but the threshold below is
+        // 5/3 (about 1.67x) of the target size -- confirm which is intended.
+        if !has_more && self.remain_size() < target_size * 5 / 3 {
+            self.commit_upto(self.elems.len());
+        } else {
+            let delta_split_at = self.find_size_split(target_size);
+            self.commit_upto(delta_split_at);
+
+            // If it's still not large enough, request the caller to fill the window
+            if self.elems.len() == 1 && has_more {
+                return None;
+            }
+        }
+        Some(self.pop_delta())
+    }
+}
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -0,0 +1,243 @@
+//! This file contains generic utility functions over the interface types,
+//! which could be handy for any compaction implementation.
+use crate::interface::*;
+
+use futures::future::BoxFuture;
+use futures::{Stream, StreamExt};
+use itertools::Itertools;
+use pin_project_lite::pin_project;
+use std::cmp::Ord;
+use std::collections::BinaryHeap;
+use std::collections::VecDeque;
+use std::future::Future;
+use std::ops::{DerefMut, Range};
+use std::pin::Pin;
+use std::task::{ready, Poll};
+
+/// Sums up `K::key_range_size` over all ranges of `keyspace`.
+pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
+where
+    K: CompactionKey,
+{
+    let mut total: u64 = 0;
+    for range in keyspace.iter() {
+        total += K::key_range_size(range) as u64;
+    }
+    total
+}
+
+/// Returns true if the half-open ranges `a` and `b` have a point in common,
+/// i.e. each one ends after the other one starts.
+pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
+    a.end > b.start && b.end > a.start
+}
+
+/// Replaces `a` with the union of `a` and `b`.
+///
+/// The two inputs are merged by range start, and ranges that overlap or
+/// touch are coalesced into one.
+pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
+    let lhs = std::mem::take(a);
+    let mut by_start = [lhs.into_iter(), b.into_iter()]
+        .into_iter()
+        .kmerge_by(|x, y| x.start < y.start);
+
+    let mut merged = Vec::new();
+    if let Some(mut current) = by_start.next() {
+        for r in by_start {
+            assert!(r.start >= current.start);
+            if r.start > current.end {
+                // gap: the range built so far is finished, start a new one
+                merged.push(std::mem::replace(&mut current, r));
+            } else if r.end > current.end {
+                // overlapping or adjacent: extend the current range
+                current.end = r.end;
+            }
+        }
+        merged.push(current);
+    }
+    *a = merged
+}
+
+/// Returns the part of keyspace `a` that lies within range `r`.
+///
+/// Ranges ending at or before `r.start` are skipped, and the scan stops at
+/// the first range starting at or after `r.end`. The first and last of the
+/// remaining ranges are clipped to `r`.
+pub fn intersect_keyspace<K: Ord + Clone + Copy>(
+    a: &CompactionKeySpace<K>,
+    r: &Range<K>,
+) -> CompactionKeySpace<K> {
+    let mut clipped: Vec<Range<K>> = a
+        .iter()
+        .filter(|x| x.end > r.start)
+        .take_while(|x| x.start < r.end)
+        .cloned()
+        .collect();
+
+    // trim the ends
+    if let Some(head) = clipped.first_mut() {
+        head.start = head.start.max(r.start);
+    }
+    if let Some(tail) = clipped.last_mut() {
+        tail.end = tail.end.min(r.end);
+    }
+    clipped
+}
+
+/// Create a stream that iterates through all DeltaEntrys among all input
+/// layers, in key-lsn order.
+///
+/// This is public because the create_delta() implementation likely wants to use this too
+/// TODO: move to a more shared place
+pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
+    layers: &'a [E::DeltaLayer],
+    ctx: &'a E::RequestContext,
+) -> MergeDeltaKeys<'a, E> {
+    // The layers are merged with a binary heap. Every input layer starts out
+    // as a LazyLoadLayer::Unloaded element, keyed by the start of the layer's
+    // key range. Only when a layer first surfaces at the top of the heap are
+    // its keys loaded into a sorted vector.
+    //
+    // That keeps memory usage in check: only the DeltaEntrys of layers that
+    // overlap with the "current" key have to be held in memory.
+    let mut pending: BinaryHeap<LazyLoadLayer<'a, E>> = BinaryHeap::new();
+    layers
+        .iter()
+        .for_each(|layer| pending.push(LazyLoadLayer::Unloaded(layer)));
+
+    MergeDeltaKeys {
+        load_future: None,
+        ctx,
+        heap: pending,
+    }
+}
+
+// Heap element of `MergeDeltaKeys`: an input layer whose entries are loaded
+// only once it reaches the top of the heap.
+enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
+    // Entries of the layer that have not been returned yet; the front entry
+    // supplies the heap key.
+    Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
+    // Layer whose keys have not been loaded; the start of its key range
+    // supplies the heap key.
+    Unloaded(&'a E::DeltaLayer),
+}
+impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
+    /// The key this element is ordered by in the heap: the start of the key
+    /// range for an unloaded layer, the key of the next entry for a loaded
+    /// one. Panics if a loaded layer has no entries left.
+    fn key(&self) -> E::Key {
+        match self {
+            Self::Unloaded(layer) => layer.key_range().start,
+            Self::Loaded(queue) => {
+                let head = queue.front().unwrap();
+                head.key()
+            }
+        }
+    }
+}
+// Equality and ordering of heap elements look at `key()` only.
+impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
+    fn eq(&self, other: &Self) -> bool {
+        self.key() == other.key()
+    }
+}
+impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
+impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(Ord::cmp(self, other))
+    }
+}
+impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        // BinaryHeap is a max-heap; invert the comparison so that the
+        // smallest key ends up on top
+        self.key().cmp(&other.key()).reverse()
+    }
+}
+
+// Boxed future resolving to the loaded entries of one layer (or an error).
+// Note that `E` here is the entry type, not the executor.
+type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
+
+// Stream returned by `merge_delta_keys`
+pin_project! {
+#[allow(clippy::type_complexity)]
+pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> {
+    // Input layers, ordered so that the one with the smallest key is on top.
+    heap: BinaryHeap<LazyLoadLayer<'a, E>>,
+
+    // In-progress load of the layer on top of the heap, if any.
+    #[pin]
+    load_future: Option<LoadFuture<'a, <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>,
+
+    // Context passed on to `load_keys`.
+    ctx: &'a E::RequestContext,
+}
+}
+
+impl<'a, E> Stream for MergeDeltaKeys<'a, E>
+where
+    E: CompactionJobExecutor + 'a,
+{
+    type Item = anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>;
+
+    /// Yields the next entry of the layer on top of the heap, loading the
+    /// layer's keys first if that has not happened yet.
+    ///
+    /// Returns `Poll::Pending` while a load is in progress, `Some(Err(..))`
+    /// if a load fails, and `None` once all layers are exhausted. After a
+    /// failed load the layer is still unloaded, so polling again starts a
+    /// new load for it.
+    fn poll_next(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<std::option::Option<<Self as futures::Stream>::Item>> {
+        let mut this = self.project();
+        loop {
+            if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() {
+                // We are waiting for loading the keys to finish
+                match ready!(load_future.as_mut().poll(cx)) {
+                    Ok(entries) => {
+                        this.load_future.set(None);
+                        *this.heap.peek_mut().unwrap() =
+                            LazyLoadLayer::Loaded(VecDeque::from(entries));
+                    }
+                    Err(e) => {
+                        // The future has completed; drop it so that a later
+                        // poll of this stream does not poll it again, which
+                        // would violate the `Future` contract.
+                        this.load_future.set(None);
+                        return Poll::Ready(Some(Err(e)));
+                    }
+                }
+            }
+
+            // If the topmost layer in the heap hasn't been loaded yet, start
+            // loading it. Otherwise return the next entry from it and update
+            // the layer's position in the heap (this decreaseKey operation is
+            // performed implicitly when `top` is dropped).
+            if let Some(mut top) = this.heap.peek_mut() {
+                match top.deref_mut() {
+                    LazyLoadLayer::Unloaded(ref mut l) => {
+                        let fut = l.load_keys(this.ctx);
+                        this.load_future.set(Some(fut));
+                        continue;
+                    }
+                    LazyLoadLayer::Loaded(ref mut entries) => {
+                        let result = entries.pop_front().unwrap();
+                        if entries.is_empty() {
+                            // Layer exhausted: take it out of the heap.
+                            std::collections::binary_heap::PeekMut::pop(top);
+                        }
+                        return Poll::Ready(Some(Ok(result)));
+                    }
+                }
+            } else {
+                return Poll::Ready(None);
+            }
+        }
+    }
+}
+
+// Accumulate values at key boundaries
+//
+// One item of the stream produced by `accum_key_values`: the totals for a
+// run of consecutive entries that share the same key.
+pub struct KeySize<K> {
+    pub key: K,
+    // Number of entries in the run.
+    pub num_values: u64,
+    // Sum of the entries' `size()`.
+    pub size: u64,
+}
+
+/// Collapses a stream of delta entries into one `KeySize` per run of
+/// consecutive entries with the same key.
+///
+/// Only adjacent entries are compared, so the input has to be grouped by key
+/// for the totals to be per-key. An error from the input ends the output
+/// stream with that error; an empty input yields an empty stream.
+pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
+where
+    K: Eq,
+    I: Stream<Item = Result<D, E>>,
+    D: CompactionDeltaEntry<'a, K>,
+{
+    async_stream::try_stream! {
+        // Initialize the state from the first value
+        let mut input = std::pin::pin!(input);
+
+        if let Some(first) = input.next().await {
+            let first = first?;
+            let mut accum: KeySize<K> = KeySize {
+                key: first.key(),
+                num_values: 1,
+                size: first.size(),
+            };
+            while let Some(this) = input.next().await {
+                let this = this?;
+                if this.key() == accum.key {
+                    // Same key as before: keep accumulating
+                    accum.size += this.size();
+                    accum.num_values += 1;
+                } else {
+                    // Key boundary: emit the finished run and start a new one
+                    yield accum;
+                    accum = KeySize {
+                        key: this.key(),
+                        num_values: 1,
+                        size: this.size(),
+                    };
+                }
+            }
+            // Emit the last run
+            yield accum;
+        }
+    }
+}
--- a/pageserver/compaction/src/identify_levels.rs
+++ b/pageserver/compaction/src/identify_levels.rs
@@ -0,0 +1,376 @@
+//! An LSM tree consists of multiple levels, each exponentially larger than the
+//! previous level. And each level consists of multiple "tiers". With tiered
+//! compaction, a level is compacted when it has accumulated more than N tiers,
+//! forming one tier on the next level.
+//!
+//! In the pageserver, we don't explicitly track the levels and tiers. Instead,
+//! we identify them by looking at the shapes of the layers. It's an easy task
+//! for a human, but it's not straightforward to come up with the exact
+//! rules. Especially if there are cases like interrupted, half-finished
+//! compactions, or highly skewed data distributions that have let us "skip"
+//! some levels. It's not critical to classify all cases correctly; at worst we
+//! delay some compaction work, and suffer from more read amplification, or we
+//! perform some unnecessary compaction work.
+//!
+//! `identify_level` performs that shape-matching.
+//!
+//! It returns a Level struct, which has `depth()` function to count the number
+//! of "tiers" in the level. The tier count is the max depth of stacked layers
+//! within the level. That's a good measure, because the point of compacting is
+//! to reduce read amplification, and the depth is what determines that.
+//!
+//! One interesting effect of this is that if we generate very small delta
+//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
+//! because they reach the target size, the L0 compaction will combine them to
+//! one larger file. But if the combined file is still smaller than the target
+//! file size, the file will still be considered to be part of L0 at the next
+//! iteration.
+
+use anyhow::bail;
+use std::collections::BTreeSet;
+use std::ops::Range;
+use utils::lsn::Lsn;
+
+use crate::interface::*;
+
+use tracing::{info, trace};
+
+/// A group of layers that `identify_level` classified as one level.
+pub struct Level<L> {
+    /// LSN range of the level. `identify_level` sets the end to the `end_lsn`
+    /// it was called with, and the start to the boundary it identified.
+    pub lsn_range: Range<Lsn>,
+    /// The layers that were accepted into this level.
+    pub layers: Vec<L>,
+}
+
+/// Identify an LSN < `end_lsn` that partitions the LSN space, so that there are
+/// no delta layers that cross the boundary LSN. The layers between that
+/// boundary and `end_lsn` form the returned level.
+///
+/// A further restriction is that all delta layers in the returned partition
+/// cover at most 'lsn_max_size' LSN bytes. Image layers restrict neither the
+/// choice of boundary nor the size check.
+///
+/// Returns `Ok(None)` if there are no layers below `end_lsn`, or if no
+/// non-empty level could be formed. Returns an error if the LSN range of some
+/// layer straddles `end_lsn`, i.e. `end_lsn` itself is not a valid boundary.
+pub async fn identify_level<K, L>(
+    all_layers: Vec<L>,
+    end_lsn: Lsn,
+    lsn_max_size: u64,
+) -> anyhow::Result<Option<Level<L>>>
+where
+    K: CompactionKey,
+    L: CompactionLayer<K> + Clone,
+{
+    // filter out layers that are above the `end_lsn`, they are completely irrelevant.
+    let mut layers = Vec::new();
+    for l in all_layers {
+        if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
+            // shouldn't happen. Indicates that the caller passed a bogus
+            // end_lsn.
+            bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
+        }
+        // include image layers sitting exactly at `end_lsn`.
+        let is_image = !l.is_delta();
+        if (is_image && l.lsn_range().start > end_lsn)
+            || (!is_image && l.lsn_range().start >= end_lsn)
+        {
+            continue;
+        }
+        layers.push(l);
+    }
+    // All the remaining layers either belong to this level, or are below it.
+    info!(
+        "identify level at {}, size {}, num layers below: {}",
+        end_lsn,
+        lsn_max_size,
+        layers.len()
+    );
+    if layers.is_empty() {
+        return Ok(None);
+    }
+
+    // Walk the ranges in LSN order.
+    //
+    // ----- end_lsn
+    //  |
+    //  |
+    //  v
+    //
+    // The layers are sorted by their end LSN and consumed from the back, so
+    // the walk proceeds from the newest layer towards older ones.
+    layers.sort_by_key(|l| l.lsn_range().end);
+    let mut candidate_start_lsn = end_lsn;
+    let mut candidate_layers: Vec<L> = Vec::new();
+    let mut current_best_start_lsn = end_lsn;
+    let mut current_best_layers: Vec<L> = Vec::new();
+    let mut iter = layers.into_iter();
+    loop {
+        let Some(l) = iter.next_back() else {
+            // Reached end. Accept the last candidate
+            current_best_start_lsn = candidate_start_lsn;
+            // Move the layers instead of cloning them; `append` leaves
+            // `candidate_layers` empty.
+            current_best_layers.append(&mut candidate_layers);
+            break;
+        };
+        trace!(
+            "inspecting {} for candidate {}, current best {}",
+            l.short_id(),
+            candidate_start_lsn,
+            current_best_start_lsn
+        );
+
+        let r = l.lsn_range();
+
+        // Image layers don't restrict our choice of cutoff LSN
+        if l.is_delta() {
+            // Is this candidate workable? In other words, are there any
+            // delta layers that span across this LSN
+            //
+            // Valid:                 Not valid:
+            //  +                     +
+            //  |                     | +
+            //  +  <- candidate       + |   <- candidate
+            //     +                    +
+            //     |
+            //     +
+            if r.end <= candidate_start_lsn {
+                // Hooray, there are no crossing LSNs. And we have visited
+                // through all the layers within candidate..end_lsn. The
+                // current candidate can be accepted.
+                current_best_start_lsn = r.end;
+                // Move the accepted layers over; `candidate_layers` is left
+                // empty, ready to collect the next candidate.
+                current_best_layers.append(&mut candidate_layers);
+                candidate_start_lsn = r.start;
+            }
+
+            // Is it small enough to be considered part of this level?
+            if r.end.0 - r.start.0 > lsn_max_size {
+                // Too large, this layer belongs to next level. Stop.
+                // Whatever is still in `candidate_layers` was not accepted
+                // and is not part of the result.
+                trace!(
+                    "too large {}, size {} vs {}",
+                    l.short_id(),
+                    r.end.0 - r.start.0,
+                    lsn_max_size
+                );
+                break;
+            }
+
+            // If this crosses the candidate lsn, push it down.
+            if r.start < candidate_start_lsn {
+                trace!(
+                    "layer {} prevents from stopping at {}",
+                    l.short_id(),
+                    candidate_start_lsn
+                );
+                candidate_start_lsn = r.start;
+            }
+        }
+
+        // Include this layer in our candidate
+        candidate_layers.push(l);
+    }
+
+    Ok(if current_best_start_lsn == end_lsn {
+        // empty level
+        None
+    } else {
+        Some(Level {
+            lsn_range: current_best_start_lsn..end_lsn,
+            layers: current_best_layers,
+        })
+    })
+}
+
+// helper struct used in depth()
+//
+// Each layer contributes two events to the sweep over the key space: one at
+// the start of its key range and one at the end.
+struct Event<K> {
+    // Key at which the event happens
+    key: K,
+    // Index of the layer in `Level::layers`
+    layer_idx: usize,
+    // true for the start of the layer's key range, false for its end
+    start: bool,
+}
+
+impl<L> Level<L> {
+    /// Count the number of deltas stacked on each other.
+    ///
+    /// The result is the maximum, over the key space, of the number of delta
+    /// layers that lie on top of the highest image layer at that key.
+    pub fn depth<K>(&self) -> u64
+    where
+        K: CompactionKey,
+        L: CompactionLayer<K>,
+    {
+        // Each layer contributes two sweep events: one where its key range
+        // begins and one where it ends.
+        let mut events: Vec<Event<K>> = self
+            .layers
+            .iter()
+            .enumerate()
+            .flat_map(|(layer_idx, layer)| {
+                let keys = layer.key_range();
+                [
+                    Event {
+                        key: keys.start,
+                        layer_idx,
+                        start: true,
+                    },
+                    Event {
+                        key: keys.end,
+                        layer_idx,
+                        start: false,
+                    },
+                ]
+            })
+            .collect();
+        // `false` sorts before `true`, so at equal keys the end events come
+        // before the start events.
+        events.sort_by_key(|e| (e.key, e.start));
+
+        // Sweep the key space left to right. Stop at each distinct key, and
+        // count the number of deltas on top of the highest image at that key.
+        //
+        // This is a little inefficient, as we walk through the active set on
+        // every key. We could increment/decrement a counter on each step
+        // instead, but that'd require a bit more complex bookkeeping.
+        let mut active: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
+        let mut deepest: u64 = 0;
+        for (pos, event) in events.iter().enumerate() {
+            let layer = &self.layers[event.layer_idx];
+            let entry = (layer.lsn_range().end, !layer.is_delta(), event.layer_idx);
+
+            // update the active set
+            if event.start {
+                active.insert(entry);
+            } else {
+                active.remove(&entry);
+            }
+
+            // only recalculate the depth after the last event at this key
+            let key_continues = events
+                .get(pos + 1)
+                .map_or(false, |following| following.key == event.key);
+            if key_continues {
+                continue;
+            }
+
+            // Walk from the highest end LSN downwards and count entries until
+            // the first image.
+            let stacked = active
+                .iter()
+                .rev()
+                .take_while(|(_, is_image, _)| !*is_image)
+                .count() as u64;
+            deepest = deepest.max(stacked);
+        }
+        deepest
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
+    use std::sync::{Arc, Mutex};
+
+    // Build a mock delta layer covering the given key and LSN ranges.
+    fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
+        MockLayer::Delta(Arc::new(MockDeltaLayer {
+            key_range,
+            lsn_range,
+            // identify_level() doesn't pay attention to the rest of the fields
+            file_size: 0,
+            deleted: Mutex::new(false),
+            records: vec![],
+        }))
+    }
+
+    // Build a mock image layer covering the given key range. Its LSN range is
+    // the single LSN `lsn`, expressed as `lsn..lsn + 1`.
+    fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
+        MockLayer::Image(Arc::new(MockImageLayer {
+            key_range,
+            lsn_range: lsn..(lsn + 1),
+            // identify_level() doesn't pay attention to the rest of the fields
+            file_size: 0,
+            deleted: Mutex::new(false),
+        }))
+    }
+
+    // A simple stack of delta layers that all cover the whole key space.
+    #[tokio::test]
+    async fn test_identify_level() -> anyhow::Result<()> {
+        let layers = vec![
+            delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
+        ];
+
+        // All layers fit in the max file size
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
+            .await?
+            .unwrap();
+        assert_eq!(level.depth(), 6);
+
+        // Same LSN with smaller max file size. The second layer from the top is larger
+        // and belongs to next level.
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
+            .await?
+            .unwrap();
+        assert_eq!(level.depth(), 1);
+
+        // Call with a smaller LSN
+        let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
+            .await?
+            .unwrap();
+        assert_eq!(level.depth(), 2);
+
+        // Call with an LSN that doesn't partition the space
+        // (0x6000 falls inside the 0x5000..0x7000 layer)
+        let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
+        assert!(result.is_err());
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
+        // The files' LSN ranges overlap, so even though there are more files that
+        // fit under the file size, they are not included in the level because they
+        // overlap so that we'd need to include the oldest file, too, which is
+        // larger
+        let layers = vec![
+            delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
+            delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
+            delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
+            delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
+        ];
+
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
+            .await?
+            .unwrap();
+        assert_eq!(level.depth(), 1);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
+        // The key ranges don't overlap, so depth is only 1.
+        let layers = vec![
+            delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
+            delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
+            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
+        ];
+
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
+            .await?
+            .unwrap();
+        assert_eq!(level.layers.len(), 3);
+        assert_eq!(level.depth(), 1);
+
+        // Staggered. The 1st and 3rd layer don't overlap with each other.
+        let layers = vec![
+            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
+            delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
+            delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
+        ];
+
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
+            .await?
+            .unwrap();
+        assert_eq!(level.layers.len(), 3);
+        assert_eq!(level.depth(), 2);
+        Ok(())
+    }
+
+    // Same staggered deltas as above, plus an image layer on top of the middle
+    // key range, which hides the deltas below it from the depth count.
+    #[tokio::test]
+    async fn test_depth_images() -> anyhow::Result<()> {
+        let layers: Vec<MockLayer> = vec![
+            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
+            delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
+            delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
+            // This covers the same key range as the 2nd delta layer. The depth
+            // in that key range is therefore 0.
+            image(1500..2500, Lsn(0x9000)),
+        ];
+
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
+            .await?
+            .unwrap();
+        assert_eq!(level.layers.len(), 4);
+        assert_eq!(level.depth(), 1);
+        Ok(())
+    }
+}
--- a/pageserver/compaction/src/interface.rs
+++ b/pageserver/compaction/src/interface.rs
@@ -0,0 +1,167 @@
+//! This is what the compaction implementation needs to know about
+//! layers, keyspace etc.
+//!
+//! All the heavy lifting is done by the create_image and create_delta
+//! functions that the implementor provides.
+use async_trait::async_trait;
+use pageserver_api::{key::Key, keyspace::key_range_size};
+use std::ops::Range;
+use utils::lsn::Lsn;
+
+/// Public interface. This is the main thing that the implementor needs to provide
+#[async_trait]
+pub trait CompactionJobExecutor {
+    // Type system.
+    //
+    // We assume that there are two kinds of layers, deltas and images. The
+    // compaction doesn't distinguish whether they are stored locally or
+    // remotely.
+    //
+    // The keyspace is defined by CompactionKey trait.
+    //
+    type Key: CompactionKey;
+
+    // `Layer` is the common handle for any layer. `DeltaLayer` and
+    // `ImageLayer` are the kind-specific handles; a `Layer` is turned into a
+    // `DeltaLayer` with downcast_delta_layer().
+    type Layer: CompactionLayer<Self::Key> + Clone;
+    type DeltaLayer: CompactionDeltaLayer<Self> + Clone;
+    type ImageLayer: CompactionImageLayer<Self> + Clone;
+
+    // This is passed through to all the interface functions. The compaction
+    // implementation doesn't do anything with it, but it might be useful for
+    // the interface implementation.
+    type RequestContext: CompactionRequestContext;
+
+    // ----
+    // Functions that the planner uses to support its decisions
+    // ----
+
+    /// Return all layers that overlap the given bounding box.
+    async fn get_layers(
+        &mut self,
+        key_range: &Range<Self::Key>,
+        lsn_range: &Range<Lsn>,
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<Vec<Self::Layer>>;
+
+    /// Return the key space within `key_range`, as a list of key ranges.
+    ///
+    /// NOTE(review): presumably this is the key space as of `lsn`; the trait
+    /// itself does not spell that out -- confirm against the implementations.
+    async fn get_keyspace(
+        &mut self,
+        key_range: &Range<Self::Key>,
+        lsn: Lsn,
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<CompactionKeySpace<Self::Key>>;
+
+    /// NB: This is a pretty expensive operation. In the real pageserver
+    /// implementation, it downloads the layer, and keeps it resident
+    /// until the DeltaLayer is dropped.
+    ///
+    /// The `Option` in the return type leaves room for "not a delta layer";
+    /// the mock implementation returns `None` for image layers.
+    async fn downcast_delta_layer(
+        &self,
+        layer: &Self::Layer,
+    ) -> anyhow::Result<Option<Self::DeltaLayer>>;
+
+    // ----
+    // Functions to execute the plan
+    // ----
+
+    /// Create a new image layer, materializing all the values in the key range,
+    /// at given 'lsn'.
+    async fn create_image(
+        &mut self,
+        lsn: Lsn,
+        key_range: &Range<Self::Key>,
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<()>;
+
+    /// Create a new delta layer, containing all the values from 'input_layers'
+    /// in the given key and LSN range.
+    async fn create_delta(
+        &mut self,
+        lsn_range: &Range<Lsn>,
+        key_range: &Range<Self::Key>,
+        input_layers: &[Self::DeltaLayer],
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<()>;
+
+    /// Delete a layer. The compaction implementation will call this only after
+    /// all the create_image() or create_delta() calls that deletion of this
+    /// layer depends on have finished. But if the implementor has extra lazy
+    /// background tasks, like uploading the index json file to remote storage,
+    /// it is the implementation's responsibility to track those.
+    async fn delete_layer(
+        &mut self,
+        layer: &Self::Layer,
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<()>;
+}
+
+/// The key type that the compaction algorithm operates on. Keys are totally
+/// ordered, cheap to copy, and printable.
+pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
+    /// Smallest and largest key. The simulator uses `MIN..MAX` as the key
+    /// range of a layer that covers the whole key space.
+    const MIN: Self;
+    const MAX: Self;
+
+    /// Calculate distance between key_range.start and key_range.end.
+    ///
+    /// This returns u32, for compatibility with Repository::key. If the
+    /// distance is larger, return u32::MAX.
+    fn key_range_size(key_range: &Range<Self>) -> u32;
+
+    // return "self + 1"
+    fn next(&self) -> Self;
+
+    // return "self + <some decent amount to skip>". The amount to skip
+    // is left to the implementation.
+    // FIXME: why not just "add(u32)" ?  This is hard to use
+    fn skip_some(&self) -> Self;
+}
+
+// Implementation for the pageserver's real key type. Everything is delegated
+// to functionality from pageserver_api.
+impl CompactionKey for Key {
+    const MIN: Self = Self::MIN;
+    const MAX: Self = Self::MAX;
+
+    fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
+        // this is the free function imported from pageserver_api::keyspace
+        key_range_size(r)
+    }
+    fn next(&self) -> Key {
+        (self as &Key).next()
+    }
+    fn skip_some(&self) -> Key {
+        // the amount to skip is the implementation's choice; here it is 128
+        self.add(128)
+    }
+}
+
+/// Contiguous ranges of keys that belong to the key space. In key order, and
+/// with no overlap.
+pub type CompactionKeySpace<K> = Vec<Range<K>>;
+
+/// Functions needed from all layers.
+pub trait CompactionLayer<K: CompactionKey + ?Sized> {
+    /// Key range of the layer.
+    fn key_range(&self) -> &Range<K>;
+    /// LSN range of the layer.
+    fn lsn_range(&self) -> &Range<Lsn>;
+
+    /// Size of the layer file.
+    fn file_size(&self) -> u64;
+
+    /// For debugging, short human-readable representation of the layer. E.g. filename.
+    fn short_id(&self) -> String;
+
+    /// True for a delta layer, false for an image layer.
+    fn is_delta(&self) -> bool;
+}
+
+/// Additional functions needed from delta layers.
+#[async_trait]
+pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
+    /// The entry type returned by load_keys().
+    type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
+    where
+        Self: 'a;
+
+    /// Return all keys in this delta layer.
+    ///
+    /// NOTE(review): the `'a` lifetime parameter is declared but does not
+    /// appear in the parameter or return types; the returned entries use the
+    /// elided lifetime `'_` instead.
+    async fn load_keys<'a>(
+        &self,
+        ctx: &E::RequestContext,
+    ) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
+}
+
+/// Marker trait for image layers. It adds nothing on top of `CompactionLayer`.
+pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
+
+/// One entry of a delta layer, as returned by
+/// `CompactionDeltaLayer::load_keys`.
+pub trait CompactionDeltaEntry<'a, K> {
+    /// Key of the entry.
+    fn key(&self) -> K;
+    /// LSN of the entry.
+    fn lsn(&self) -> Lsn;
+    /// Size of the entry. `accum_key_values` sums this up per key.
+    fn size(&self) -> u64;
+}
+
+/// Marker trait for the context object that is passed through to the
+/// `CompactionJobExecutor` functions. It has no methods.
+pub trait CompactionRequestContext {}
--- a/pageserver/compaction/src/lib.rs
+++ b/pageserver/compaction/src/lib.rs
@@ -0,0 +1,12 @@
+// The main module implementing the compaction algorithm
+pub mod compact_tiered;
+pub(crate) mod identify_levels;
+
+// Traits that the caller of the compaction needs to implement
+pub mod interface;
+
+// Utility functions, useful for the implementation
+pub mod helpers;
+
+// A simulator with mock implementations of 'interface'
+pub mod simulator;
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -0,0 +1,613 @@
+mod draw;
+
+use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
+
+use async_trait::async_trait;
+use futures::StreamExt;
+use rand::Rng;
+use tracing::info;
+
+use utils::lsn::Lsn;
+
+use std::fmt::Write;
+use std::ops::Range;
+use std::sync::Arc;
+use std::sync::Mutex;
+
+use crate::helpers::{merge_delta_keys, overlaps_with};
+
+use crate::interface;
+use crate::interface::CompactionLayer;
+
+//
+// Implementation for the CompactionJobExecutor interface
+//
+pub struct MockTimeline {
+    // Parameters for the compaction algorithm
+    //
+    // `target_file_size` is also the flush threshold: ingest_record() flushes
+    // the in-memory layer once its accumulated length exceeds it.
+    pub target_file_size: u64,
+    tiers_per_level: u64,
+
+    // Number of L0 flushes so far, and the value of that counter when
+    // compact_if_needed() last ran a compaction.
+    num_l0_flushes: u64,
+    last_compact_at_flush: u64,
+    // `end_lsn` at the time of the last flush. Passed to the compaction.
+    last_flush_lsn: Lsn,
+
+    // In-memory layer
+    //
+    // `records` are the records ingested since the last flush, `total_len` is
+    // the sum of their lengths, and `start_lsn..end_lsn` is the LSN range they
+    // cover. `end_lsn` advances by the record length on every ingested record.
+    records: Vec<MockRecord>,
+    total_len: u64,
+    start_lsn: Lsn,
+    end_lsn: Lsn,
+
+    // Current keyspace at `end_lsn`. This is updated on every ingested record.
+    keyspace: KeySpace,
+
+    // historic keyspaces
+    old_keyspaces: Vec<(Lsn, KeySpace)>,
+
+    // "on-disk" layers
+    pub live_layers: Vec<MockLayer>,
+
+    num_deleted_layers: u64,
+
+    // Statistics
+    wal_ingested: u64,
+    bytes_written: u64,
+    bytes_deleted: u64,
+    layers_created: u64,
+    layers_deleted: u64,
+
+    // All the events - creation and deletion of files - are collected
+    // in 'history'. It is used to draw the SVG animation at the end.
+    //
+    // `time` is a logical clock; it is incremented for each recorded event.
+    time: u64,
+    history: Vec<draw::LayerTraceEvent>,
+}
+
+type KeySpace = interface::CompactionKeySpace<Key>;
+
+// The mock doesn't need any per-request state, so the context is empty.
+pub struct MockRequestContext {}
+impl interface::CompactionRequestContext for MockRequestContext {}
+
+pub type Key = u64;
+
+// In the simulator, keys are plain integers.
+impl interface::CompactionKey for Key {
+    const MIN: Self = u64::MIN;
+    const MAX: Self = u64::MAX;
+
+    fn key_range_size(key_range: &Range<Self>) -> u32 {
+        // Clamp the distance to u32::MAX. Assumes end >= start; the
+        // subtraction would underflow otherwise.
+        std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
+    }
+
+    fn next(&self) -> Self {
+        self + 1
+    }
+    fn skip_some(&self) -> Self {
+        // Skip ahead by a fixed 100 keys. Note that this does not round to a
+        // multiple of 100, and the addition overflows if the key is within
+        // 100 of u64::MAX.
+        self + 100
+    }
+}
+
+// One ingested record. Only the metadata is tracked, there is no payload.
+#[derive(Clone)]
+pub struct MockRecord {
+    // `end_lsn` of the timeline at the time the record was ingested
+    lsn: Lsn,
+    key: Key,
+    // length of the record; the timeline's `end_lsn` advances by this much
+    len: u64,
+}
+
+// A MockRecord serves directly as the delta entry type of MockDeltaLayer.
+impl interface::CompactionDeltaEntry<'_, Key> for MockRecord {
+    fn key(&self) -> Key {
+        self.key
+    }
+    fn lsn(&self) -> Lsn {
+        self.lsn
+    }
+    fn size(&self) -> u64 {
+        // the size of an entry is the record length
+        self.len
+    }
+}
+
+// Mock delta layer. It keeps the records themselves in memory.
+pub struct MockDeltaLayer {
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,
+
+    pub file_size: u64,
+
+    // Set by MockLayer::mark_deleted(). Layers with the flag set are dropped
+    // from `MockTimeline::live_layers` on the next get_layers() call.
+    pub deleted: Mutex<bool>,
+
+    // The records in the layer. flush_l0() stores them sorted by key.
+    pub records: Vec<MockRecord>,
+}
+
+// Layer metadata accessors for the mock delta layer.
+impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
+    fn key_range(&self) -> &Range<Key> {
+        &self.key_range
+    }
+
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.lsn_range
+    }
+
+    fn file_size(&self) -> u64 {
+        self.file_size
+    }
+
+    // Formatted as <key start>-<key end>__<LSN start>-<LSN end>, all in hex.
+    fn short_id(&self) -> String {
+        let (key_lo, key_hi) = (self.key_range.start, self.key_range.end);
+        let (lsn_lo, lsn_hi) = (self.lsn_range.start.0, self.lsn_range.end.0);
+        format!(
+            "{:016X}-{:016X}__{:08X}-{:08X}",
+            key_lo, key_hi, lsn_lo, lsn_hi
+        )
+    }
+
+    fn is_delta(&self) -> bool {
+        true
+    }
+}
+
+#[async_trait]
+impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
+    type DeltaEntry<'a> = MockRecord;
+
+    // Nothing needs to be loaded in the mock: return a copy of the records
+    // held in memory.
+    async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result<Vec<MockRecord>> {
+        Ok(self.records.clone())
+    }
+}
+
+// Mock image layer. Only the metadata is tracked, there are no page contents.
+pub struct MockImageLayer {
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,
+
+    pub file_size: u64,
+
+    // Set by MockLayer::mark_deleted(). Layers with the flag set are dropped
+    // from `MockTimeline::live_layers` on the next get_layers() call.
+    pub deleted: Mutex<bool>,
+}
+
+impl interface::CompactionImageLayer<MockTimeline> for Arc<MockImageLayer> {}
+
+// Layer metadata accessors for the mock image layer.
+impl interface::CompactionLayer<Key> for Arc<MockImageLayer> {
+    fn key_range(&self) -> &Range<Key> {
+        &self.key_range
+    }
+
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.lsn_range
+    }
+
+    fn file_size(&self) -> u64 {
+        self.file_size
+    }
+
+    // Formatted as <key start>-<key end>__<LSN>, all in hex. Only the start
+    // of the LSN range is printed.
+    fn short_id(&self) -> String {
+        let (key_lo, key_hi) = (self.key_range.start, self.key_range.end);
+        let lsn = self.lsn_range.start.0;
+        format!("{:016X}-{:016X}__{:08X}", key_lo, key_hi, lsn,)
+    }
+
+    fn is_delta(&self) -> bool {
+        false
+    }
+}
+
+impl MockTimeline {
+    /// Create an empty timeline with a 256 MB target file size and 4 tiers
+    /// per level. Ingestion starts at LSN 1000.
+    pub fn new() -> Self {
+        // The in-memory layer is empty at first, so its start and end LSN
+        // coincide.
+        let initial_lsn = Lsn(1000);
+
+        MockTimeline {
+            target_file_size: 256 * 1024 * 1024,
+            tiers_per_level: 4,
+
+            num_l0_flushes: 0,
+            last_compact_at_flush: 0,
+            last_flush_lsn: Lsn(0),
+
+            records: Vec::new(),
+            total_len: 0,
+            start_lsn: initial_lsn,
+            end_lsn: initial_lsn,
+            keyspace: KeySpace::new(),
+
+            old_keyspaces: vec![],
+
+            live_layers: vec![],
+
+            num_deleted_layers: 0,
+
+            wal_ingested: 0,
+            bytes_written: 0,
+            bytes_deleted: 0,
+            layers_created: 0,
+            layers_deleted: 0,
+
+            time: 0,
+            history: Vec::new(),
+        }
+    }
+
+    /// Run the tiered compaction algorithm on this timeline, up to the LSN of
+    /// the last flush.
+    pub async fn compact(&mut self) -> anyhow::Result<()> {
+        let ctx = MockRequestContext {};
+        let up_to_lsn = self.last_flush_lsn;
+        let file_size = self.target_file_size;
+        let fanout = self.tiers_per_level;
+
+        crate::compact_tiered::compact_tiered(self, up_to_lsn, file_size, fanout, &ctx).await?;
+
+        Ok(())
+    }
+
+    // Ingest one record to the timeline
+    //
+    // The record is stamped with the current end LSN, which then advances by
+    // `len`. The in-memory layer is flushed once it has grown past the target
+    // file size.
+    pub fn ingest_record(&mut self, key: Key, len: u64) {
+        let lsn = self.end_lsn;
+        self.records.push(MockRecord { lsn, key, len });
+        self.end_lsn += len;
+        self.total_len += len;
+
+        if self.total_len > self.target_file_size {
+            self.flush_l0();
+        }
+    }
+
+    /// Compact if at least `tiers_per_level` L0 flushes have happened since
+    /// the last compaction that this function started.
+    pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> {
+        let flushes_since_compaction = self.num_l0_flushes - self.last_compact_at_flush;
+        if flushes_since_compaction < self.tiers_per_level {
+            return Ok(());
+        }
+
+        self.compact().await?;
+        self.last_compact_at_flush = self.num_l0_flushes;
+        Ok(())
+    }
+
+    /// Write out the in-memory layer as a new L0 delta layer that covers the
+    /// whole key space. Does nothing if no records are buffered.
+    pub fn flush_l0(&mut self) {
+        if self.records.is_empty() {
+            return;
+        }
+
+        // Taking the buffered records and their total length also resets the
+        // in-memory layer: both are left at their empty defaults.
+        let mut sorted_records = std::mem::take(&mut self.records);
+        sorted_records.sort_by_key(|rec| rec.key);
+        let flushed_bytes = std::mem::take(&mut self.total_len);
+
+        let layer = Arc::new(MockDeltaLayer {
+            key_range: Key::MIN..Key::MAX,
+            lsn_range: self.start_lsn..self.end_lsn,
+            file_size: flushed_bytes,
+            records: sorted_records,
+            deleted: Mutex::new(false),
+        });
+        info!("flushed L0 layer {}", layer.short_id());
+        self.live_layers.push(MockLayer::Delta(Arc::clone(&layer)));
+
+        // The next in-memory layer starts where this one ended.
+        self.start_lsn = self.end_lsn;
+        self.last_flush_lsn = self.end_lsn;
+        self.num_l0_flushes += 1;
+
+        // statistics
+        self.layers_created += 1;
+        self.bytes_written += flushed_bytes;
+
+        // record the flush in the history
+        self.time += 1;
+        let file = LayerTraceFile {
+            filename: layer.short_id(),
+            key_range: layer.key_range.clone(),
+            lsn_range: layer.lsn_range.clone(),
+        };
+        self.history.push(LayerTraceEvent {
+            time_rel: self.time,
+            op: LayerTraceOp::Flush,
+            file,
+        });
+    }
+
+    // Ingest `num_records' records to the timeline, with random keys
+    // uniformly distributed in `key_range`
+    //
+    // Each record has length `len`. `key_range` is added to the current
+    // keyspace first.
+    pub fn ingest_uniform(
+        &mut self,
+        num_records: u64,
+        len: u64,
+        key_range: &Range<Key>,
+    ) -> anyhow::Result<()> {
+        crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]);
+
+        let mut rng = rand::thread_rng();
+        for _ in 0..num_records {
+            let key = rng.gen_range(key_range.clone());
+            self.wal_ingested += len;
+            self.ingest_record(key, len);
+        }
+        Ok(())
+    }
+
+    /// Format the statistics collected so far as a multi-line report.
+    pub fn stats(&self) -> anyhow::Result<String> {
+        const MB: u64 = 1024 * 1024;
+
+        // Both amplification factors are relative to the amount of WAL ingested.
+        let ingested = self.wal_ingested as f64;
+        let write_amp = self.bytes_written as f64 / ingested;
+        let storage_amp = (self.bytes_written - self.bytes_deleted) as f64 / ingested;
+
+        let mut report = String::new();
+        writeln!(report, "STATISTICS:")?;
+        writeln!(report, "WAL ingested:   {:>10} MB", self.wal_ingested / MB)?;
+        writeln!(report, "size created:   {:>10} MB", self.bytes_written / MB)?;
+        writeln!(report, "size deleted:   {:>10} MB", self.bytes_deleted / MB)?;
+        writeln!(report, "files created:     {:>10}", self.layers_created)?;
+        writeln!(report, "files deleted:     {:>10}", self.layers_deleted)?;
+        writeln!(report, "write amp:         {:>10.2}", write_amp)?;
+        writeln!(report, "storage amp:       {:>10.2}", storage_amp)?;
+
+        Ok(report)
+    }
+
+    /// Render the recorded layer history to `output`.
+    pub fn draw_history<W: std::io::Write>(&self, output: W) -> anyhow::Result<()> {
+        draw::draw_history(&self.history, output)
+    }
+}
+
+// The default timeline is the same as a freshly created one.
+impl Default for MockTimeline {
+    fn default() -> Self {
+        MockTimeline::new()
+    }
+}
+
+// Common handle for both kinds of mock layers. Cloning is cheap, it only
+// clones the Arc.
+#[derive(Clone)]
+pub enum MockLayer {
+    Delta(Arc<MockDeltaLayer>),
+    Image(Arc<MockImageLayer>),
+}
+
+// Each accessor answers from the wrapped delta or image layer.
+impl interface::CompactionLayer<Key> for MockLayer {
+    fn key_range(&self) -> &Range<Key> {
+        match self {
+            Self::Delta(delta) => &delta.key_range,
+            Self::Image(image) => &image.key_range,
+        }
+    }
+
+    fn lsn_range(&self) -> &Range<Lsn> {
+        match self {
+            Self::Delta(delta) => &delta.lsn_range,
+            Self::Image(image) => &image.lsn_range,
+        }
+    }
+
+    fn file_size(&self) -> u64 {
+        match self {
+            Self::Delta(delta) => delta.file_size,
+            Self::Image(image) => image.file_size,
+        }
+    }
+
+    fn short_id(&self) -> String {
+        match self {
+            Self::Delta(delta) => delta.short_id(),
+            Self::Image(image) => image.short_id(),
+        }
+    }
+
+    fn is_delta(&self) -> bool {
+        matches!(self, Self::Delta(_))
+    }
+}
+
+impl MockLayer {
+    // The `deleted` flag of the wrapped layer, whichever kind it is.
+    fn deleted_flag(&self) -> &Mutex<bool> {
+        match self {
+            Self::Delta(delta) => &delta.deleted,
+            Self::Image(image) => &image.deleted,
+        }
+    }
+
+    // Has mark_deleted() been called on this layer?
+    fn is_deleted(&self) -> bool {
+        let deleted = *self.deleted_flag().lock().unwrap();
+        deleted
+    }
+
+    // Set the `deleted` flag. Panics if the layer was already marked.
+    fn mark_deleted(&self) {
+        let mut flag = self.deleted_flag().lock().unwrap();
+        assert!(!*flag, "layer already deleted");
+        *flag = true;
+    }
+}
+
+// Wrap a delta layer in the common handle. Only the Arc is cloned.
+impl From<&Arc<MockDeltaLayer>> for MockLayer {
+    fn from(delta: &Arc<MockDeltaLayer>) -> Self {
+        Self::Delta(Arc::clone(delta))
+    }
+}
+
+// Wrap an image layer in the common handle. Only the Arc is cloned.
+impl From<&Arc<MockImageLayer>> for MockLayer {
+    fn from(image: &Arc<MockImageLayer>) -> Self {
+        Self::Image(Arc::clone(image))
+    }
+}
+
+#[async_trait]
+impl interface::CompactionJobExecutor for MockTimeline {
+    type Key = Key;
+    type Layer = MockLayer;
+    type DeltaLayer = Arc<MockDeltaLayer>;
+    type ImageLayer = Arc<MockImageLayer>;
+    type RequestContext = MockRequestContext;
+
+    /// Returns clones of all live layers that overlap both `key_range` and
+    /// `lsn_range`.
+    async fn get_layers(
+        &mut self,
+        key_range: &Range<Self::Key>,
+        lsn_range: &Range<Lsn>,
+        _ctx: &Self::RequestContext,
+    ) -> anyhow::Result<Vec<Self::Layer>> {
+        // Layers flagged as deleted are only dropped from the vec here,
+        // lazily, on the next lookup.
+        self.live_layers.retain(|layer| !layer.is_deleted());
+
+        let mut matching: Vec<MockLayer> = Vec::new();
+        for layer in &self.live_layers {
+            if overlaps_with(layer.lsn_range(), lsn_range)
+                && overlaps_with(layer.key_range(), key_range)
+            {
+                matching.push(layer.clone());
+            }
+        }
+
+        Ok(matching)
+    }
+
+    /// Returns the part of the timeline's keyspace that falls within
+    /// `key_range`. The requested LSN is ignored.
+    ///
+    /// Panics if any old keyspaces have been recorded: looking up the
+    /// keyspace at an older LSN is not implemented in the mock.
+    async fn get_keyspace(
+        &mut self,
+        key_range: &Range<Self::Key>,
+        _lsn: Lsn,
+        _ctx: &Self::RequestContext,
+    ) -> anyhow::Result<interface::CompactionKeySpace<Key>> {
+        if !self.old_keyspaces.is_empty() {
+            // not implemented
+
+            // The mock implementation only allows requesting the
+            // keyspace at the level's end LSN. That's all that the
+            // current implementation needs.
+            panic!("keyspace not available for requested lsn");
+        }
+
+        Ok(crate::helpers::intersect_keyspace(
+            &self.keyspace,
+            key_range,
+        ))
+    }
+
+    /// Returns the underlying delta layer if `layer` is a delta layer, and
+    /// `None` if it is an image layer.
+    async fn downcast_delta_layer(
+        &self,
+        layer: &MockLayer,
+    ) -> anyhow::Result<Option<Arc<MockDeltaLayer>>> {
+        if let MockLayer::Delta(delta) = layer {
+            Ok(Some(delta.clone()))
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Creates a mock image layer at `lsn` covering `key_range`, adds it to
+    /// the live layers, and records the event in the stats and the history.
+    ///
+    /// The mock file size is 8192 bytes for every key in the keyspace that
+    /// falls within `key_range`.
+    async fn create_image(
+        &mut self,
+        lsn: Lsn,
+        key_range: &Range<Key>,
+        ctx: &MockRequestContext,
+    ) -> anyhow::Result<()> {
+        let keyspace = self.get_keyspace(key_range, lsn, ctx).await?;
+
+        // Total number of keys covered by the ranges of the keyspace.
+        let covered_keys: u64 = keyspace.into_iter().map(|r| r.end - r.start).sum();
+
+        let image = Arc::new(MockImageLayer {
+            key_range: key_range.clone(),
+            // An image layer is represented by an empty LSN range.
+            lsn_range: lsn..lsn,
+            file_size: covered_keys * 8192,
+            deleted: Mutex::new(false),
+        });
+        info!(
+            "created image layer, size {}: {}",
+            image.file_size,
+            image.short_id()
+        );
+        self.live_layers.push(MockLayer::Image(image.clone()));
+
+        // update stats
+        self.bytes_written += image.file_size;
+        self.layers_created += 1;
+
+        // Every recorded event advances the simulated clock by one tick.
+        self.time += 1;
+        let file = LayerTraceFile {
+            filename: image.short_id(),
+            key_range: image.key_range.clone(),
+            lsn_range: image.lsn_range.clone(),
+        };
+        self.history.push(LayerTraceEvent {
+            time_rel: self.time,
+            op: LayerTraceOp::CreateImage,
+            file,
+        });
+
+        Ok(())
+    }
+
+    /// Creates a mock delta layer from the records of `input_layers` that
+    /// fall within both `key_range` and `lsn_range`, adds it to the live
+    /// layers, and records the event in the stats and the history.
+    async fn create_delta(
+        &mut self,
+        lsn_range: &Range<Lsn>,
+        key_range: &Range<Key>,
+        input_layers: &[Arc<MockDeltaLayer>],
+        ctx: &MockRequestContext,
+    ) -> anyhow::Result<()> {
+        let mut merged = std::pin::pin!(merge_delta_keys::<MockTimeline>(input_layers, ctx));
+        let mut records: Vec<MockRecord> = Vec::new();
+        // The size accumulator starts at 2, so even an empty layer has a
+        // non-zero mock size.
+        let mut total_len = 2;
+        while let Some(next) = merged.next().await {
+            let record: MockRecord = next?;
+            let in_bounds = key_range.contains(&record.key) && lsn_range.contains(&record.lsn);
+            if !in_bounds {
+                continue;
+            }
+            total_len += record.len;
+            records.push(record);
+        }
+
+        // Remember the count before `records` is moved into the layer.
+        let record_count = records.len();
+        let delta = Arc::new(MockDeltaLayer {
+            key_range: key_range.clone(),
+            lsn_range: lsn_range.clone(),
+            file_size: total_len,
+            records,
+            deleted: Mutex::new(false),
+        });
+        info!(
+            "created delta layer, recs {}, size {}: {}",
+            record_count,
+            total_len,
+            delta.short_id()
+        );
+        self.live_layers.push(MockLayer::Delta(delta.clone()));
+
+        // update stats
+        self.bytes_written += total_len;
+        self.layers_created += 1;
+
+        // Every recorded event advances the simulated clock by one tick.
+        self.time += 1;
+        let file = LayerTraceFile {
+            filename: delta.short_id(),
+            key_range: delta.key_range.clone(),
+            lsn_range: delta.lsn_range.clone(),
+        };
+        self.history.push(LayerTraceEvent {
+            time_rel: self.time,
+            op: LayerTraceOp::CreateDelta,
+            file,
+        });
+
+        Ok(())
+    }
+
+    /// Flags `layer` as deleted and records the event in the stats and the
+    /// history. The layer stays in `live_layers` until the next `get_layers`
+    /// call prunes it.
+    ///
+    /// Panics if the layer was already flagged as deleted.
+    async fn delete_layer(
+        &mut self,
+        layer: &Self::Layer,
+        _ctx: &MockRequestContext,
+    ) -> anyhow::Result<()> {
+        info!("deleting layer: {}", layer.short_id());
+        self.num_deleted_layers += 1;
+        self.bytes_deleted += layer.file_size();
+        layer.mark_deleted();
+
+        // Every recorded event advances the simulated clock by one tick.
+        self.time += 1;
+        let file = LayerTraceFile {
+            filename: layer.short_id(),
+            key_range: layer.key_range().clone(),
+            lsn_range: layer.lsn_range().clone(),
+        };
+        self.history.push(LayerTraceEvent {
+            time_rel: self.time,
+            op: LayerTraceOp::Delete,
+            file,
+        });
+
+        Ok(())
+    }
+}
--- a/pageserver/compaction/src/simulator/draw.rs
+++ b/pageserver/compaction/src/simulator/draw.rs
@@ -0,0 +1,411 @@
+use super::Key;
+use anyhow::Result;
+use std::cmp::Ordering;
+use std::{
+    collections::{BTreeMap, BTreeSet, HashSet},
+    fmt::Write,
+    ops::Range,
+};
+use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
+use utils::lsn::Lsn;
+
+/// Maps values to their compressed coordinate: the index the value
+/// would have in a sorted and deduplicated list of all values,
+/// multiplied by a constant stretch factor.
+struct CoordinateMap<T: Ord + Copy> {
+    /// Value -> index in the sorted, deduplicated list of all values.
+    map: BTreeMap<T, usize>,
+    /// Factor applied to every index to get the final coordinate.
+    stretch: f32,
+}
+
+impl<T: Ord + Copy> CoordinateMap<T> {
+    /// Builds the map from `coords`. Duplicates and input order are
+    /// irrelevant; `stretch` scales every coordinate returned later.
+    fn new(coords: Vec<T>, stretch: f32) -> Self {
+        // A BTreeSet both sorts and deduplicates the values.
+        let set: BTreeSet<T> = coords.into_iter().collect();
+        let map: BTreeMap<T, usize> = set
+            .into_iter()
+            .enumerate()
+            .map(|(index, value)| (value, index))
+            .collect();
+
+        Self { map, stretch }
+    }
+
+    /// Returns the coordinate of `val`.
+    ///
+    /// This assumes that the map contains an exact point for `val` and
+    /// panics otherwise. Use `_map_inexact` for values in between.
+    fn map(&self, val: T) -> f32 {
+        *self.map.get(&val).unwrap() as f32 * self.stretch
+    }
+
+    /// Returns a coordinate for a value that may lie between two known
+    /// points. (This is currently unused.)
+    ///
+    /// The value is still assumed to be within the min/max bounds; panics
+    /// otherwise. A known point maps to its exact coordinate. `T` offers no
+    /// arithmetic, so a value strictly between two known points cannot be
+    /// interpolated proportionally and is placed halfway between them.
+    fn _map_inexact(&self, val: T) -> f32 {
+        // Nearest known point at or below `val`: the *last* entry of the
+        // range, hence `next_back`.
+        let prev = *self.map.range(..=val).next_back().unwrap().1;
+        // Nearest known point at or above `val`.
+        let next = *self.map.range(val..).next().unwrap().1;
+
+        // `prev <= next` holds here, so the subtraction cannot underflow.
+        (prev as f32 + (next - prev) as f32 / 2.0) * self.stretch
+    }
+
+    /// Returns the number of distinct values times the stretch factor, i.e.
+    /// one step past the largest coordinate `map` can return.
+    fn max(&self) -> f32 {
+        self.map.len() as f32 * self.stretch
+    }
+}
+
+/// The kind of operation recorded in a `LayerTraceEvent`.
+#[derive(PartialEq, Hash, Eq)]
+pub enum LayerTraceOp {
+    /// Displayed as `flush`.
+    Flush,
+    /// A delta layer was created; displayed as `create_delta`.
+    CreateDelta,
+    /// An image layer was created; displayed as `create_image`.
+    CreateImage,
+    /// A layer was deleted; displayed as `delete`.
+    Delete,
+}
+
+/// Formats the operation as the lowercase tag that `draw_history` embeds in
+/// its generated JavaScript event list.
+impl std::fmt::Display for LayerTraceOp {
+    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        formatter.write_str(match self {
+            LayerTraceOp::Flush => "flush",
+            LayerTraceOp::CreateDelta => "create_delta",
+            LayerTraceOp::CreateImage => "create_image",
+            LayerTraceOp::Delete => "delete",
+        })
+    }
+}
+
+/// Identity and extent of a layer file referenced by a trace event.
+#[derive(PartialEq, Hash, Eq, Clone)]
+pub struct LayerTraceFile {
+    /// Name of the file; `draw_history` also uses it to build the id of the
+    /// corresponding SVG element.
+    pub filename: String,
+    /// Range of keys covered by the file.
+    pub key_range: Range<Key>,
+    /// Range of LSNs covered by the file; start and end are equal for an
+    /// image layer.
+    pub lsn_range: Range<Lsn>,
+}
+
+impl LayerTraceFile {
+    /// Returns true if the LSN range has equal start and end, which is how
+    /// an image layer is told apart from a delta layer.
+    fn is_image(&self) -> bool {
+        let Range { start, end } = &self.lsn_range;
+        start == end
+    }
+}
+
+/// One entry in the layer history consumed by `draw_history`.
+pub struct LayerTraceEvent {
+    /// Time of the event. `draw_history` uses it as the slider position at
+    /// which the event is replayed, and uses the last event's time as the
+    /// slider maximum.
+    pub time_rel: u64,
+    /// What happened to the file.
+    pub op: LayerTraceOp,
+    /// The layer file the operation applies to.
+    pub file: LayerTraceFile,
+}
+
+/// Renders a layer history as a self-contained HTML page and writes it to
+/// `output`.
+///
+/// Every distinct file mentioned in `history` is drawn once into an SVG, with
+/// keys on the x axis and LSNs on the y axis (higher LSNs nearer the top).
+/// Both axes use compressed coordinates, see `CoordinateMap`. Delta layers
+/// become rectangles, and image layers (equal LSN start and end) become
+/// thick lines painted on top of them. The page embeds the event list as
+/// JavaScript, so that a slider can replay the create/delete events by
+/// toggling the visibility of each element.
+///
+/// Returns an error if `history` is empty or if writing fails.
+///
+/// # Panics
+///
+/// Panics if a file has an empty or inverted key range, or an inverted LSN
+/// range.
+pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output: W) -> Result<()> {
+    // The slider's upper bound is the time of the final event. An empty
+    // history has nothing to draw: report that instead of panicking.
+    let last_time_rel = history
+        .last()
+        .ok_or_else(|| anyhow::anyhow!("cannot draw an empty layer history"))?
+        .time_rel;
+
+    let files: Vec<LayerTraceFile> = history.iter().map(|event| event.file.clone()).collect();
+
+    // Collect all coordinates
+    let mut keys: Vec<Key> = vec![];
+    let mut lsns: Vec<Lsn> = vec![];
+    for f in files.iter() {
+        keys.push(f.key_range.start);
+        keys.push(f.key_range.end);
+        lsns.push(f.lsn_range.start);
+        lsns.push(f.lsn_range.end);
+    }
+
+    // Analyze
+    let key_map = CoordinateMap::new(keys, 2.0);
+    // Stretch out vertically for better visibility
+    let lsn_map = CoordinateMap::new(lsns, 3.0);
+
+    let mut svg = String::new();
+
+    // Draw
+    writeln!(
+        svg,
+        "{}",
+        BeginSvg {
+            w: key_map.max(),
+            h: lsn_map.max(),
+        }
+    )?;
+    let lsn_max = lsn_map.max();
+
+    // Sort the files by LSN, but so that image layers go after all delta layers
+    // The SVG is painted in the order the elements appear, and we want to draw
+    // image layers on top of the delta layers if they overlap
+    //
+    // (This could also be implemented via z coordinates: image layers get one z
+    // coord, delta layers get another z coord.)
+    let mut files_sorted = files;
+    files_sorted.sort_by(|a, b| {
+        // `false < true`, so delta layers sort before image layers.
+        a.is_image()
+            .cmp(&b.is_image())
+            .then_with(|| a.lsn_range.end.cmp(&b.lsn_range.end))
+    });
+
+    writeln!(svg, "<!-- layers -->")?;
+    // A file shows up once per event that mentions it (e.g. create and
+    // delete), but must be drawn only once.
+    let mut files_seen = HashSet::new();
+    for f in files_sorted {
+        if files_seen.contains(&f) {
+            continue;
+        }
+        let key_start = key_map.map(f.key_range.start);
+        let key_end = key_map.map(f.key_range.end);
+        let key_diff = key_end - key_start;
+
+        if key_start >= key_end {
+            panic!("Invalid key range {}-{}", key_start, key_end);
+        }
+
+        let lsn_start = lsn_map.map(f.lsn_range.start);
+        let lsn_end = lsn_map.map(f.lsn_range.end);
+
+        // Fill in and thicken rectangle if it's an
+        // image layer so that we can see it.
+        let mut style = Style::default();
+        style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
+        style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);
+
+        // Flip the axis so that higher LSNs are drawn nearer the top.
+        let y_start = lsn_max - lsn_start;
+        let y_end = lsn_max - lsn_end;
+
+        let x_margin = 0.25;
+        let y_margin = 0.5;
+
+        match f.lsn_range.start.cmp(&f.lsn_range.end) {
+            Ordering::Less => {
+                write!(
+                    svg,
+                    r#"    <rect id="layer_{}" x="{}" y="{}" width="{}" height="{}" ry="{}" style="{}">"#,
+                    f.filename,
+                    key_start + x_margin,
+                    y_end + y_margin,
+                    key_diff - x_margin * 2.0,
+                    y_start - y_end - y_margin * 2.0,
+                    1.0, // border_radius,
+                    style,
+                )?;
+                write!(svg, "<title>{}</title>", f.filename)?;
+                writeln!(svg, "</rect>")?;
+            }
+            Ordering::Equal => {
+                //lsn_diff = 0.3;
+                //lsn_offset = -lsn_diff / 2.0;
+                //margin = 0.05;
+                style.fill = Fill::Color(rgb(0x80, 0, 0x80));
+                style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0);
+                write!(
+                    svg,
+                    r#"    <line id="layer_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}">"#,
+                    f.filename,
+                    key_start + x_margin,
+                    y_end,
+                    key_end - x_margin,
+                    y_end,
+                    style,
+                )?;
+                write!(
+                    svg,
+                    "<title>{}<br>{} - {}</title>",
+                    f.filename, lsn_end, y_end
+                )?;
+                writeln!(svg, "</line>")?;
+            }
+            Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
+        }
+        files_seen.insert(f);
+    }
+
+    writeln!(svg, "{}", EndSvg)?;
+
+    // One JavaScript object literal per event, separated by ",\n" and
+    // terminated by a newline.
+    let mut layer_events_str = history
+        .iter()
+        .map(|e| {
+            format!(
+                r#"  {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#,
+                e.time_rel, e.file.filename, e.op
+            )
+        })
+        .collect::<Vec<String>>()
+        .join(",\n");
+    layer_events_str.push('\n');
+
+    writeln!(
+        output,
+        r#"<!DOCTYPE html>
+<html>
+<head>
+<style>
+/* Keep the slider pinned at top */
+.topbar {{
+  display: block;
+  overflow: hidden;
+  background-color: lightgrey;
+  position: fixed;
+  top: 0;
+  width: 100%;
+/*  width: 500px; */
+}}
+.slidercontainer {{
+  float: left;
+  width: 50%;
+  margin-right: 200px;
+}}
+.slider {{
+  float: left;
+  width: 100%;
+}}
+.legend {{
+  width: 200px;
+  float: right;
+}}
+
+/* Main content */
+.main {{
+  margin-top: 50px; /* Add a top margin to avoid content overlay */
+}}
+</style>
+</head>
+
+  <body onload="init()">
+    <script type="text/javascript">
+
+      var layer_events = [{layer_events_str}]
+
+      let ticker;
+
+      function init() {{
+          for (let i = 0; i < layer_events.length; i++) {{
+              var layer = document.getElementById("layer_" + layer_events[i].filename);
+              layer.style.visibility = "hidden";
+          }}
+          last_layer_event = -1;
+          moveSlider(last_slider_pos)
+      }}
+
+      function startAnimation() {{
+          ticker = setInterval(animateStep, 100);
+      }}
+      function stopAnimation() {{
+          clearInterval(ticker);
+      }}
+
+      function animateStep() {{
+          if (last_layer_event < layer_events.length - 1) {{
+              var slider = document.getElementById("time-slider");
+              let prevPos = Number(slider.value)
+              let nextEvent = last_layer_event + 1
+              while (nextEvent <= layer_events.length - 1) {{
+                  if (layer_events[nextEvent].time_rel > prevPos) {{
+                      break;
+                  }}
+                  nextEvent += 1;
+              }}
+              // No later event left to advance to
+              if (nextEvent >= layer_events.length) {{
+                  return;
+              }}
+              let nextPos = layer_events[nextEvent].time_rel
+              slider.value = nextPos
+              moveSlider(nextPos)
+          }}
+      }}
+
+      function redoLayerEvent(n, dir) {{
+          var layer = document.getElementById("layer_" + layer_events[n].filename);
+          switch (layer_events[n].op) {{
+              case "flush":
+                  layer.style.visibility = "visible";
+                  break;
+              case "create_delta":
+                  layer.style.visibility = "visible";
+                  break;
+              case "create_image":
+                  layer.style.visibility = "visible";
+                  break;
+              case "delete":
+                  layer.style.visibility = "hidden";
+                  break;
+          }}
+      }}
+      function undoLayerEvent(n) {{
+          var layer = document.getElementById("layer_" + layer_events[n].filename);
+          switch (layer_events[n].op) {{
+              case "flush":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "create_delta":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "create_image":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "delete":
+                  layer.style.visibility = "visible";
+                  break;
+          }}
+      }}
+
+      var last_slider_pos = 0
+      var last_layer_event = 0
+
+      var moveSlider = function(new_pos) {{
+          // The slider reports its value as a string; convert it so that
+          // positions are compared numerically rather than lexicographically
+          new_pos = Number(new_pos);
+          if (new_pos > last_slider_pos) {{
+              while (last_layer_event < layer_events.length - 1) {{
+                  if (layer_events[last_layer_event + 1].time_rel > new_pos) {{
+                      break;
+                  }}
+                  last_layer_event += 1;
+                  redoLayerEvent(last_layer_event)
+              }}
+          }}
+          if (new_pos < last_slider_pos) {{
+              while (last_layer_event >= 0) {{
+                  if (layer_events[last_layer_event].time_rel <= new_pos) {{
+                      break;
+                  }}
+                  undoLayerEvent(last_layer_event)
+                  last_layer_event -= 1;
+              }}
+          }}
+          last_slider_pos = new_pos;
+          document.getElementById("debug_pos").textContent=new_pos;
+          if (last_layer_event >= 0) {{
+              document.getElementById("debug_layer_event").textContent=last_layer_event + " " + layer_events[last_layer_event].time_rel + " " + layer_events[last_layer_event].op;
+          }} else {{
+              document.getElementById("debug_layer_event").textContent="begin";
+          }}
+      }}
+    </script>
+
+    <div class="topbar">
+      <div class="slidercontainer">
+        <label for="time-slider">TIME</label>:
+        <input id="time-slider" class="slider" type="range" min="0" max="{last_time_rel}" value="0" oninput="moveSlider(this.value)"><br>
+
+        pos: <span id="debug_pos"></span><br>
+        event: <span id="debug_layer_event"></span><br>
+        gc: <span id="debug_gc_event"></span><br>
+      </div>
+
+      <button onclick="startAnimation()">Play</button>
+      <button onclick="stopAnimation()">Stop</button>
+
+      <svg class="legend">
+        <rect x=5 y=0 width=20 height=20 style="fill:rgb(128,128,128);stroke:rgb(0,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
+        <line x1=5 y1=30 x2=25 y2=30 style="fill:rgb(128,0,128);stroke:rgb(128,0,128);stroke-width:3;fill-opacity:1;stroke-opacity:1;"/>
+        <line x1=0 y1=40 x2=30 y2=40 style="fill:none;stroke:rgb(255,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
+      </svg>
+    </div>
+
+    <div class="main">
+{svg}
+    </div>
+  </body>
+</html>
+"#
+    )?;
+
+    Ok(())
+}
--- a/pageserver/compaction/tests/tests.rs
+++ b/pageserver/compaction/tests/tests.rs
@@ -0,0 +1,35 @@
+use pageserver_compaction::interface::CompactionLayer;
+use pageserver_compaction::simulator::MockTimeline;
+
+/// Test the extreme case where a single key receives so many updates that
+/// even an extremely narrow delta layer, spanning just that one key, still
+/// has too many records to fit in the target file size. In that case we need
+/// to split in the LSN dimension too.
+///
+/// TODO: The code to avoid this problem has not been implemented yet! So the
+/// assertion currently fails, but we need to make it not fail.
+#[ignore]
+#[tokio::test]
+async fn test_many_updates_for_single_key() {
+    let mut executor = MockTimeline::new();
+    executor.target_file_size = 10_000_000; // 10 MB
+
+    // Ingest 100 MB of updates to a single key.
+    for _ in 1..1000 {
+        executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
+        executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
+        executor.compact().await.unwrap();
+    }
+
+    // Print every layer first, so that the full picture is in the test
+    // output even when one of the assertions below fails.
+    for layer in executor.live_layers.iter() {
+        println!("layer {}: {}", layer.short_id(), layer.file_size());
+    }
+
+    // Check that all the layers are smaller than the target size (with some slop)
+    let target = executor.target_file_size;
+    for layer in executor.live_layers.iter() {
+        let size = layer.file_size();
+        assert!(size < target * 2);
+        // sanity check that none of the delta layers are stupidly small either
+        if layer.is_delta() {
+            assert!(size > target / 2);
+        }
+    }
+}
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -12,7 +12,7 @@ use std::collections::BinaryHeap;
 use std::ops::Range;
 use std::{fs, str};

-use pageserver::page_cache::PAGE_SZ;
+use pageserver::page_cache::{self, PAGE_SZ};
 use pageserver::repository::{Key, KEY_SIZE};
 use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
@@ -100,13 +100,15 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
 async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
-    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0, ctx).await?;
+    let file = VirtualFile::open(path).await?;
+    let file_id = page_cache::next_file_id();
+    let block_reader = FileBlockReader::new(&file, file_id);
+    let summary_blk = block_reader.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
        actual_summary.index_root_blk,
-        file,
+        block_reader,
    );
    // min-heap (reserve space for one more element added before eviction)
    let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -61,13 +61,15 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    page_cache::init(100);
-    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0, ctx).await?;
+    let file = VirtualFile::open(path).await?;
+    let file_id = page_cache::next_file_id();
+    let block_reader = FileBlockReader::new(&file, file_id);
+    let summary_blk = block_reader.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
        actual_summary.index_root_blk,
-        &file,
+        &block_reader,
    );
    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
    let mut all = vec![];
@@ -83,7 +85,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
            ctx,
        )
        .await?;
-    let cursor = BlockCursor::new_fileblockreader(&file);
+    let cursor = BlockCursor::new_fileblockreader(&block_reader);
    for (k, v) in all {
        let value = cursor.read_blob(v.pos(), ctx).await?;
        println!("key:{} value_len:{}", k, value.len());
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -8,7 +8,7 @@ use utils::lsn::Lsn;
 use rand::prelude::*;
 use tokio::sync::Barrier;
 use tokio::task::JoinSet;
-use tracing::{debug, info, instrument};
+use tracing::{info, instrument};

 use std::collections::HashMap;
 use std::num::NonZeroUsize;
@@ -25,8 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats};
 pub(crate) struct Args {
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
-    #[clap(long, default_value = "localhost:64000")]
-    page_service_host_port: String,
+    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
+    page_service_connstring: String,
    #[clap(long)]
    pageserver_jwt: Option<String>,
    #[clap(long, default_value = "1")]
@@ -230,12 +230,9 @@ async fn client(
 ) {
    start_work_barrier.wait().await;

-    let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
-        &args.page_service_host_port,
-        args.pageserver_jwt.as_deref(),
-    ))
-    .await
-    .unwrap();
+    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
+        .await
+        .unwrap();

    while let Some(Work { lsn, gzip }) = work.recv().await {
        let start = Instant::now();
@@ -263,7 +260,7 @@ async fn client(
                }
            })
            .await;
-        debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
+        info!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
        let elapsed = start.elapsed();
        live_stats.inc();
        STATS.with(|stats| {
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -3,7 +3,6 @@ use utils::logging;

 /// Re-usable pieces of code that aren't CLI-specific.
 mod util {
-    pub(crate) mod connstring;
    pub(crate) mod request_stats;
    #[macro_use]
    pub(crate) mod tokio_thread_local_stats;
--- a/pageserver/pagebench/src/util/connstring.rs
+++ b/pageserver/pagebench/src/util/connstring.rs
@@ -1,8 +0,0 @@
-pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
-    let colon_and_jwt = if let Some(jwt) = jwt {
-        format!(":{jwt}") // TODO: urlescape
-    } else {
-        String::new()
-    };
-    format!("postgres://postgres{colon_and_jwt}@{host_port}")
-}
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -14,7 +14,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
+        (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
            format!(
                "JWT scope '{:?}' is ineligible for Pageserver auth",
                claims.scope
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -143,6 +143,7 @@ where
    ar: &'a mut Builder<&'b mut W>,
    buf: Vec<u8>,
    current_segment: Option<(SlruKind, u32)>,
+    total_blocks: usize,
 }

 impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
@@ -154,6 +155,7 @@ where
            ar,
            buf: Vec::new(),
            current_segment: None,
+            total_blocks: 0,
        }
    }

@@ -199,7 +201,8 @@ where
        let header = new_tar_header(&segname, self.buf.len() as u64)?;
        self.ar.append(&header, self.buf.as_slice()).await?;

-        trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
+        self.total_blocks += nblocks;
+        debug!("Added to basebackup slru {} relsize {}", segname, nblocks);

        self.buf.clear();

@@ -207,11 +210,15 @@ where
    }

    async fn finish(mut self) -> anyhow::Result<()> {
-        if self.current_segment.is_none() || self.buf.is_empty() {
-            return Ok(());
-        }
+        let res = if self.current_segment.is_none() || self.buf.is_empty() {
+            Ok(())
+        } else {
+            self.flush().await
+        };

-        self.flush().await
+        info!("Collected {} SLRU blocks", self.total_blocks);
+
+        res
    }
 }

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -34,6 +34,7 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::timeline::GetVectoredImpl;
+use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
@@ -87,6 +88,10 @@ pub mod defaults {

    pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";

+    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
+
+    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
+
    ///
    /// Default built-in configuration file.
    ///
@@ -126,6 +131,10 @@ pub mod defaults {

 #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'

+#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
+
+#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -262,6 +271,10 @@ pub struct PageServerConf {
    pub virtual_file_io_engine: virtual_file::IoEngineKind,

    pub get_vectored_impl: GetVectoredImpl,
+
+    pub max_vectored_read_bytes: MaxVectoredReadBytes,
+
+    pub validate_vectored_get: bool,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -350,6 +363,10 @@ struct PageServerConfigBuilder {
    virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,

    get_vectored_impl: BuilderValue<GetVectoredImpl>,
+
+    max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
+
+    validate_vectored_get: BuilderValue<bool>,
 }

 impl Default for PageServerConfigBuilder {
@@ -429,6 +446,10 @@ impl Default for PageServerConfigBuilder {
            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),

            get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
+            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
+                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
+            )),
+            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
        }
    }
 }
@@ -593,6 +614,14 @@ impl PageServerConfigBuilder {
        self.get_vectored_impl = BuilderValue::Set(value);
    }

+    /// Sets the builder's `max_vectored_read_bytes` to `value`.
+    ///
+    /// NOTE: despite the `get_` prefix this is a setter, not a getter.
+    pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
+        self.max_vectored_read_bytes = BuilderValue::Set(value);
+    }
+
+    /// Sets the builder's `validate_vectored_get` to `value`.
+    ///
+    /// NOTE: despite the `get_` prefix this is a setter, not a getter.
+    pub fn get_validate_vectored_get(&mut self, value: bool) {
+        self.validate_vectored_get = BuilderValue::Set(value);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_warmup = self
            .concurrent_tenant_warmup
@@ -706,6 +735,12 @@ impl PageServerConfigBuilder {
            get_vectored_impl: self
                .get_vectored_impl
                .ok_or(anyhow!("missing get_vectored_impl"))?,
+            max_vectored_read_bytes: self
+                .max_vectored_read_bytes
+                .ok_or(anyhow!("missing max_vectored_read_bytes"))?,
+            validate_vectored_get: self
+                .validate_vectored_get
+                .ok_or(anyhow!("missing validate_vectored_get"))?,
        })
    }
 }
@@ -952,6 +987,15 @@ impl PageServerConf {
                "get_vectored_impl" => {
                    builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
                }
+                "max_vectored_read_bytes" => {
+                    let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
+                    builder.get_max_vectored_read_bytes(
+                        MaxVectoredReadBytes(
+                            NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
+                }
+                "validate_vectored_get" => {
+                    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
+                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1027,6 +1071,11 @@ impl PageServerConf {
            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
            get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+            max_vectored_read_bytes: MaxVectoredReadBytes(
+                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
+                    .expect("Invalid default constant"),
+            ),
+            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
        }
    }
 }
@@ -1261,6 +1310,11 @@ background_task_maximum_delay = '334 s'
                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+                max_vectored_read_bytes: MaxVectoredReadBytes(
+                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
+                        .expect("Invalid default constant")
+                ),
+                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1326,6 +1380,11 @@ background_task_maximum_delay = '334 s'
                ingest_batch_size: 100,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+                max_vectored_read_bytes: MaxVectoredReadBytes(
+                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
+                        .expect("Invalid default constant")
+                ),
+                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -17,7 +17,7 @@ use tracing::*;
 use utils::id::NodeId;

 mod metrics;
-use metrics::MetricsKey;
+use crate::consumption_metrics::metrics::MetricsKey;
 mod disk_cache;
 mod upload;

--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -2,10 +2,10 @@ use std::collections::HashMap;

 use futures::Future;
 use pageserver_api::{
-    control_api::{
+    shard::TenantShardId,
+    upcall_api::{
        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
    },
-    shard::TenantShardId,
 };
 use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -661,9 +661,14 @@ async fn timeline_detail_handler(

    // Logical size calculation needs downloading.
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let state = get_state(&request);

    let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id, false)?;
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -696,6 +701,7 @@ async fn get_lsn_by_timestamp_handler(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);

    if !tenant_shard_id.is_zero() {
        // Requires SLRU contents, which are only stored on shard zero
@@ -712,7 +718,10 @@ async fn get_lsn_by_timestamp_handler(
    let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    let result = timeline
        .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
        .await?;
@@ -743,6 +752,7 @@ async fn get_timestamp_of_lsn_handler(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);

    if !tenant_shard_id.is_zero() {
        // Requires SLRU contents, which are only stored on shard zero
@@ -759,7 +769,9 @@ async fn get_timestamp_of_lsn_handler(
        .map_err(ApiError::BadRequest)?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;

    match result {
@@ -1159,10 +1171,13 @@ async fn layer_map_info_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let reset: LayerAccessStatsReset =
        parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);
+    let state = get_state(&request);

    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    let layer_map_info = timeline.layer_map_info(reset).await;

    json_response(StatusCode::OK, layer_map_info)
@@ -1176,8 +1191,11 @@ async fn layer_download_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let layer_file_name = get_request_param(&request, "layer_file_name")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    let downloaded = timeline
        .download_layer(layer_file_name)
        .await
@@ -1201,8 +1219,11 @@ async fn evict_timeline_layer_handler(
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let layer_file_name = get_request_param(&request, "layer_file_name")?;
+    let state = get_state(&request);

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    let evicted = timeline
        .evict_layer(layer_file_name)
        .await
@@ -1612,13 +1633,19 @@ async fn timeline_compact_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

+    let state = get_state(&request);
+
    let mut flags = EnumSet::empty();
    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
        flags |= CompactFlags::ForceRepartition;
    }
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
+        flags |= CompactFlags::ForceImageLayerCreation;
+    }
+
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
        timeline
            .compact(&cancel, flags, &ctx)
            .await
@@ -1638,13 +1665,19 @@ async fn timeline_checkpoint_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

+    let state = get_state(&request);
+
    let mut flags = EnumSet::empty();
    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
        flags |= CompactFlags::ForceRepartition;
    }
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
+        flags |= CompactFlags::ForceImageLayerCreation;
+    }
+
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
        timeline
            .freeze_and_flush()
            .await
@@ -1669,7 +1702,11 @@ async fn timeline_download_remote_layers_handler_post(
    let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let state = get_state(&request);
+
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    match timeline.spawn_download_all_remote_layers(body).await {
        Ok(st) => json_response(StatusCode::ACCEPTED, st),
        Err(st) => json_response(StatusCode::CONFLICT, st),
@@ -1683,8 +1720,11 @@ async fn timeline_download_remote_layers_handler_get(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let state = get_state(&request);

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    let info = timeline
        .get_download_all_remote_layers_task_info()
        .context("task never started since last pageserver process start")
@@ -1733,6 +1773,7 @@ async fn getpage_at_lsn_handler(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);

    struct Key(crate::repository::Key);

@@ -1751,7 +1792,7 @@ async fn getpage_at_lsn_handler(

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;

        let page = timeline.get(key.0, lsn, &ctx).await?;

@@ -1774,12 +1815,13 @@ async fn timeline_collect_keyspace(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);

    let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
        let keys = timeline
            .collect_keyspace(at_lsn, &ctx)
@@ -1795,10 +1837,14 @@ async fn timeline_collect_keyspace(
 }

 async fn active_timeline_of_active_tenant(
+    tenant_manager: &TenantManager,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+    let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
+
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
    tenant
        .get_timeline(timeline_id, true)
        .map_err(|e| ApiError::NotFound(e.into()))
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -15,7 +15,6 @@ use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
    dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
    rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -36,6 +35,8 @@ use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::{bin_ser::BeSer, lsn::Lsn};

+const MAX_AUX_FILE_DELTAS: usize = 1024;
+
 #[derive(Debug)]
 pub enum LsnForTimestamp {
    /// Found commits both before and after the given timestamp
@@ -157,7 +158,6 @@ impl Timeline {
            pending_updates: HashMap::new(),
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
-            pending_aux_files: None,
            pending_directory_entries: Vec::new(),
            lsn,
        }
@@ -873,11 +873,6 @@ pub struct DatadirModification<'a> {
    pending_deletions: Vec<(Range<Key>, Lsn)>,
    pending_nblocks: i64,

-    // If we already wrote any aux file changes in this modification, stash the latest dir.  If set,
-    // [`Self::put_file`] may assume that it is safe to emit a delta rather than checking
-    // if AUX_FILES_KEY is already set.
-    pending_aux_files: Option<AuxFilesDirectory>,
-
    /// For special "directory" keys that store key-value maps, track the size of the map
    /// if it was updated in this modification.
    pending_directory_entries: Vec<(DirectoryKind, usize)>,
@@ -1401,19 +1396,28 @@ impl<'a> DatadirModification<'a> {
            Some(Bytes::copy_from_slice(content))
        };

-        let dir = if let Some(mut dir) = self.pending_aux_files.take() {
+        let n_files;
+        let mut aux_files = self.tline.aux_files.lock().await;
+        if let Some(mut dir) = aux_files.dir.take() {
            // We already updated aux files in `self`: emit a delta and update our latest value
-
-            self.put(
-                AUX_FILES_KEY,
-                Value::WalRecord(NeonWalRecord::AuxFile {
-                    file_path: file_path.clone(),
-                    content: content.clone(),
-                }),
-            );
-
-            dir.upsert(file_path, content);
-            dir
+            dir.upsert(file_path.clone(), content.clone());
+            n_files = dir.files.len();
+            if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
+                self.put(
+                    AUX_FILES_KEY,
+                    Value::Image(Bytes::from(
+                        AuxFilesDirectory::ser(&dir).context("serialize")?,
+                    )),
+                );
+                aux_files.n_deltas = 0;
+            } else {
+                self.put(
+                    AUX_FILES_KEY,
+                    Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
+                );
+                aux_files.n_deltas += 1;
+            }
+            aux_files.dir = Some(dir);
        } else {
            // Check if the AUX_FILES_KEY is initialized
            match self.get(AUX_FILES_KEY, ctx).await {
@@ -1428,7 +1432,8 @@ impl<'a> DatadirModification<'a> {
                        }),
                    );
                    dir.upsert(file_path, content);
-                    dir
+                    n_files = dir.files.len();
+                    aux_files.dir = Some(dir);
                }
                Err(
                    e @ (PageReconstructError::AncestorStopping(_)
@@ -1455,14 +1460,14 @@ impl<'a> DatadirModification<'a> {
                            AuxFilesDirectory::ser(&dir).context("serialize")?,
                        )),
                    );
-                    dir
+                    n_files = 1;
+                    aux_files.dir = Some(dir);
                }
            }
-        };
+        }

        self.pending_directory_entries
-            .push((DirectoryKind::AuxFiles, dir.files.len()));
-        self.pending_aux_files = Some(dir);
+            .push((DirectoryKind::AuxFiles, n_files));

        Ok(())
    }
@@ -1493,7 +1498,7 @@ impl<'a> DatadirModification<'a> {
            return Ok(());
        }

-        let mut writer = self.tline.writer().await;
+        let writer = self.tline.writer().await;

        // Flush relation and  SLRU data blocks, keep metadata.
        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
@@ -1532,23 +1537,13 @@ impl<'a> DatadirModification<'a> {
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
-        let mut writer = self.tline.writer().await;
+        let writer = self.tline.writer().await;

        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

        if !self.pending_updates.is_empty() {
-            let prev_pending_updates = std::mem::take(&mut self.pending_updates);
-
-            // The put_batch call below expects expects the inputs to be sorted by Lsn,
-            // so we do that first.
-            let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = prev_pending_updates
-                .into_iter()
-                .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
-                .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
-                .collect();
-
-            writer.put_batch(lsn_ordered_batch, ctx).await?;
+            writer.put_batch(&self.pending_updates, ctx).await?;
            self.pending_updates.clear();
        }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -145,6 +145,7 @@ macro_rules! pausable_failpoint {

 pub mod blob_io;
 pub mod block_io;
+pub mod vectored_blob_io;

 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
@@ -3461,9 +3462,8 @@ impl Tenant {
            // Run each timeline's flush in a task holding the timeline's gate: this
            // means that if this function's future is cancelled, the Timeline shutdown
            // will still wait for any I/O in here to complete.
-            let gate = match timeline.gate.enter() {
-                Ok(g) => g,
-                Err(_) => continue,
+            let Ok(gate) = timeline.gate.enter() else {
+                continue;
            };
            let jh = tokio::task::spawn(async move { flush_timeline(gate, timeline).await });
            results.push(jh);
@@ -3633,6 +3633,7 @@ pub(crate) mod harness {
                compaction_target_size: Some(tenant_conf.compaction_target_size),
                compaction_period: Some(tenant_conf.compaction_period),
                compaction_threshold: Some(tenant_conf.compaction_threshold),
+                compaction_algorithm: Some(tenant_conf.compaction_algorithm),
                gc_horizon: Some(tenant_conf.gc_horizon),
                gc_period: Some(tenant_conf.gc_period),
                image_creation_threshold: Some(tenant_conf.image_creation_threshold),
@@ -3852,7 +3853,7 @@ mod tests {
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -3864,7 +3865,7 @@ mod tests {
        writer.finish_write(Lsn(0x10));
        drop(writer);

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -3930,7 +3931,7 @@ mod tests {
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;

        #[allow(non_snake_case)]
        let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
@@ -3964,7 +3965,7 @@ mod tests {
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");
-        let mut new_writer = newtline.writer().await;
+        let new_writer = newtline.writer().await;
        new_writer
            .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
            .await?;
@@ -3996,7 +3997,7 @@ mod tests {
    ) -> anyhow::Result<()> {
        let mut lsn = start_lsn;
        {
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
            // Create a relation on the timeline
            writer
                .put(
@@ -4021,7 +4022,7 @@ mod tests {
        }
        tline.freeze_and_flush().await?;
        {
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
            writer
                .put(
                    *TEST_KEY,
@@ -4384,7 +4385,7 @@ mod tests {
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -4401,7 +4402,7 @@ mod tests {
            .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -4418,7 +4419,7 @@ mod tests {
            .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -4435,7 +4436,7 @@ mod tests {
            .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -4492,7 +4493,7 @@ mod tests {
        for _ in 0..repeat {
            for _ in 0..key_count {
                test_key.field6 = blknum;
-                let mut writer = timeline.writer().await;
+                let writer = timeline.writer().await;
                writer
                    .put(
                        test_key,
@@ -4663,7 +4664,7 @@ mod tests {
        for blknum in 0..NUM_KEYS {
            lsn = Lsn(lsn.0 + 0x10);
            test_key.field6 = blknum as u32;
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
            writer
                .put(
                    test_key,
@@ -4684,7 +4685,7 @@ mod tests {
                lsn = Lsn(lsn.0 + 0x10);
                let blknum = thread_rng().gen_range(0..NUM_KEYS);
                test_key.field6 = blknum as u32;
-                let mut writer = tline.writer().await;
+                let writer = tline.writer().await;
                writer
                    .put(
                        test_key,
@@ -4752,7 +4753,7 @@ mod tests {
        for blknum in 0..NUM_KEYS {
            lsn = Lsn(lsn.0 + 0x10);
            test_key.field6 = blknum as u32;
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
            writer
                .put(
                    test_key,
@@ -4781,7 +4782,7 @@ mod tests {
                lsn = Lsn(lsn.0 + 0x10);
                let blknum = thread_rng().gen_range(0..NUM_KEYS);
                test_key.field6 = blknum as u32;
-                let mut writer = tline.writer().await;
+                let writer = tline.writer().await;
                writer
                    .put(
                        test_key,
@@ -4858,7 +4859,7 @@ mod tests {
                lsn = Lsn(lsn.0 + 0x10);
                let blknum = thread_rng().gen_range(0..NUM_KEYS);
                test_key.field6 = blknum as u32;
-                let mut writer = tline.writer().await;
+                let writer = tline.writer().await;
                writer
                    .put(
                        test_key,
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -5,7 +5,7 @@
 use super::ephemeral_file::EphemeralFile;
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::context::RequestContext;
-use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
+use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
 use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
 use std::ops::Deref;
@@ -78,7 +78,7 @@ impl<'a> Deref for BlockLease<'a> {
 ///
 /// Unlike traits, we also support the read function to be async though.
 pub(crate) enum BlockReaderRef<'a> {
-    FileBlockReader(&'a FileBlockReader),
+    FileBlockReader(&'a FileBlockReader<'a>),
    EphemeralFile(&'a EphemeralFile),
    Adapter(Adapter<&'a DeltaLayerInner>),
    #[cfg(test)]
@@ -160,17 +160,15 @@ impl<'a> BlockCursor<'a> {
 ///
 /// The file is assumed to be immutable. This doesn't provide any functions
 /// for modifying the file, nor for invalidating the cache if it is modified.
-pub struct FileBlockReader {
-    pub file: VirtualFile,
+pub struct FileBlockReader<'a> {
+    pub file: &'a VirtualFile,

    /// Unique ID of this file, used as key in the page cache.
    file_id: page_cache::FileId,
 }

-impl FileBlockReader {
-    pub fn new(file: VirtualFile) -> Self {
-        let file_id = page_cache::next_file_id();
-
+impl<'a> FileBlockReader<'a> {
+    pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
        FileBlockReader { file_id, file }
    }

@@ -190,11 +188,11 @@ impl FileBlockReader {
    /// Returns a "lease" object that can be used to
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
-    pub async fn read_blk(
+    pub async fn read_blk<'b>(
        &self,
        blknum: u32,
        ctx: &RequestContext,
-    ) -> Result<BlockLease, std::io::Error> {
+    ) -> Result<BlockLease<'b>, std::io::Error> {
        let cache = page_cache::get();
        match cache
            .read_immutable_buf(self.file_id, blknum, ctx)
@@ -215,7 +213,7 @@ impl FileBlockReader {
    }
 }

-impl BlockReader for FileBlockReader {
+impl BlockReader for FileBlockReader<'_> {
    fn block_cursor(&self) -> BlockCursor<'_> {
        BlockCursor::new(BlockReaderRef::FileBlockReader(self))
    }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -9,6 +9,7 @@
 //! may lead to a data loss.
 //!
 use anyhow::bail;
+use pageserver_api::models::CompactionAlgorithm;
 use pageserver_api::models::EvictionPolicy;
 use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
@@ -20,6 +21,7 @@ use std::time::Duration;
 use utils::generation::Generation;

 pub mod defaults {
+
    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
    // would be more appropriate. But a low value forces the code to be exercised more,
    // which is good for now to trigger bugs.
@@ -27,12 +29,17 @@ pub mod defaults {
    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
    pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";

+    // FIXME: the configs below are only used by the legacy compaction algorithm. The new
+    // algorithm has different parameters.
+
    // Target file size, when creating image and delta layers.
    // This parameter determines L1 layer file size.
    pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;

    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
+    pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
+        super::CompactionAlgorithm::Legacy;

    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;

@@ -305,6 +312,7 @@ pub struct TenantConf {
    pub compaction_period: Duration,
    // Level0 delta layer threshold for compaction.
    pub compaction_threshold: usize,
+    pub compaction_algorithm: CompactionAlgorithm,
    // Determines how much history is retained, to allow
    // branching and read replicas at an older point in time.
    // The unit is #of bytes of WAL.
@@ -377,6 +385,10 @@ pub struct TenantConfOpt {
    #[serde(default)]
    pub compaction_threshold: Option<usize>,

+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub compaction_algorithm: Option<CompactionAlgorithm>,
+
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub gc_horizon: Option<u64>,
@@ -457,6 +469,9 @@ impl TenantConfOpt {
            compaction_threshold: self
                .compaction_threshold
                .unwrap_or(global_conf.compaction_threshold),
+            compaction_algorithm: self
+                .compaction_algorithm
+                .unwrap_or(global_conf.compaction_algorithm),
            gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
            gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
            image_creation_threshold: self
@@ -503,6 +518,7 @@ impl Default for TenantConf {
            compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
                .expect("cannot parse default compaction period"),
            compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
+            compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
            gc_horizon: DEFAULT_GC_HORIZON,
            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                .expect("cannot parse default gc period"),
@@ -580,6 +596,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
        Self {
            checkpoint_distance: value.checkpoint_distance,
            checkpoint_timeout: value.checkpoint_timeout.map(humantime),
+            compaction_algorithm: value.compaction_algorithm,
            compaction_target_size: value.compaction_target_size,
            compaction_period: value.compaction_period.map(humantime),
            compaction_threshold: value.compaction_threshold,
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -373,12 +373,9 @@ async fn upload_tenant_heatmap(
    // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
    // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
    // in remote storage.
-    let _guard = match tenant.gate.enter() {
-        Ok(g) => g,
-        Err(_) => {
-            tracing::info!("Skipping heatmap upload for tenant which is shutting down");
-            return Err(UploadHeatmapError::Cancelled);
-        }
+    let Ok(_guard) = tenant.gate.enter() else {
+        tracing::info!("Skipping heatmap upload for tenant which is shutting down");
+        return Err(UploadHeatmapError::Cancelled);
    };

    for (timeline_id, timeline) in timelines {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -209,8 +209,7 @@ impl Default for ValuesReconstructState {
 pub(crate) enum ReadableLayerDesc {
    Persistent {
        desc: PersistentLayerDesc,
-        lsn_floor: Lsn,
-        lsn_ceil: Lsn,
+        lsn_range: Range<Lsn>,
    },
    InMemory {
        handle: InMemoryLayerHandle,
@@ -309,14 +308,14 @@ impl Eq for ReadableLayerDescOrdered {}
 impl ReadableLayerDesc {
    pub(crate) fn get_lsn_floor(&self) -> Lsn {
        match self {
-            ReadableLayerDesc::Persistent { lsn_floor, .. } => *lsn_floor,
+            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
            ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
        }
    }

    pub(crate) fn get_lsn_ceil(&self) -> Lsn {
        match self {
-            ReadableLayerDesc::Persistent { lsn_ceil, .. } => *lsn_ceil,
+            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
            ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
        }
    }
@@ -329,10 +328,15 @@ impl ReadableLayerDesc {
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
        match self {
-            ReadableLayerDesc::Persistent { desc, lsn_ceil, .. } => {
+            ReadableLayerDesc::Persistent { desc, lsn_range } => {
                let layer = layer_manager.get_from_desc(desc);
                layer
-                    .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(
+                        keyspace,
+                        lsn_range.clone(),
+                        reconstruct_state,
+                        ctx,
+                    )
                    .await
            }
            ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -29,25 +29,28 @@
 //!
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
-use crate::page_cache::PAGE_SZ;
+use crate::page_cache::{self, FileId, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::vectored_blob_io::{
+    BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
+};
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{anyhow, bail, ensure, Context, Result};
+use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::collections::BTreeMap;
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
@@ -63,8 +66,7 @@ use utils::{
 };

 use super::{
-    AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValueReconstructSituation,
-    ValuesReconstructState,
+    AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
 };

 ///
@@ -214,8 +216,10 @@ pub struct DeltaLayerInner {
    index_start_blk: u32,
    index_root_blk: u32,

-    /// Reader object for reading blocks from the file.
-    file: FileBlockReader,
+    file: VirtualFile,
+    file_id: FileId,
+
+    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }

 impl std::fmt::Debug for DeltaLayerInner {
@@ -297,7 +301,7 @@ impl DeltaLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let loaded = DeltaLayerInner::load(&path, None, ctx)
+        let loaded = DeltaLayerInner::load(&path, None, None, ctx)
            .await
            .and_then(|res| res)?;

@@ -665,16 +669,16 @@ impl DeltaLayer {
    where
        F: Fn(Summary) -> Summary,
    {
-        let file = VirtualFile::open_with_options(
+        let mut file = VirtualFile::open_with_options(
            path,
            virtual_file::OpenOptions::new().read(true).write(true),
        )
        .await
        .with_context(|| format!("Failed to open file '{}'", path))?;
-        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0, ctx).await?;
+        let file_id = page_cache::next_file_id();
+        let block_reader = FileBlockReader::new(&file, file_id);
+        let summary_blk = block_reader.read_blk(0, ctx).await?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
-        let mut file = file.file;
        if actual_summary.magic != DELTA_FILE_MAGIC {
            return Err(RewriteSummaryError::MagicMismatch);
        }
@@ -698,15 +702,18 @@ impl DeltaLayerInner {
    pub(super) async fn load(
        path: &Utf8Path,
        summary: Option<Summary>,
+        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
        let file = match VirtualFile::open(path).await {
            Ok(file) => file,
            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
        };
-        let file = FileBlockReader::new(file);
+        let file_id = page_cache::next_file_id();

-        let summary_blk = match file.read_blk(0, ctx).await {
+        let block_reader = FileBlockReader::new(&file, file_id);
+
+        let summary_blk = match block_reader.read_blk(0, ctx).await {
            Ok(blk) => blk,
            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
        };
@@ -730,8 +737,10 @@ impl DeltaLayerInner {

        Ok(Ok(DeltaLayerInner {
            file,
+            file_id,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
+            max_vectored_read_bytes,
        }))
    }

@@ -744,11 +753,11 @@ impl DeltaLayerInner {
    ) -> anyhow::Result<ValueReconstructResult> {
        let mut need_image = true;
        // Scan the page versions backwards, starting from `lsn`.
-        let file = &self.file;
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            self.index_start_blk,
            self.index_root_blk,
-            file,
+            &block_reader,
        );
        let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));

@@ -782,19 +791,19 @@ impl DeltaLayerInner {
            .build();

        // Ok, 'offsets' now contains the offsets of all the entries we need to read
-        let cursor = file.block_cursor();
+        let cursor = block_reader.block_cursor();
        let mut buf = Vec::new();
        for (entry_lsn, pos) in offsets {
            cursor
                .read_blob_into_buf(pos, &mut buf, ctx)
                .await
                .with_context(|| {
-                    format!("Failed to read blob from virtual file {}", file.file.path)
+                    format!("Failed to read blob from virtual file {}", self.file.path)
                })?;
            let val = Value::des(&buf).with_context(|| {
                format!(
                    "Failed to deserialize file blob from virtual file {}",
-                    file.file.path
+                    self.file.path
                )
            })?;
            match val {
@@ -834,133 +843,181 @@ impl DeltaLayerInner {
    pub(super) async fn get_values_reconstruct_data(
        &self,
        keyspace: KeySpace,
-        end_lsn: Lsn,
+        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
-        let file = &self.file;
+        let reads = self
+            .plan_reads(keyspace, lsn_range, reconstruct_state, ctx)
+            .await
+            .map_err(GetVectoredError::Other)?;
+
+        self.do_reads_and_update_state(reads, reconstruct_state)
+            .await;
+
+        Ok(())
+    }
+
+    async fn plan_reads(
+        &self,
+        keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<VectoredRead>> {
+        let mut planner = VectoredReadPlanner::new(
+            self.max_vectored_read_bytes
+                .expect("Layer is loaded with max vectored bytes config")
+                .0
+                .into(),
+        );
+
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            self.index_start_blk,
            self.index_root_blk,
-            file,
+            block_reader,
        );

-        let mut offsets: BTreeMap<Key, Vec<(Lsn, u64)>> = BTreeMap::new();
-
        for range in keyspace.ranges.iter() {
-            let mut ignore_key = None;
+            let mut range_end_handled = false;

-            // Scan the page versions backwards, starting from the last key in the range.
-            // to collect all the offsets at which need to be read.
-            let end_key = DeltaKey::from_key_lsn(&range.end, Lsn(end_lsn.0 - 1));
+            let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start);
            tree_reader
                .visit(
-                    &end_key.0,
-                    VisitDirection::Backwards,
+                    &start_key.0,
+                    VisitDirection::Forwards,
                    |raw_key, value| {
                        let key = Key::from_slice(&raw_key[..KEY_SIZE]);
-                        let entry_lsn = DeltaKey::extract_lsn_from_buf(raw_key);
-
-                        if entry_lsn >= end_lsn {
-                            return true;
-                        }
-
-                        if key < range.start {
-                            return false;
-                        }
-
-                        if key >= range.end {
-                            return true;
-                        }
-
-                        if Some(key) == ignore_key {
-                            return true;
-                        }
-
-                        if let Some(cached_lsn) = reconstruct_state.get_cached_lsn(&key) {
-                            if entry_lsn <= cached_lsn {
-                                return key != range.start;
-                            }
-                        }
-
+                        let lsn = DeltaKey::extract_lsn_from_buf(raw_key);
                        let blob_ref = BlobRef(value);
-                        let lsns_at = offsets.entry(key).or_default();
-                        lsns_at.push((entry_lsn, blob_ref.pos()));

-                        if blob_ref.will_init() {
-                            if key == range.start {
-                                return false;
+                        assert!(key >= range.start && lsn >= lsn_range.start);
+
+                        let cached_lsn = reconstruct_state.get_cached_lsn(&key);
+                        let flag = {
+                            if cached_lsn >= Some(lsn) {
+                                BlobFlag::Ignore
+                            } else if blob_ref.will_init() {
+                                BlobFlag::Replaces
                            } else {
-                                ignore_key = Some(key);
-                                return true;
+                                BlobFlag::None
                            }
-                        }
+                        };

-                        true
+                        if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) {
+                            planner.handle_range_end(blob_ref.pos());
+                            range_end_handled = true;
+                            false
+                        } else {
+                            planner.handle(key, lsn, blob_ref.pos(), flag);
+                            true
+                        }
                    },
                    &RequestContextBuilder::extend(ctx)
                        .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
                        .build(),
                )
                .await
-                .map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
-        }
+                .map_err(|err| anyhow!(err))?;

-        let ctx = &RequestContextBuilder::extend(ctx)
-            .page_content_kind(PageContentKind::DeltaLayerValue)
-            .build();
-
-        let cursor = file.block_cursor();
-        let mut buf = Vec::new();
-        for (key, lsns_at) in offsets {
-            for (lsn, block_offset) in lsns_at {
-                let res = cursor.read_blob_into_buf(block_offset, &mut buf, ctx).await;
-
-                if let Err(e) = res {
-                    reconstruct_state.on_key_error(
-                        key,
-                        PageReconstructError::from(anyhow!(e).context(format!(
-                            "Failed to read blob from virtual file {}",
-                            file.file.path
-                        ))),
-                    );
-
-                    break;
-                }
-
-                let value = Value::des(&buf);
-                if let Err(e) = value {
-                    reconstruct_state.on_key_error(
-                        key,
-                        PageReconstructError::from(anyhow!(e).context(format!(
-                            "Failed to deserialize file blob from virtual file {}",
-                            file.file.path
-                        ))),
-                    );
-
-                    break;
-                }
-
-                let key_situation = reconstruct_state.update_key(&key, lsn, value.unwrap());
-                if key_situation == ValueReconstructSituation::Complete {
-                    break;
-                }
+            if !range_end_handled {
+                let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
+                tracing::info!("Handling range end fallback at {}", payload_end);
+                planner.handle_range_end(payload_end);
            }
        }

-        Ok(())
+        Ok(planner.finish())
+    }
+
+    /// Executes the planned vectored `reads` against this layer's file and feeds
+    /// every successfully deserialized value into `reconstruct_state`.
+    ///
+    /// Failures are not returned to the caller; they are reported per key through
+    /// `reconstruct_state.on_key_error`:
+    /// - if a whole read fails, every key referenced by that read is reported;
+    /// - if a single blob fails to deserialize, only that blob's key is reported.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the layer was loaded without `max_vectored_read_bytes`.
+    async fn do_reads_and_update_state(
+        &self,
+        reads: Vec<VectoredRead>,
+        reconstruct_state: &mut ValuesReconstructState,
+    ) {
+        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
+        // Key of the most recent blob that failed to deserialize. Blobs for this key
+        // are skipped until a different key fails (only one key is remembered).
+        let mut ignore_key_with_err = None;
+
+        let max_vectored_read_bytes = self
+            .max_vectored_read_bytes
+            .expect("Layer is loaded with max vectored bytes config")
+            .0
+            .into();
+        // The buffer is passed to `read_blobs` by value and stored back after each
+        // successful read, hence the `Option` wrapper and the `take()` below.
+        let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+
+        // Note that reads are processed in reverse order (from highest key+lsn).
+        // This is the order that `ReconstructState` requires such that it can
+        // track when a key is done.
+        for read in reads.into_iter().rev() {
+            let res = vectored_blob_reader
+                .read_blobs(&read, buf.take().expect("Should have a buffer"))
+                .await;
+
+            let blobs_buf = match res {
+                Ok(blobs_buf) => blobs_buf,
+                Err(err) => {
+                    // Only the error kind is carried into the per-key error message.
+                    let kind = err.kind();
+                    for (_, blob_meta) in read.blobs_at.as_slice() {
+                        reconstruct_state.on_key_error(
+                            blob_meta.key,
+                            PageReconstructError::from(anyhow!(
+                                "Failed to read blobs from virtual file {}: {}",
+                                self.file.path,
+                                kind
+                            )),
+                        );
+                    }
+
+                    // We have "lost" the buffer since the lower level IO api
+                    // doesn't return the buffer on error. Allocate a new one.
+                    buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+
+                    continue;
+                }
+            };
+
+            // Blobs inside a read are walked in reverse as well, consistent with the
+            // reverse iteration over the reads themselves.
+            for meta in blobs_buf.blobs.iter().rev() {
+                if Some(meta.meta.key) == ignore_key_with_err {
+                    continue;
+                }
+
+                let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
+                let value = match value {
+                    Ok(v) => v,
+                    Err(e) => {
+                        reconstruct_state.on_key_error(
+                            meta.meta.key,
+                            PageReconstructError::from(anyhow!(e).context(format!(
+                                "Failed to deserialize blob from virtual file {}",
+                                self.file.path,
+                            ))),
+                        );
+
+                        ignore_key_with_err = Some(meta.meta.key);
+                        continue;
+                    }
+                };
+
+                // Invariant: once a key reaches [`ValueReconstructSituation::Complete`]
+                // state, no further updates shall be made to it. The call below will
+                // panic if the invariant is violated.
+                reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value);
+            }
+
+            // Reclaim the buffer so the next read can reuse it.
+            buf = Some(blobs_buf.buf);
+        }
    }

    pub(super) async fn load_keys<'a>(
        &'a self,
        ctx: &RequestContext,
    ) -> Result<Vec<DeltaEntry<'a>>> {
-        let file = &self.file;
-
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            self.index_start_blk,
            self.index_root_blk,
-            file,
+            block_reader,
        );

        let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();
@@ -1012,11 +1069,11 @@ impl DeltaLayerInner {
            self.index_start_blk, self.index_root_blk
        );

-        let file = &self.file;
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            self.index_start_blk,
            self.index_root_blk,
-            file,
+            block_reader,
        );

        tree_reader.dump().await?;
@@ -1111,7 +1168,8 @@ impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
        blknum: u32,
        ctx: &RequestContext,
    ) -> Result<BlockLease, std::io::Error> {
-        self.0.as_ref().file.read_blk(blknum, ctx).await
+        let block_reader = FileBlockReader::new(&self.0.as_ref().file, self.0.as_ref().file_id);
+        block_reader.read_blk(blknum, ctx).await
    }
 }

@@ -1120,3 +1178,15 @@ impl AsRef<DeltaLayerInner> for DeltaLayerInner {
        self
    }
 }
+
+/// Implements the `pageserver_compaction` `CompactionDeltaEntry` interface for
+/// [`DeltaEntry`]: each method simply returns the entry's field of the same name.
+impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for DeltaEntry<'a> {
+    fn key(&self) -> Key {
+        self.key
+    }
+    fn lsn(&self) -> Lsn {
+        self.lsn
+    }
+    fn size(&self) -> u64 {
+        self.size
+    }
+}
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -25,7 +25,7 @@
 //! actual page images are stored in the "values" part.
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
-use crate::page_cache::PAGE_SZ;
+use crate::page_cache::{self, FileId, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
@@ -34,11 +34,14 @@ use crate::tenant::storage_layer::{
    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
 use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::vectored_blob_io::{
+    BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
+};
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
-use bytes::Bytes;
+use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use pageserver_api::keyspace::KeySpace;
@@ -152,8 +155,10 @@ pub struct ImageLayerInner {

    lsn: Lsn,

-    /// Reader object for reading blocks from the file.
-    file: FileBlockReader,
+    file: VirtualFile,
+    file_id: FileId,
+
+    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }

 impl std::fmt::Debug for ImageLayerInner {
@@ -167,9 +172,12 @@ impl std::fmt::Debug for ImageLayerInner {

 impl ImageLayerInner {
    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
-        let file = &self.file;
-        let tree_reader =
-            DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            block_reader,
+        );

        tree_reader.dump().await?;

@@ -252,7 +260,7 @@ impl ImageLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

-        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
            .await
            .and_then(|res| res)?;

@@ -327,16 +335,16 @@ impl ImageLayer {
    where
        F: Fn(Summary) -> Summary,
    {
-        let file = VirtualFile::open_with_options(
+        let mut file = VirtualFile::open_with_options(
            path,
            virtual_file::OpenOptions::new().read(true).write(true),
        )
        .await
        .with_context(|| format!("Failed to open file '{}'", path))?;
-        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0, ctx).await?;
+        let file_id = page_cache::next_file_id();
+        let block_reader = FileBlockReader::new(&file, file_id);
+        let summary_blk = block_reader.read_blk(0, ctx).await?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
-        let mut file = file.file;
        if actual_summary.magic != IMAGE_FILE_MAGIC {
            return Err(RewriteSummaryError::MagicMismatch);
        }
@@ -361,14 +369,16 @@ impl ImageLayerInner {
        path: &Utf8Path,
        lsn: Lsn,
        summary: Option<Summary>,
+        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
        let file = match VirtualFile::open(path).await {
            Ok(file) => file,
            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
        };
-        let file = FileBlockReader::new(file);
-        let summary_blk = match file.read_blk(0, ctx).await {
+        let file_id = page_cache::next_file_id();
+        let block_reader = FileBlockReader::new(&file, file_id);
+        let summary_blk = match block_reader.read_blk(0, ctx).await {
            Ok(blk) => blk,
            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
        };
@@ -399,6 +409,8 @@ impl ImageLayerInner {
            index_root_blk: actual_summary.index_root_blk,
            lsn,
            file,
+            file_id,
+            max_vectored_read_bytes,
        }))
    }

@@ -408,8 +420,9 @@ impl ImageLayerInner {
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
        key.write_to_byte_slice(&mut keybuf);
@@ -422,7 +435,7 @@ impl ImageLayerInner {
            )
            .await?
        {
-            let blob = file
+            let blob = block_reader
                .block_cursor()
                .read_blob(
                    offset,
@@ -449,12 +462,36 @@ impl ImageLayerInner {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
+        let reads = self
+            .plan_reads(keyspace, ctx)
+            .await
+            .map_err(GetVectoredError::Other)?;

-        let mut offsets = Vec::new();
+        self.do_reads_and_update_state(reads, reconstruct_state)
+            .await;
+
+        Ok(())
+    }
+
+    async fn plan_reads(
+        &self,
+        keyspace: KeySpace,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<VectoredRead>> {
+        let mut planner = VectoredReadPlanner::new(
+            self.max_vectored_read_bytes
+                .expect("Layer is loaded with max vectored bytes config")
+                .0
+                .into(),
+        );
+
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);

        for range in keyspace.ranges.iter() {
+            let mut range_end_handled = false;
+
            let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
            range.start.write_to_byte_slice(&mut search_key);

@@ -462,17 +499,18 @@ impl ImageLayerInner {
                .visit(
                    &search_key,
                    VisitDirection::Forwards,
-                    |raw_key, value| {
+                    |raw_key, offset| {
                        let key = Key::from_slice(&raw_key[..KEY_SIZE]);
                        assert!(key >= range.start);

-                        if !range.contains(&key) {
-                            return false;
+                        if key >= range.end {
+                            planner.handle_range_end(offset);
+                            range_end_handled = true;
+                            false
+                        } else {
+                            planner.handle(key, self.lsn, offset, BlobFlag::None);
+                            true
                        }
-
-                        offsets.push((key, value));
-
-                        true
                    },
                    &RequestContextBuilder::extend(ctx)
                        .page_content_kind(PageContentKind::ImageLayerBtreeNode)
@@ -480,33 +518,60 @@ impl ImageLayerInner {
                )
                .await
                .map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
-        }

-        let ctx = &RequestContextBuilder::extend(ctx)
-            .page_content_kind(PageContentKind::ImageLayerValue)
-            .build();
-
-        let cursor = file.block_cursor();
-        let mut buf = Vec::new();
-        for (key, offset) in offsets {
-            let res = cursor.read_blob_into_buf(offset, &mut buf, ctx).await;
-            if let Err(e) = res {
-                reconstruct_state.on_key_error(
-                    key,
-                    PageReconstructError::from(anyhow!(e).context(format!(
-                        "Failed to read blob from virtual file {}",
-                        file.file.path
-                    ))),
-                );
-
-                continue;
+            if !range_end_handled {
+                let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
+                planner.handle_range_end(payload_end);
            }
-
-            let blob = Bytes::copy_from_slice(buf.as_slice());
-            reconstruct_state.update_key(&key, self.lsn, Value::Image(blob));
        }

-        Ok(())
+        Ok(planner.finish())
+    }
+
+    /// Executes the planned vectored `reads` against this layer's file and records
+    /// each blob in `reconstruct_state` as a `Value::Image` at this layer's LSN.
+    ///
+    /// A failed read is not returned to the caller; instead every key referenced
+    /// by that read is reported through `reconstruct_state.on_key_error`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the layer was loaded without `max_vectored_read_bytes`.
+    async fn do_reads_and_update_state(
+        &self,
+        reads: Vec<VectoredRead>,
+        reconstruct_state: &mut ValuesReconstructState,
+    ) {
+        let max_vectored_read_bytes = self
+            .max_vectored_read_bytes
+            .expect("Layer is loaded with max vectored bytes config")
+            .0
+            .into();
+
+        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
+        for read in reads.into_iter() {
+            // A new buffer is allocated for every read; the buffer returned by a
+            // successful read is frozen and sliced into the image values below.
+            let buf = BytesMut::with_capacity(max_vectored_read_bytes);
+            let res = vectored_blob_reader.read_blobs(&read, buf).await;
+
+            match res {
+                Ok(blobs_buf) => {
+                    let frozen_buf = blobs_buf.buf.freeze();
+
+                    for meta in blobs_buf.blobs.iter() {
+                        // `meta.start..meta.end` is the blob's byte range in the buffer.
+                        let img_buf = frozen_buf.slice(meta.start..meta.end);
+                        reconstruct_state.update_key(
+                            &meta.meta.key,
+                            self.lsn,
+                            Value::Image(img_buf),
+                        );
+                    }
+                }
+                Err(err) => {
+                    // Only the error kind is carried into the per-key error message.
+                    let kind = err.kind();
+                    for (_, blob_meta) in read.blobs_at.as_slice() {
+                        reconstruct_state.on_key_error(
+                            blob_meta.key,
+                            PageReconstructError::from(anyhow!(
+                                "Failed to read blobs from virtual file {}: {}",
+                                self.file.path,
+                                kind
+                            )),
+                        );
+                    }
+                }
+            };
+        }
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -336,17 +336,32 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-
    pub(crate) async fn put_value(
        &self,
        key: Key,
        lsn: Lsn,
-        buf: &[u8],
+        val: &Value,
        ctx: &RequestContext,
    ) -> Result<()> {
        let mut inner = self.inner.write().await;
        self.assert_writable();
-        self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
+        self.put_value_locked(&mut inner, key, lsn, val, ctx).await
+    }
+
+    /// Writes a batch of values into the layer while holding the write lock on
+    /// `inner` for the whole batch.
+    ///
+    /// For each key, its `(lsn, value)` pairs are written in the order they appear
+    /// in the vector; keys are visited in the `HashMap`'s iteration order.
+    ///
+    /// Returns the first error produced by `put_value_locked`. Values written
+    /// before that error are not undone by this function.
+    pub(crate) async fn put_values(
+        &self,
+        values: &HashMap<Key, Vec<(Lsn, Value)>>,
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        let mut inner = self.inner.write().await;
+        self.assert_writable();
+        for (key, vals) in values {
+            for (lsn, val) in vals {
+                self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
+                    .await?;
+            }
+        }
+        Ok(())
    }

    async fn put_value_locked(
@@ -354,16 +369,22 @@ impl InMemoryLayer {
        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
        key: Key,
        lsn: Lsn,
-        buf: &[u8],
+        val: &Value,
        ctx: &RequestContext,
    ) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);

        let off = {
+            // Avoid doing allocations for "small" values.
+            // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
+            // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
+            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
+            buf.clear();
+            val.ser_into(&mut buf)?;
            locked_inner
                .file
                .write_blob(
-                    buf,
+                    &buf,
                    &RequestContextBuilder::extend(ctx)
                        .page_content_kind(PageContentKind::InMemoryLayer)
                        .build(),
@@ -391,12 +412,7 @@ impl InMemoryLayer {
    pub async fn freeze(&self, end_lsn: Lsn) {
        let inner = self.inner.write().await;

-        assert!(
-            self.start_lsn < end_lsn,
-            "{} >= {}",
-            self.start_lsn,
-            end_lsn
-        );
+        assert!(self.start_lsn < end_lsn);
        self.end_lsn.set(end_lsn).expect("end_lsn set only once");

        for vec_map in inner.index.values() {
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -29,6 +29,9 @@ use super::{

 use utils::generation::Generation;

+#[cfg(test)]
+mod tests;
+
 /// A Layer contains all data in a "rectangle" consisting of a range of keys and
 /// range of LSNs.
 ///
@@ -267,7 +270,7 @@ impl Layer {
    pub(crate) async fn get_values_reconstruct_data(
        &self,
        keyspace: KeySpace,
-        end_lsn: Lsn,
+        lsn_range: Range<Lsn>,
        reconstruct_data: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
@@ -282,7 +285,7 @@ impl Layer {
            .record_access(LayerAccessKind::GetValueReconstructData, ctx);

        layer
-            .get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, &self.0, ctx)
+            .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
            .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
            .await
    }
@@ -1049,16 +1052,10 @@ impl LayerInner {

    /// `DownloadedLayer` is being dropped, so it calls this method.
    fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
-        let delete = self.wanted_deleted.load(Ordering::Acquire);
        let evict = self.wanted_evicted.load(Ordering::Acquire);
        let can_evict = self.have_remote_client;

-        if delete {
-            // do nothing now, only in LayerInner::drop -- this was originally implemented because
-            // we could had already scheduled the deletion at the time.
-            //
-            // FIXME: this is not true anymore, we can safely evict wanted deleted files.
-        } else if can_evict && evict {
+        if can_evict && evict {
            let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);

            // downgrade for queueing, in case there's a tear down already ongoing we should not
@@ -1299,9 +1296,14 @@ impl DownloadedLayer {
                    owner.desc.key_range.clone(),
                    owner.desc.lsn_range.clone(),
                ));
-                delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
-                    .await
-                    .map(|res| res.map(LayerKind::Delta))
+                delta_layer::DeltaLayerInner::load(
+                    &owner.path,
+                    summary,
+                    Some(owner.conf.max_vectored_read_bytes),
+                    ctx,
+                )
+                .await
+                .map(|res| res.map(LayerKind::Delta))
            } else {
                let lsn = owner.desc.image_layer_lsn();
                let summary = Some(image_layer::Summary::expected(
@@ -1310,9 +1312,15 @@ impl DownloadedLayer {
                    owner.desc.key_range.clone(),
                    lsn,
                ));
-                image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
-                    .await
-                    .map(|res| res.map(LayerKind::Image))
+                image_layer::ImageLayerInner::load(
+                    &owner.path,
+                    lsn,
+                    summary,
+                    Some(owner.conf.max_vectored_read_bytes),
+                    ctx,
+                )
+                .await
+                .map(|res| res.map(LayerKind::Image))
            };

            match res {
@@ -1365,7 +1373,7 @@ impl DownloadedLayer {
    async fn get_values_reconstruct_data(
        &self,
        keyspace: KeySpace,
-        end_lsn: Lsn,
+        lsn_range: Range<Lsn>,
        reconstruct_data: &mut ValuesReconstructState,
        owner: &Arc<LayerInner>,
        ctx: &RequestContext,
@@ -1374,7 +1382,7 @@ impl DownloadedLayer {

        match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
            Delta(d) => {
-                d.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, ctx)
+                d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
                    .await
            }
            Image(i) => {
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -0,0 +1,263 @@
+use futures::StreamExt;
+use tokio::task::JoinSet;
+use utils::{
+    completion::{self, Completion},
+    id::TimelineId,
+};
+
+use super::*;
+use crate::task_mgr::BACKGROUND_RUNTIME;
+use crate::tenant::harness::TenantHarness;
+
+/// This test demonstrates a previous hang when an eviction and deletion were requested at the same
+/// time. Now both of them complete per Arc drop semantics.
+#[tokio::test(start_paused = true)]
+async fn evict_and_wait_on_wanted_deleted() {
+    // this is the runtime on which Layer spawns the blocking tasks on
+    let handle = BACKGROUND_RUNTIME.handle();
+
+    let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
+    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
+    let (tenant, ctx) = h.load().await;
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    let layer = {
+        let mut layers = {
+            let layers = timeline.layers.read().await;
+            layers.resident_layers().collect::<Vec<_>>().await
+        };
+
+        assert_eq!(layers.len(), 1);
+
+        layers.swap_remove(0)
+    };
+
+    // setup done
+
+    let resident = layer.keep_resident().await.unwrap();
+
+    {
+        let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
+
+        // drive the future to await on the status channel
+        tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
+            .await
+            .expect_err("should had been a timeout since we are holding the layer resident");
+
+        layer.delete_on_drop();
+
+        drop(resident);
+
+        // make sure the eviction task gets to run
+        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
+
+        let resident = layer.keep_resident().await;
+        assert!(
+            matches!(resident, Ok(None)),
+            "keep_resident should not have re-initialized: {resident:?}"
+        );
+
+        evict_and_wait
+            .await
+            .expect("evict_and_wait should had succeeded");
+
+        // works as intended
+    }
+
+    // assert that once we remove the `layer` from the layer map and drop our reference,
+    // the deletion of the layer in remote_storage happens.
+    {
+        let mut layers = timeline.layers.write().await;
+        layers.finish_gc_timeline(&[layer]);
+    }
+
+    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
+
+    assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get());
+    assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get());
+    assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+    assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+}
+
+/// This test ensures that we are able to read the layer while the layer eviction has been
+/// started but not completed due to spawn_blocking pool being blocked.
+///
+/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download.
+#[tokio::test(start_paused = true)]
+async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
+    // this is the runtime on which Layer spawns the blocking tasks on
+    let handle = BACKGROUND_RUNTIME.handle();
+    let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking")
+        .unwrap();
+    let (tenant, ctx) = h.load().await;
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    let layer = {
+        let mut layers = {
+            let layers = timeline.layers.read().await;
+            layers.resident_layers().collect::<Vec<_>>().await
+        };
+
+        assert_eq!(layers.len(), 1);
+
+        layers.swap_remove(0)
+    };
+
+    // setup done
+
+    let resident = layer.keep_resident().await.unwrap();
+
+    let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
+
+    // drive the future to await on the status channel
+    tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
+        .await
+        .expect_err("should had been a timeout since we are holding the layer resident");
+    assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+
+    // clog up BACKGROUND_RUNTIME spawn_blocking
+    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
+
+    // now the eviction cannot proceed because the threads are consumed while completion exists
+    drop(resident);
+
+    // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
+    layer
+        .keep_resident()
+        .await
+        .expect("keep_resident should had reinitialized without downloading")
+        .expect("ResidentLayer");
+
+    // keep_resident re-initialized the layer, so the first eviction is expected to resolve
+    // promptly (no timeout) with an error instead of completing
+    let e = tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
+        .await
+        .expect("no timeout, because keep_resident re-initialized")
+        .expect_err("eviction should not have succeeded because re-initialized");
+
+    // works as intended: evictions lose to "downloads"
+    assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
+    assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+
+    // this is not wrong: the eviction is technically still "on the way" as it's still queued
+    // because spawn_blocking is clogged up
+    assert_eq!(
+        0,
+        LAYER_IMPL_METRICS
+            .cancelled_evictions
+            .values()
+            .map(|ctr| ctr.get())
+            .sum::<u64>()
+    );
+
+    let mut second_eviction = std::pin::pin!(layer.evict_and_wait());
+
+    tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
+        .await
+        .expect_err("timeout because spawn_blocking is clogged");
+
+    // in this case we don't leak started evictions, but I think there is still a chance of that
+    // happening, because we could have upgrades race multiple evictions while only one of them
+    // happens?
+    assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
+
+    helper.release().await;
+
+    tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
+        .await
+        .expect("eviction goes through now that spawn_blocking is unclogged")
+        .expect("eviction should succeed, because version matches");
+
+    assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+
+    // now we finally can observe the original spawn_blocking failing
+    // it would have been possible to observe it earlier, but here it is guaranteed to have
+    // happened.
+    assert_eq!(
+        1,
+        LAYER_IMPL_METRICS
+            .cancelled_evictions
+            .values()
+            .map(|ctr| ctr.get())
+            .sum::<u64>()
+    );
+}
+
+struct SpawnBlockingPoolHelper {
+    awaited_by_spawn_blocking_tasks: Completion, // dropped in `release` before joining the tasks
+    blocking_tasks: JoinSet<()>, // blocking tasks spawned on the target runtime, joined in `release`
+}
+
+impl SpawnBlockingPoolHelper {
+    /// All `crate::task_mgr::BACKGROUND_RUNTIME` spawn_blocking threads will be consumed until
+    /// release is called.
+    ///
+    /// In the tests this can be used to ensure something cannot be started on the target runtime's
+    /// spawn_blocking pool.
+    ///
+    /// This should be no issue nowadays, because nextest runs each test in its own process.
+    async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self {
+        let (completion, barrier) = completion::channel();
+        let (tx, mut rx) = tokio::sync::mpsc::channel(8);
+
+        let assumed_max_blocking_threads = 512; // tokio's default max_blocking_threads; assumed unchanged here
+
+        let mut blocking_tasks = JoinSet::new();
+
+        for _ in 0..assumed_max_blocking_threads {
+            let barrier = barrier.clone();
+            let tx = tx.clone();
+            blocking_tasks.spawn_blocking_on(
+                move || {
+                    tx.blocking_send(()).unwrap();
+                    drop(tx);
+                    tokio::runtime::Handle::current().block_on(barrier.wait());
+                },
+                handle,
+            );
+        }
+
+        drop(barrier);
+
+        for _ in 0..assumed_max_blocking_threads {
+            rx.recv().await.unwrap();
+        }
+
+        SpawnBlockingPoolHelper {
+            awaited_by_spawn_blocking_tasks: completion,
+            blocking_tasks,
+        }
+    }
+
+    /// Release all previously blocked spawn_blocking threads
+    async fn release(self) {
+        let SpawnBlockingPoolHelper {
+            awaited_by_spawn_blocking_tasks,
+            mut blocking_tasks,
+        } = self;
+
+        drop(awaited_by_spawn_blocking_tasks);
+
+        while let Some(res) = blocking_tasks.join_next().await {
+            res.expect("none of the tasks should had panicked");
+        }
+    }
+
+    /// In the tests it is used as an easy way of making sure something scheduled on the target
+    /// runtime's `spawn_blocking` has completed, because it must've been scheduled and completed
+    /// before our tasks have a chance to schedule and complete.
+    async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) {
+        Self::consume_all_spawn_blocking_threads(handle)
+            .await
+            .release()
+            .await
+    }
+}
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,3 +1,4 @@
+mod compaction;
 pub mod delete;
 mod eviction_task;
 mod init;
@@ -18,23 +19,14 @@ use once_cell::sync::Lazy;
 use pageserver_api::{
    keyspace::KeySpaceAccum,
    models::{
-        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
-        LayerMapInfo, TimelineState,
+        CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
+        EvictionPolicy, LayerMapInfo, TimelineState,
    },
    reltag::BlockNumber,
    shard::{ShardIdentity, TenantShardId},
 };
 use rand::Rng;
 use serde_with::serde_as;
-use storage_broker::BrokerClientChannel;
-use tokio::{
-    runtime::Handle,
-    sync::{oneshot, watch},
-};
-use tokio_util::sync::CancellationToken;
-use tracing::*;
-use utils::{bin_ser::BeSer, sync::gate::Gate};
-
 use std::ops::{Deref, Range};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
@@ -49,8 +41,16 @@ use std::{
    cmp::{max, min, Ordering},
    ops::ControlFlow,
 };
+use storage_broker::BrokerClientChannel;
+use tokio::{
+    runtime::Handle,
+    sync::{oneshot, watch},
+};
+use tokio_util::sync::CancellationToken;
+use tracing::*;
+use utils::sync::gate::{Gate, GateGuard};

-use crate::pgdatadir_mapping::DirectoryKind;
+use crate::pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind};
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    layer_map::{LayerMap, SearchResult},
@@ -60,6 +60,7 @@ use crate::tenant::{
 use crate::{
    context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder},
    disk_usage_eviction_task::DiskUsageEvictionInfo,
+    pgdatadir_mapping::CollectKeySpaceError,
 };
 use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};
 use crate::{
@@ -169,6 +170,11 @@ pub struct TimelineResources {
    >,
 }

+pub(crate) struct AuxFilesState {
+    pub(crate) dir: Option<AuxFilesDirectory>, // `None` when the Timeline is constructed
+    pub(crate) n_deltas: usize, // starts at 0; NOTE(review): meaning not visible here, confirm
+}
+
 pub struct Timeline {
    conf: &'static PageServerConf,
    tenant_conf: Arc<RwLock<AttachedTenantConf>>,
@@ -263,7 +269,7 @@ pub struct Timeline {
    /// Locked automatically by [`TimelineWriter`] and checkpointer.
    /// Must always be acquired before the layer map/individual layer lock
    /// to avoid deadlock.
-    write_lock: tokio::sync::Mutex<Option<TimelineWriterState>>,
+    write_lock: tokio::sync::Mutex<()>,

    /// Used to avoid multiple `flush_loop` tasks running
    pub(super) flush_loop_state: Mutex<FlushLoopState>,
@@ -352,6 +358,9 @@ pub struct Timeline {
    timeline_get_throttle: Arc<
        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
    >,
+
+    /// Keep aux directory cache to avoid its reconstruction on each update
+    pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
 }

 pub struct WalReceiverInfo {
@@ -503,6 +512,7 @@ pub enum GetLogicalSizePriority {
 #[derive(enumset::EnumSetType)]
 pub(crate) enum CompactFlags {
    ForceRepartition,
+    ForceImageLayerCreation,
 }

 impl std::fmt::Debug for Timeline {
@@ -767,8 +777,10 @@ impl Timeline {
            GetVectoredImpl::Vectored => {
                let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await;

-                self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
-                    .await;
+                if self.conf.validate_vectored_get {
+                    self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
+                        .await;
+                }

                vectored_res
            }
@@ -1089,6 +1101,19 @@ impl Timeline {
            return Ok(());
        }

+        match self.get_compaction_algorithm() {
+            CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
+            CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
+        }
+    }
+
+    /// TODO: cancellation
+    async fn compact_legacy(
+        self: &Arc<Self>,
+        _cancel: &CancellationToken,
+        flags: EnumSet<CompactFlags>,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
        // High level strategy for compaction / image creation:
        //
        // 1. First, calculate the desired "partitioning" of the
@@ -1157,7 +1182,12 @@ impl Timeline {
                // 3. Create new image layers for partitions that have been modified
                // "enough".
                let layers = self
-                    .create_image_layers(&partitioning, lsn, false, &image_ctx)
+                    .create_image_layers(
+                        &partitioning,
+                        lsn,
+                        flags.contains(CompactFlags::ForceImageLayerCreation),
+                        &image_ctx,
+                    )
                    .await
                    .map_err(anyhow::Error::from)?;
                if let Some(remote_client) = &self.remote_client {
@@ -1193,10 +1223,58 @@ impl Timeline {
    pub(crate) async fn writer(&self) -> TimelineWriter<'_> {
        TimelineWriter {
            tl: self,
-            write_guard: self.write_lock.lock().await,
+            _write_guard: self.write_lock.lock().await,
        }
    }

+    /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
+    /// the in-memory layer, and initiate flushing it if so.
+    ///
+    /// Also flush after a period of time without new data -- it helps
+    /// safekeepers to regard pageserver as caught up and suspend activity.
+    pub(crate) async fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
+        let last_lsn = self.get_last_record_lsn();
+        let open_layer_size = {
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map();
+            let Some(open_layer) = layers.open_layer.as_ref() else {
+                return Ok(());
+            };
+            open_layer.size().await?
+        };
+        let last_freeze_at = self.last_freeze_at.load();
+        let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
+        let distance = last_lsn.widening_sub(last_freeze_at);
+        // Rolling the open layer can be triggered by:
+        // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
+        //    the safekeepers need to store.  For sharded tenants, we multiply by shard count to
+        //    account for how writes are distributed across shards: we expect each node to consume
+        //    1/count of the LSN on average.
+        // 2. The size of the currently open layer.
+        // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
+        //    up and suspend activity.
+        if (distance
+            >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128)
+            || open_layer_size > self.get_checkpoint_distance()
+            || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
+        {
+            info!(
+                "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
+                distance,
+                open_layer_size,
+                last_freeze_ts.elapsed()
+            );
+            // NOTE(review): `true` makes freeze_inmem_layer skip write_lock; confirm callers hold it
+            self.freeze_inmem_layer(true).await;
+            self.last_freeze_at.store(last_lsn);
+            *(self.last_freeze_ts.write().unwrap()) = Instant::now();
+
+            // Wake up the layer flusher
+            self.flush_frozen_layers();
+        }
+        Ok(())
+    }
+
    pub(crate) fn activate(
        self: &Arc<Self>,
        broker_client: BrokerClientChannel,
@@ -1489,6 +1567,13 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

+    fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
+        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
+        tenant_conf
+            .compaction_algorithm
+            .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm) // no tenant override: use default
+    }
+
    fn get_eviction_policy(&self) -> EvictionPolicy {
        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
@@ -1620,7 +1705,7 @@ impl Timeline {
                layer_flush_start_tx,
                layer_flush_done_tx,

-                write_lock: tokio::sync::Mutex::new(None),
+                write_lock: tokio::sync::Mutex::new(()),

                gc_info: std::sync::RwLock::new(GcInfo {
                    retain_lsns: Vec::new(),
@@ -1662,6 +1747,11 @@ impl Timeline {
                gc_lock: tokio::sync::Mutex::default(),

                timeline_get_throttle: resources.timeline_get_throttle,
+
+                aux_files: tokio::sync::Mutex::new(AuxFilesState {
+                    dir: None,
+                    n_deltas: 0,
+                }),
            };
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2282,14 +2372,17 @@ impl Timeline {
        // accurate relation sizes, and they do not emit consumption metrics.
        debug_assert!(self.tenant_shard_id.is_zero());

-        let _guard = self.gate.enter();
+        let guard = self
+            .gate
+            .enter()
+            .map_err(|_| CalculateLogicalSizeError::Cancelled)?;

        let self_calculation = Arc::clone(self);

        let mut calculation = pin!(async {
            let ctx = ctx.attached_child();
            self_calculation
-                .calculate_logical_size(lsn, cause, &ctx)
+                .calculate_logical_size(lsn, cause, &guard, &ctx)
                .await
        });

@@ -2318,33 +2411,16 @@ impl Timeline {
        &self,
        up_to_lsn: Lsn,
        cause: LogicalSizeCalculationCause,
+        _guard: &GateGuard,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
        info!(
            "Calculating logical size for timeline {} at {}",
            self.timeline_id, up_to_lsn
        );
-        // These failpoints are used by python tests to ensure that we don't delete
-        // the timeline while the logical size computation is ongoing.
-        // The first failpoint is used to make this function pause.
-        // Then the python test initiates timeline delete operation in a thread.
-        // It waits for a few seconds, then arms the second failpoint and disables
-        // the first failpoint. The second failpoint prints an error if the timeline
-        // delete code has deleted the on-disk state while we're still running here.
-        // It shouldn't do that. If it does it anyway, the error will be caught
-        // by the test suite, highlighting the problem.
-        fail::fail_point!("timeline-calculate-logical-size-pause");
-        fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| {
-            if !self
-                .conf
-                .timeline_path(&self.tenant_shard_id, &self.timeline_id)
-                .exists()
-            {
-                error!("timeline-calculate-logical-size-pre metadata file does not exist")
-            }
-            // need to return something
-            Ok(0)
-        });
+
+        pausable_failpoint!("timeline-calculate-logical-size-pause");
+
        // See if we've already done the work for initial size calculation.
        // This is a short-cut for timelines that are mostly unused.
        if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
@@ -2818,8 +2894,7 @@ impl Timeline {
                                (
                                    ReadableLayerDesc::Persistent {
                                        desc: (*layer).clone(),
-                                        lsn_floor,
-                                        lsn_ceil: cont_lsn,
+                                        lsn_range: lsn_floor..cont_lsn,
                                    },
                                    keyspace_accum.to_keyspace(),
                                )
@@ -2961,6 +3036,43 @@ impl Timeline {
        Ok(layer)
    }

+    async fn put_value(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        val: &Value,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        // Fetch the layer to write to for `lsn` and store the value in it.
+        let layer = self.get_layer_for_write(lsn).await?;
+        layer.put_value(key, lsn, val, ctx).await?;
+        Ok(())
+    }
+
+    async fn put_values(
+        &self,
+        values: &HashMap<Key, Vec<(Lsn, Value)>>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        // The first LSN found picks the layer; the whole batch is then written to that one layer.
+        for lsns in values.values() {
+            if let Some((lsn, _)) = lsns.first() {
+                let layer = self.get_layer_for_write(*lsn).await?;
+                layer.put_values(values, ctx).await?;
+                break;
+            }
+        }
+        Ok(())
+    }
+
+    async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+        if let Some((_, lsn)) = tombstones.first() {
+            let layer = self.get_layer_for_write(*lsn).await?; // layer chosen by the first LSN only
+            layer.put_tombstones(tombstones).await?;
+        }
+        Ok(())
+    }
+
    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
        assert!(new_lsn.is_aligned());

@@ -2971,20 +3083,14 @@ impl Timeline {
    async fn freeze_inmem_layer(&self, write_lock_held: bool) {
        // Freeze the current open in-memory layer. It will be written to disk on next
        // iteration.
-
        let _write_guard = if write_lock_held {
            None
        } else {
            Some(self.write_lock.lock().await)
        };
-
-        self.freeze_inmem_layer_at(self.get_last_record_lsn()).await;
-    }
-
-    async fn freeze_inmem_layer_at(&self, at: Lsn) {
        let mut guard = self.layers.write().await;
        guard
-            .try_freeze_in_memory_layer(at, &self.last_freeze_at)
+            .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at)
            .await;
    }

@@ -3644,6 +3750,18 @@ pub(crate) enum CompactionError {
    Other(#[from] anyhow::Error),
 }

+impl From<CollectKeySpaceError> for CompactionError {
+    fn from(err: CollectKeySpaceError) -> Self {
+        match err {
+            CollectKeySpaceError::Cancelled
+            | CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => {
+                CompactionError::ShuttingDown // both cancellation variants map to ShuttingDown
+            }
+            e => CompactionError::Other(e.into()), // anything else is wrapped as `Other`
+        }
+    }
+}
+
 #[serde_as]
 #[derive(serde::Serialize)]
 struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration);
@@ -3763,7 +3881,7 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
 }

 impl Timeline {
-    /// Level0 files first phase of compaction, explained in the [`Self::compact`] comment.
+    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
    async fn compact_level0_phase1(
        self: &Arc<Self>,
        guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
@@ -4242,13 +4360,24 @@ impl Timeline {
            return Ok(());
        }

+        self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact)
+            .await?;
+        Ok(())
+    }
+
+    async fn finish_compact_batch(
+        self: &Arc<Self>,
+        new_deltas: &[ResidentLayer],
+        new_images: &[ResidentLayer],
+        layers_to_remove: &[Layer],
+    ) -> anyhow::Result<()> {
        let mut guard = self.layers.write().await;

        let mut duplicated_layers = HashSet::new();

-        let mut insert_layers = Vec::with_capacity(new_layers.len());
+        let mut insert_layers = Vec::with_capacity(new_deltas.len());

-        for l in &new_layers {
+        for l in new_deltas {
            if guard.contains(l.as_ref()) {
                // expected in tests
                tracing::error!(layer=%l, "duplicated L1 layer");
@@ -4259,24 +4388,28 @@ impl Timeline {
                // because we have not implemented L0 => L0 compaction.
                duplicated_layers.insert(l.layer_desc().key());
            } else if LayerMap::is_l0(l.layer_desc()) {
-                return Err(CompactionError::Other(anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
+                bail!("compaction generates a L0 layer file as output, which will cause infinite compaction.");
            } else {
                insert_layers.push(l.clone());
            }
        }

-        let remove_layers = {
-            let mut deltas_to_compact = deltas_to_compact;
-            // only remove those inputs which were not outputs
-            deltas_to_compact.retain(|l| !duplicated_layers.contains(&l.layer_desc().key()));
-            deltas_to_compact
-        };
+        // only remove those inputs which were not outputs
+        let remove_layers: Vec<Layer> = layers_to_remove
+            .iter()
+            .filter(|l| !duplicated_layers.contains(&l.layer_desc().key()))
+            .cloned()
+            .collect();
+
+        if !new_images.is_empty() {
+            guard.track_new_image_layers(new_images, &self.metrics);
+        }

        // deletion will happen later, the layer file manager calls garbage_collect_on_drop
        guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics);

        if let Some(remote_client) = self.remote_client.as_ref() {
-            remote_client.schedule_compaction_update(&remove_layers, &new_layers)?;
+            remote_client.schedule_compaction_update(&remove_layers, new_deltas)?;
        }

        drop_wlock(guard);
@@ -4922,43 +5055,13 @@ fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageRecon
    PageReconstructError::from(msg)
 }

-struct TimelineWriterState {
-    open_layer: Arc<InMemoryLayer>,
-    current_size: u64,
-    // Previous Lsn which passed through
-    prev_lsn: Option<Lsn>,
-    // Largest Lsn which passed through the current writer
-    max_lsn: Option<Lsn>,
-    // Cached details of the last freeze. Avoids going trough the atomic/lock on every put.
-    cached_last_freeze_at: Lsn,
-    cached_last_freeze_ts: Instant,
-}
-
-impl TimelineWriterState {
-    fn new(
-        open_layer: Arc<InMemoryLayer>,
-        current_size: u64,
-        last_freeze_at: Lsn,
-        last_freeze_ts: Instant,
-    ) -> Self {
-        Self {
-            open_layer,
-            current_size,
-            prev_lsn: None,
-            max_lsn: None,
-            cached_last_freeze_at: last_freeze_at,
-            cached_last_freeze_ts: last_freeze_ts,
-        }
-    }
-}
-
 /// Various functions to mutate the timeline.
 // TODO Currently, Deref is used to allow easy access to read methods from this trait.
 // This is probably considered a bad practice in Rust and should be fixed eventually,
 // but will cause large code changes.
 pub(crate) struct TimelineWriter<'a> {
    tl: &'a Timeline,
-    write_guard: tokio::sync::MutexGuard<'a, Option<TimelineWriterState>>,
+    _write_guard: tokio::sync::MutexGuard<'a, ()>,
 }

 impl Deref for TimelineWriter<'_> {
@@ -4969,193 +5072,31 @@ impl Deref for TimelineWriter<'_> {
    }
 }

-impl Drop for TimelineWriter<'_> {
-    fn drop(&mut self) {
-        self.write_guard.take();
-    }
-}
-
-enum OpenLayerAction {
-    Roll,
-    Open,
-    None,
-}
-
 impl<'a> TimelineWriter<'a> {
    /// Put a new page version that can be constructed from a WAL record
    ///
    /// This will implicitly extend the relation, if the page is beyond the
    /// current end-of-file.
    pub(crate) async fn put(
-        &mut self,
+        &self,
        key: Key,
        lsn: Lsn,
        value: &Value,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        // Avoid doing allocations for "small" values.
-        // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
-        // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
-        let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
-        buf.clear();
-        value.ser_into(&mut buf)?;
-        let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
-
-        let action = self.get_open_layer_action(lsn, buf_size);
-        let layer = self.handle_open_layer_action(lsn, action).await?;
-        let res = layer.put_value(key, lsn, &buf, ctx).await;
-
-        if res.is_ok() {
-            // Update the current size only when the entire write was ok.
-            // In case of failures, we may have had partial writes which
-            // render the size tracking out of sync. That's ok because
-            // the checkpoint distance should be significantly smaller
-            // than the S3 single shot upload limit of 5GiB.
-            let state = self.write_guard.as_mut().unwrap();
-
-            state.current_size += buf_size;
-            state.prev_lsn = Some(lsn);
-            state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
-        }
-
-        res
+        self.tl.put_value(key, lsn, value, ctx).await
    }

-    async fn handle_open_layer_action(
-        &mut self,
-        at: Lsn,
-        action: OpenLayerAction,
-    ) -> anyhow::Result<&Arc<InMemoryLayer>> {
-        match action {
-            OpenLayerAction::Roll => {
-                let max_lsn = self.write_guard.as_ref().unwrap().max_lsn.unwrap();
-                self.tl.freeze_inmem_layer_at(max_lsn).await;
-
-                let now = Instant::now();
-                *(self.last_freeze_ts.write().unwrap()) = now;
-
-                self.tl.flush_frozen_layers();
-
-                let current_size = self.write_guard.as_ref().unwrap().current_size;
-                if current_size > self.get_checkpoint_distance() {
-                    warn!("Flushed oversized open layer with size {}", current_size)
-                }
-
-                assert!(self.write_guard.is_some());
-
-                let layer = self.tl.get_layer_for_write(at).await?;
-                let initial_size = layer.size().await?;
-                self.write_guard.replace(TimelineWriterState::new(
-                    layer,
-                    initial_size,
-                    Lsn(max_lsn.0 + 1),
-                    now,
-                ));
-            }
-            OpenLayerAction::Open => {
-                assert!(self.write_guard.is_none());
-
-                let layer = self.tl.get_layer_for_write(at).await?;
-                let initial_size = layer.size().await?;
-
-                let last_freeze_at = self.last_freeze_at.load();
-                let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
-                self.write_guard.replace(TimelineWriterState::new(
-                    layer,
-                    initial_size,
-                    last_freeze_at,
-                    last_freeze_ts,
-                ));
-            }
-            OpenLayerAction::None => {
-                assert!(self.write_guard.is_some());
-            }
-        }
-
-        Ok(&self.write_guard.as_ref().unwrap().open_layer)
-    }
-
-    fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction {
-        let state = &*self.write_guard;
-        let Some(state) = &state else {
-            return OpenLayerAction::Open;
-        };
-
-        if state.prev_lsn == Some(lsn) {
-            // Rolling mid LSN is not supported by downstream code.
-            // Hence, only roll at LSN boundaries.
-            return OpenLayerAction::None;
-        }
-
-        let distance = lsn.widening_sub(state.cached_last_freeze_at);
-        let proposed_open_layer_size = state.current_size + new_value_size;
-
-        // Rolling the open layer can be triggered by:
-        // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
-        //    the safekeepers need to store.  For sharded tenants, we multiply by shard count to
-        //    account for how writes are distributed across shards: we expect each node to consume
-        //    1/count of the LSN on average.
-        // 2. The size of the currently open layer.
-        // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
-        //    up and suspend activity.
-        if distance
-            >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128
-        {
-            info!(
-                "Will roll layer at {} with layer size {} due to LSN distance ({})",
-                lsn, state.current_size, distance
-            );
-
-            OpenLayerAction::Roll
-        } else if state.current_size > 0
-            && proposed_open_layer_size >= self.get_checkpoint_distance()
-        {
-            info!(
-                "Will roll layer at {} with layer size {} due to layer size ({})",
-                lsn, state.current_size, proposed_open_layer_size
-            );
-
-            OpenLayerAction::Roll
-        } else if distance > 0
-            && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()
-        {
-            info!(
-                "Will roll layer at {} with layer size {} due to time since last flush ({:?})",
-                lsn,
-                state.current_size,
-                state.cached_last_freeze_ts.elapsed()
-            );
-
-            OpenLayerAction::Roll
-        } else {
-            OpenLayerAction::None
-        }
-    }
-
-    /// Put a batch keys at the specified Lsns.
-    ///
-    /// The batch should be sorted by Lsn such that it's safe
-    /// to roll the open layer mid batch.
    pub(crate) async fn put_batch(
-        &mut self,
-        batch: Vec<(Key, Lsn, Value)>,
+        &self,
+        batch: &HashMap<Key, Vec<(Lsn, Value)>>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        for (key, lsn, val) in batch {
-            self.put(key, lsn, &val, ctx).await?
-        }
-
-        Ok(())
+        self.tl.put_values(batch, ctx).await
    }

-    pub(crate) async fn delete_batch(&mut self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        if let Some((_, lsn)) = batch.first() {
-            let action = self.get_open_layer_action(*lsn, 0);
-            let layer = self.handle_open_layer_action(*lsn, action).await?;
-            layer.put_tombstones(batch).await?;
-        }
-
-        Ok(())
+    pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+        self.tl.put_tombstones(batch).await
    }

    /// Track the end of the latest digested WAL record.
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -0,0 +1,477 @@
+//! New compaction implementation. The algorithm itself is implemented in the
+//! compaction crate. This file implements the callbacks and structs that allow
+//! the algorithm to drive the process.
+//!
+//! The old legacy algorithm is implemented directly in `timeline.rs`.
+
+use std::ops::{Deref, Range};
+use std::sync::Arc;
+
+use super::Timeline;
+
+use async_trait::async_trait;
+use fail::fail_point;
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, trace, warn};
+
+use crate::context::RequestContext;
+use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
+use crate::tenant::timeline::{is_rel_fsm_block_key, is_rel_vm_block_key};
+use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
+use crate::tenant::timeline::{Layer, ResidentLayer};
+use crate::tenant::DeltaLayer;
+use crate::tenant::PageReconstructError;
+use crate::ZERO_PAGE;
+
+use crate::keyspace::KeySpace;
+use crate::repository::Key;
+
+use utils::lsn::Lsn;
+
+use pageserver_compaction::helpers::overlaps_with;
+use pageserver_compaction::interface::*;
+
+use super::CompactionError;
+
+impl Timeline {
+    /// Entry point for new tiered compaction algorithm.
+    ///
+    /// All the real work is in the implementation in the pageserver_compaction
+    /// crate. The code here would apply to any algorithm implemented by the
+    /// same interface, but tiered is the only one at the moment.
+    ///
+    /// TODO: cancellation
+    pub(crate) async fn compact_tiered(
+        self: &Arc<Self>,
+        _cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        let fanout = self.get_compaction_threshold() as u64;
+        let target_file_size = self.get_checkpoint_distance();
+
+        // Find the top of the historical layers
+        let end_lsn = {
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map();
+
+            let l0_deltas = layers.get_level0_deltas()?;
+            drop(guard);
+
+            // As an optimization, if we find that there are too few L0 layers,
+            // bail out early. We know that the compaction algorithm would do
+            // nothing in that case.
+            if l0_deltas.len() < fanout as usize {
+                // doesn't need compacting
+                return Ok(());
+            }
+            l0_deltas.iter().map(|l| l.lsn_range.end).max().unwrap()
+        };
+
+        // Is the timeline being deleted?
+        if self.is_stopping() {
+            trace!("Dropping out of compaction on timeline shutdown");
+            return Err(CompactionError::ShuttingDown);
+        }
+
+        let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
+        let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
+        let ctx_adaptor = RequestContextAdaptor(ctx.clone());
+
+        pageserver_compaction::compact_tiered::compact_tiered(
+            &mut adaptor,
+            end_lsn,
+            target_file_size,
+            fanout,
+            &ctx_adaptor,
+        )
+        .await?;
+
+        adaptor.flush_updates().await?;
+        Ok(())
+    }
+}
+
+struct TimelineAdaptor {
+    timeline: Arc<Timeline>,
+
+    keyspace: (Lsn, KeySpace),
+
+    new_deltas: Vec<ResidentLayer>,
+    new_images: Vec<ResidentLayer>,
+    layers_to_delete: Vec<Arc<PersistentLayerDesc>>,
+}
+
+impl TimelineAdaptor {
+    pub fn new(timeline: &Arc<Timeline>, keyspace: (Lsn, KeySpace)) -> Self {
+        Self {
+            timeline: timeline.clone(),
+            keyspace,
+            new_images: Vec::new(),
+            new_deltas: Vec::new(),
+            layers_to_delete: Vec::new(),
+        }
+    }
+
+    pub async fn flush_updates(&mut self) -> anyhow::Result<()> {
+        let layers_to_delete = {
+            let guard = self.timeline.layers.read().await;
+            self.layers_to_delete
+                .iter()
+                .map(|x| guard.get_from_desc(x))
+                .collect::<Vec<Layer>>()
+        };
+        self.timeline
+            .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
+            .await?;
+        self.new_images.clear();
+        self.new_deltas.clear();
+        self.layers_to_delete.clear();
+        Ok(())
+    }
+}
+
+#[derive(Clone)]
+struct ResidentDeltaLayer(ResidentLayer);
+#[derive(Clone)]
+struct ResidentImageLayer(ResidentLayer);
+
+#[async_trait]
+impl CompactionJobExecutor for TimelineAdaptor {
+    type Key = crate::repository::Key;
+
+    type Layer = OwnArc<PersistentLayerDesc>;
+    type DeltaLayer = ResidentDeltaLayer;
+    type ImageLayer = ResidentImageLayer;
+
+    type RequestContext = RequestContextAdaptor;
+
+    async fn get_layers(
+        &mut self,
+        key_range: &Range<Key>,
+        lsn_range: &Range<Lsn>,
+        _ctx: &RequestContextAdaptor,
+    ) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
+        self.flush_updates().await?;
+
+        let guard = self.timeline.layers.read().await;
+        let layer_map = guard.layer_map();
+
+        let result = layer_map
+            .iter_historic_layers()
+            .filter(|l| {
+                overlaps_with(&l.lsn_range, lsn_range) && overlaps_with(&l.key_range, key_range)
+            })
+            .map(OwnArc)
+            .collect();
+        Ok(result)
+    }
+
+    async fn get_keyspace(
+        &mut self,
+        key_range: &Range<Key>,
+        lsn: Lsn,
+        _ctx: &RequestContextAdaptor,
+    ) -> anyhow::Result<Vec<Range<Key>>> {
+        if lsn == self.keyspace.0 {
+            Ok(pageserver_compaction::helpers::intersect_keyspace(
+                &self.keyspace.1.ranges,
+                key_range,
+            ))
+        } else {
+            // The current compaction implementation only ever requests the key space
+            // at the compaction end LSN.
+            anyhow::bail!("keyspace not available for requested lsn");
+        }
+    }
+
+    async fn downcast_delta_layer(
+        &self,
+        layer: &OwnArc<PersistentLayerDesc>,
+    ) -> anyhow::Result<Option<ResidentDeltaLayer>> {
+        // this is a lot more complex than a simple downcast...
+        if layer.is_delta() {
+            let l = {
+                let guard = self.timeline.layers.read().await;
+                guard.get_from_desc(layer)
+            };
+            let result = l.download_and_keep_resident().await?;
+
+            Ok(Some(ResidentDeltaLayer(result)))
+        } else {
+            Ok(None)
+        }
+    }
+
+    async fn create_image(
+        &mut self,
+        lsn: Lsn,
+        key_range: &Range<Key>,
+        ctx: &RequestContextAdaptor,
+    ) -> anyhow::Result<()> {
+        Ok(self.create_image_impl(lsn, key_range, ctx).await?)
+    }
+
+    async fn create_delta(
+        &mut self,
+        lsn_range: &Range<Lsn>,
+        key_range: &Range<Key>,
+        input_layers: &[ResidentDeltaLayer],
+        ctx: &RequestContextAdaptor,
+    ) -> anyhow::Result<()> {
+        debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+
+        let mut all_entries = Vec::new();
+        for dl in input_layers.iter() {
+            all_entries.extend(dl.load_keys(ctx).await?);
+        }
+
+        // The current stdlib sorting implementation is designed in a way where it is
+        // particularly fast where the slice is made up of sorted sub-ranges.
+        all_entries.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
+
+        let mut writer = DeltaLayerWriter::new(
+            self.timeline.conf,
+            self.timeline.timeline_id,
+            self.timeline.tenant_shard_id,
+            key_range.start,
+            lsn_range.clone(),
+        )
+        .await?;
+
+        let mut dup_values = 0;
+
+        // This iterator walks through all key-value pairs from all the layers
+        // we're compacting, in key, LSN order.
+        let mut prev: Option<(Key, Lsn)> = None;
+        for &DeltaEntry {
+            key, lsn, ref val, ..
+        } in all_entries.iter()
+        {
+            if prev == Some((key, lsn)) {
+                // This is a duplicate. Skip it.
+                //
+                // It can happen if compaction is interrupted after writing some
+                // layers but not all, and we are compacting the range again.
+                // The calculations in the algorithm assume that there are no
+                // duplicates, so the math on targeted file size is likely off,
+                // and we will create smaller files than expected.
+                dup_values += 1;
+                continue;
+            }
+
+            let value = val.load(ctx).await?;
+
+            writer.put_value(key, lsn, value).await?;
+
+            prev = Some((key, lsn));
+        }
+
+        if dup_values > 0 {
+            warn!("delta layer created with {} duplicate values", dup_values);
+        }
+
+        fail_point!("delta-layer-writer-fail-before-finish", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint delta-layer-writer-fail-before-finish"
+            ))
+        });
+
+        let new_delta_layer = writer
+            .finish(prev.unwrap().0.next(), &self.timeline)
+            .await?;
+
+        self.new_deltas.push(new_delta_layer);
+        Ok(())
+    }
+
+    async fn delete_layer(
+        &mut self,
+        layer: &OwnArc<PersistentLayerDesc>,
+        _ctx: &RequestContextAdaptor,
+    ) -> anyhow::Result<()> {
+        self.layers_to_delete.push(layer.clone().0);
+        Ok(())
+    }
+}
+
+impl TimelineAdaptor {
+    async fn create_image_impl(
+        &mut self,
+        lsn: Lsn,
+        key_range: &Range<Key>,
+        ctx: &RequestContextAdaptor,
+    ) -> Result<(), PageReconstructError> {
+        let timer = self.timeline.metrics.create_images_time_histo.start_timer();
+
+        let mut image_layer_writer = ImageLayerWriter::new(
+            self.timeline.conf,
+            self.timeline.timeline_id,
+            self.timeline.tenant_shard_id,
+            key_range,
+            lsn,
+        )
+        .await?;
+
+        fail_point!("image-layer-writer-fail-before-finish", |_| {
+            Err(PageReconstructError::Other(anyhow::anyhow!(
+                "failpoint image-layer-writer-fail-before-finish"
+            )))
+        });
+        let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
+        for range in &keyspace_ranges {
+            let mut key = range.start;
+            while key < range.end {
+                let img = match self.timeline.get(key, lsn, ctx).await {
+                    Ok(img) => img,
+                    Err(err) => {
+                        // If we fail to reconstruct a VM or FSM page, we can zero the
+                        // page without losing any actual user data. That seems better
+                        // than failing repeatedly and getting stuck.
+                        //
+                        // We had a bug at one point, where we truncated the FSM and VM
+                        // in the pageserver, but the Postgres didn't know about that
+                        // and continued to generate incremental WAL records for pages
+                        // that didn't exist in the pageserver. Trying to replay those
+                        // WAL records failed to find the previous image of the page.
+                        // This special case allows us to recover from that situation.
+                        // See https://github.com/neondatabase/neon/issues/2601.
+                        //
+                        // Unfortunately we cannot do this for the main fork, or for
+                        // any metadata keys, as that would lead to actual data
+                        // loss.
+                        if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
+                            warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
+                            ZERO_PAGE.clone()
+                        } else {
+                            return Err(err);
+                        }
+                    }
+                };
+                image_layer_writer.put_image(key, img).await?;
+                key = key.next();
+            }
+        }
+        let image_layer = image_layer_writer.finish(&self.timeline).await?;
+
+        self.new_images.push(image_layer);
+
+        timer.stop_and_record();
+
+        Ok(())
+    }
+}
+
+pub struct RequestContextAdaptor(pub RequestContext);
+
+impl std::ops::Deref for RequestContextAdaptor {
+    type Target = RequestContext;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl CompactionRequestContext for RequestContextAdaptor {}
+
+#[derive(Debug, Clone)]
+pub struct OwnArc<T>(pub Arc<T>);
+
+impl<T> Deref for OwnArc<T> {
+    type Target = <Arc<T> as Deref>::Target;
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<T> AsRef<T> for OwnArc<T> {
+    fn as_ref(&self) -> &T {
+        self.0.as_ref()
+    }
+}
+
+impl CompactionLayer<Key> for OwnArc<PersistentLayerDesc> {
+    fn key_range(&self) -> &Range<Key> {
+        &self.key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.lsn_range
+    }
+    fn file_size(&self) -> u64 {
+        self.file_size
+    }
+    fn short_id(&self) -> std::string::String {
+        self.as_ref().short_id().to_string()
+    }
+    fn is_delta(&self) -> bool {
+        self.as_ref().is_delta()
+    }
+}
+
+impl CompactionLayer<Key> for OwnArc<DeltaLayer> {
+    fn key_range(&self) -> &Range<Key> {
+        &self.layer_desc().key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.layer_desc().lsn_range
+    }
+    fn file_size(&self) -> u64 {
+        self.layer_desc().file_size
+    }
+    fn short_id(&self) -> std::string::String {
+        self.layer_desc().short_id().to_string()
+    }
+    fn is_delta(&self) -> bool {
+        true
+    }
+}
+
+use crate::tenant::timeline::DeltaEntry;
+
+impl CompactionLayer<Key> for ResidentDeltaLayer {
+    fn key_range(&self) -> &Range<Key> {
+        &self.0.layer_desc().key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.0.layer_desc().lsn_range
+    }
+    fn file_size(&self) -> u64 {
+        self.0.layer_desc().file_size
+    }
+    fn short_id(&self) -> std::string::String {
+        self.0.layer_desc().short_id().to_string()
+    }
+    fn is_delta(&self) -> bool {
+        true
+    }
+}
+
+#[async_trait]
+impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
+    type DeltaEntry<'a> = DeltaEntry<'a>;
+
+    async fn load_keys<'a>(
+        &self,
+        ctx: &RequestContextAdaptor,
+    ) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
+        self.0.load_keys(ctx).await
+    }
+}
+
+impl CompactionLayer<Key> for ResidentImageLayer {
+    fn key_range(&self) -> &Range<Key> {
+        &self.0.layer_desc().key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.0.layer_desc().lsn_range
+    }
+    fn file_size(&self) -> u64 {
+        self.0.layer_desc().file_size
+    }
+    fn short_id(&self) -> std::string::String {
+        self.0.layer_desc().short_id().to_string()
+    }
+    fn is_delta(&self) -> bool {
+        false
+    }
+}
+impl CompactionImageLayer<TimelineAdaptor> for ResidentImageLayer {}
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -34,7 +34,7 @@ use crate::{
    },
 };

-use utils::completion;
+use utils::{completion, sync::gate::GateGuard};

 use super::Timeline;

@@ -81,6 +81,12 @@ impl Timeline {
    #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
    async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
        use crate::tenant::tasks::random_init_delay;
+
+        // acquire the gate guard only once within a useful span
+        let Ok(guard) = self.gate.enter() else {
+            return;
+        };
+
        {
            let policy = self.get_eviction_policy();
            let period = match policy {
@@ -96,7 +102,9 @@ impl Timeline {
        let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn);
        loop {
            let policy = self.get_eviction_policy();
-            let cf = self.eviction_iteration(&policy, &cancel, &ctx).await;
+            let cf = self
+                .eviction_iteration(&policy, &cancel, &guard, &ctx)
+                .await;

            match cf {
                ControlFlow::Break(()) => break,
@@ -117,6 +125,7 @@ impl Timeline {
        self: &Arc<Self>,
        policy: &EvictionPolicy,
        cancel: &CancellationToken,
+        gate: &GateGuard,
        ctx: &RequestContext,
    ) -> ControlFlow<(), Instant> {
        debug!("eviction iteration: {policy:?}");
@@ -127,14 +136,17 @@ impl Timeline {
                return ControlFlow::Continue(Instant::now() + Duration::from_secs(10));
            }
            EvictionPolicy::LayerAccessThreshold(p) => {
-                match self.eviction_iteration_threshold(p, cancel, ctx).await {
+                match self
+                    .eviction_iteration_threshold(p, cancel, gate, ctx)
+                    .await
+                {
                    ControlFlow::Break(()) => return ControlFlow::Break(()),
                    ControlFlow::Continue(()) => (),
                }
                (p.period, p.threshold)
            }
            EvictionPolicy::OnlyImitiate(p) => {
-                if self.imitiate_only(p, cancel, ctx).await.is_break() {
+                if self.imitiate_only(p, cancel, gate, ctx).await.is_break() {
                    return ControlFlow::Break(());
                }
                (p.period, p.threshold)
@@ -165,6 +177,7 @@ impl Timeline {
        self: &Arc<Self>,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
+        gate: &GateGuard,
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        let now = SystemTime::now();
@@ -180,7 +193,7 @@ impl Timeline {
            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
        };

-        match self.imitate_layer_accesses(p, cancel, ctx).await {
+        match self.imitate_layer_accesses(p, cancel, gate, ctx).await {
            ControlFlow::Break(()) => return ControlFlow::Break(()),
            ControlFlow::Continue(()) => (),
        }
@@ -302,6 +315,7 @@ impl Timeline {
        self: &Arc<Self>,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
+        gate: &GateGuard,
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
@@ -315,7 +329,7 @@ impl Timeline {
            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
        };

-        self.imitate_layer_accesses(p, cancel, ctx).await
+        self.imitate_layer_accesses(p, cancel, gate, ctx).await
    }

    /// If we evict layers but keep cached values derived from those layers, then
@@ -347,6 +361,7 @@ impl Timeline {
        &self,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
+        gate: &GateGuard,
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        if !self.tenant_shard_id.is_zero() {
@@ -365,7 +380,7 @@ impl Timeline {
        match state.last_layer_access_imitation {
            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
-                self.imitate_timeline_cached_layer_accesses(ctx).await;
+                self.imitate_timeline_cached_layer_accesses(gate, ctx).await;
                state.last_layer_access_imitation = Some(tokio::time::Instant::now())
            }
        }
@@ -405,12 +420,21 @@ impl Timeline {

    /// Recompute the values which would cause on-demand downloads during restart.
    #[instrument(skip_all)]
-    async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) {
+    async fn imitate_timeline_cached_layer_accesses(
+        &self,
+        guard: &GateGuard,
+        ctx: &RequestContext,
+    ) {
        let lsn = self.get_last_record_lsn();

        // imitiate on-restart initial logical size
        let size = self
-            .calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx)
+            .calculate_logical_size(
+                lsn,
+                LogicalSizeCalculationCause::EvictionTaskImitation,
+                guard,
+                ctx,
+            )
            .instrument(info_span!("calculate_logical_size"))
            .await;

--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -343,6 +343,23 @@ pub(super) async fn handle_walreceiver_connection(
                            modification.commit(&ctx).await?;
                            uncommitted_records = 0;
                            filtered_records = 0;
+
+                            //
+                            // We should check the checkpoint distance after appending each ingest_batch_size bytes, because otherwise
+                            // the layer size can become much larger than `checkpoint_distance`.
+                            // This can happen because the wal-sender sends WAL in 125kb chunks, and some WAL records can cause a large
+                            // amount of data to be written to key-value storage. So performing this check only after processing
+                            // all WAL records in the chunk can cause huge L0 layer files.
+                            //
+                            timeline
+                                .check_checkpoint_distance()
+                                .await
+                                .with_context(|| {
+                                    format!(
+                                        "Failed to check checkpoint distance for timeline {}",
+                                        timeline.timeline_id
+                                    )
+                                })?;
                        }
                    }

@@ -389,6 +406,16 @@ pub(super) async fn handle_walreceiver_connection(
            }
        }

+        timeline
+            .check_checkpoint_distance()
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to check checkpoint distance for timeline {}",
+                    timeline.timeline_id
+                )
+            })?;
+
        if let Some(last_lsn) = status_update {
            let timeline_remote_consistent_lsn = timeline
                .get_remote_consistent_lsn_visible()
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -0,0 +1,436 @@
+//!
+//! Utilities for vectored reading of variable-sized "blobs".
+//!
+//! The "blob" api is an abstraction on top of the "block" api,
+//! with the main difference being that blobs do not have a fixed
+//! size (each blob is prefixed with 1 or 4 byte length field)
+//!
+//! The vectored apis provided in this module allow for planning
+//! and executing disk IO which covers multiple blobs.
+//!
+//! Reads are planned with [`VectoredReadPlanner`] which will coalesce
+//! adjacent blocks into a single disk IO request and executed by
+//! [`VectoredBlobReader`] which does all the required offset juggling
+//! and returns a buffer housing all the blobs and a list of offsets.
+//!
+//! Note that the vectored blob api does *not* go through the page cache.
+
+use std::collections::BTreeMap;
+use std::num::NonZeroUsize;
+
+use bytes::BytesMut;
+use pageserver_api::key::Key;
+use utils::lsn::Lsn;
+use utils::vec_map::VecMap;
+
+use crate::virtual_file::VirtualFile;
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub struct MaxVectoredReadBytes(pub NonZeroUsize);
+
+/// Metadata bundled with the start and end offset of a blob.
+#[derive(Copy, Clone, Debug)]
+pub struct BlobMeta {
+    pub key: Key,
+    pub lsn: Lsn,
+}
+
+/// Blob offsets into [`VectoredBlobsBuf::buf`]
+pub struct VectoredBlob {
+    pub start: usize,
+    pub end: usize,
+    pub meta: BlobMeta,
+}
+
+/// Return type of [`VectoredBlobReader::read_blobs`]
+pub struct VectoredBlobsBuf {
+    /// Buffer for all blobs in this read
+    pub buf: BytesMut,
+    /// Offsets into the buffer and metadata for all blobs in this read
+    pub blobs: Vec<VectoredBlob>,
+}
+
+/// Description of one disk read for multiple blobs.
+/// Used as the argument for [`VectoredBlobReader::read_blobs`]
+#[derive(Debug)]
+pub struct VectoredRead {
+    pub start: u64,
+    pub end: u64,
+    /// Starting offsets and metadata for each blob in this read
+    pub blobs_at: VecMap<u64, BlobMeta>,
+}
+
+impl VectoredRead {
+    fn size(&self) -> usize {
+        (self.end - self.start) as usize
+    }
+}
+
+#[derive(Eq, PartialEq)]
+enum VectoredReadExtended {
+    Yes,
+    No,
+}
+
+struct VectoredReadBuilder {
+    start: u64,
+    end: u64,
+    blobs_at: VecMap<u64, BlobMeta>,
+    max_read_size: usize,
+}
+
+impl VectoredReadBuilder {
+    fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self {
+        let mut blobs_at = VecMap::default();
+        blobs_at
+            .append(start_offset, meta)
+            .expect("First insertion always succeeds");
+
+        Self {
+            start: start_offset,
+            end: end_offset,
+            blobs_at,
+            max_read_size,
+        }
+    }
+
+    /// Attempt to extend the current read with a new blob if the start
+    /// offset matches with the current end of the vectored read
+    /// and the resulting size does not exceed the max read size
+    fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
+        let size = (end - start) as usize;
+        if self.end == start && self.size() + size <= self.max_read_size {
+            self.end = end;
+            self.blobs_at
+                .append(start, meta)
+                .expect("LSNs are ordered within vectored reads");
+
+            return VectoredReadExtended::Yes;
+        }
+
+        VectoredReadExtended::No
+    }
+
+    fn size(&self) -> usize {
+        (self.end - self.start) as usize
+    }
+
+    fn build(self) -> VectoredRead {
+        VectoredRead {
+            start: self.start,
+            end: self.end,
+            blobs_at: self.blobs_at,
+        }
+    }
+}
+
+#[derive(Copy, Clone, Debug)]
+pub enum BlobFlag {
+    None,
+    Ignore,
+    Replaces,
+}
+
+/// Planner for vectored blob reads.
+///
+/// Blob offsets are received via [`VectoredReadPlanner::handle`]
+/// and coalesced into disk reads.
+///
+/// The implementation is very simple:
+/// * Collect all blob offsets in an ordered structure
+/// * Iterate over the collected blobs and coalesce them into reads at the end
+pub struct VectoredReadPlanner {
+    // Track all the blob offsets. Start offsets must be ordered.
+    blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>,
+    // Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
+    prev: Option<(Key, Lsn, u64, BlobFlag)>,
+
+    max_read_size: usize,
+}
+
+impl VectoredReadPlanner {
+    pub fn new(max_read_size: usize) -> Self {
+        Self {
+            blobs: BTreeMap::new(),
+            prev: None,
+            max_read_size,
+        }
+    }
+
+    /// Include a new blob in the read plan.
+    ///
+    /// This function is called from a B-Tree index visitor (see `DeltaLayerInner::plan_reads`
+    /// and `ImageLayerInner::plan_reads`). Said visitor wants to collect blob offsets for all
+    /// keys in a given keyspace. This function must be called for each key in the desired
+    /// keyspace (monotonically continuous). [`Self::handle_range_end`] must
+    /// be called after every range in the offset.
+    ///
+    /// In the event that keys are skipped, the behaviour is undefined and can lead to an
+    /// incorrect read plan. We can end up asserting, erroring in wal redo or returning
+    /// incorrect data to the user.
+    ///
+    /// The `flag` argument has two interesting values:
+    /// * [`BlobFlag::Replaces`]: The blob for this key should replace all existing blobs.
+    /// This is used for WAL records that `will_init`.
+    /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
+    /// if the blob is cached.
+    pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) {
+        // Implementation note: internally lag behind by one blob such that
+        // we have a start and end offset when initialising [`VectoredRead`]
+        let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev {
+            None => {
+                self.prev = Some((key, lsn, offset, flag));
+                return;
+            }
+            Some(prev) => prev,
+        };
+
+        self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
+
+        self.prev = Some((key, lsn, offset, flag));
+    }
+
+    pub fn handle_range_end(&mut self, offset: u64) {
+        if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev {
+            self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
+        }
+
+        self.prev = None;
+    }
+
+    fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) {
+        match flag {
+            BlobFlag::None => {
+                let blobs_for_key = self.blobs.entry(key).or_default();
+                blobs_for_key.push((lsn, start_offset, end_offset));
+            }
+            BlobFlag::Replaces => {
+                let blobs_for_key = self.blobs.entry(key).or_default();
+                blobs_for_key.clear();
+                blobs_for_key.push((lsn, start_offset, end_offset));
+            }
+            BlobFlag::Ignore => {}
+        }
+    }
+
+    pub fn finish(self) -> Vec<VectoredRead> {
+        let mut current_read_builder: Option<VectoredReadBuilder> = None;
+        let mut reads = Vec::new();
+
+        for (key, blobs_for_key) in self.blobs {
+            for (lsn, start_offset, end_offset) in blobs_for_key {
+                let extended = match &mut current_read_builder {
+                    Some(read_builder) => {
+                        read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn })
+                    }
+                    None => VectoredReadExtended::No,
+                };
+
+                if extended == VectoredReadExtended::No {
+                    let next_read_builder = VectoredReadBuilder::new(
+                        start_offset,
+                        end_offset,
+                        BlobMeta { key, lsn },
+                        self.max_read_size,
+                    );
+
+                    let prev_read_builder = current_read_builder.replace(next_read_builder);
+
+                    // `current_read_builder` is None in the first iteration of the outer loop
+                    if let Some(read_builder) = prev_read_builder {
+                        reads.push(read_builder.build());
+                    }
+                }
+            }
+        }
+
+        if let Some(read_builder) = current_read_builder {
+            reads.push(read_builder.build());
+        }
+
+        reads
+    }
+}
+
+/// Disk reader for vectored blob spans (does not go through the page cache)
+pub struct VectoredBlobReader<'a> {
+    file: &'a VirtualFile,
+}
+
+impl<'a> VectoredBlobReader<'a> {
+    pub fn new(file: &'a VirtualFile) -> Self {
+        Self { file }
+    }
+
+    /// Read the requested blobs into the buffer.
+    ///
+    /// We have to deal with the fact that blobs are not fixed size.
+    /// Each blob is prefixed by a size header.
+    ///
+    /// The success return value is a struct which contains the buffer
+    /// filled from disk and a list of offsets at which each blob lies
+    /// in the buffer.
+    pub async fn read_blobs(
+        &self,
+        read: &VectoredRead,
+        buf: BytesMut,
+    ) -> Result<VectoredBlobsBuf, std::io::Error> {
+        assert!(read.size() > 0);
+        assert!(
+            read.size() <= buf.capacity(),
+            "{} > {}",
+            read.size(),
+            buf.capacity()
+        );
+        let buf = self
+            .file
+            .read_exact_at_n(buf, read.start, read.size())
+            .await?;
+
+        let blobs_at = read.blobs_at.as_slice();
+        let start_offset = blobs_at.first().expect("VectoredRead is never empty").0;
+
+        let mut metas = Vec::with_capacity(blobs_at.len());
+
+        // Blobs in `read` only provide their starting offset. The end offset
+        // of a blob is implicit: the start of the next blob if one exists
+        // or the end of the read.
+        let pairs = blobs_at.iter().zip(
+            blobs_at
+                .iter()
+                .map(Some)
+                .skip(1)
+                .chain(std::iter::once(None)),
+        );
+
+        for ((offset, meta), next) in pairs {
+            let offset_in_buf = offset - start_offset;
+            let first_len_byte = buf[offset_in_buf as usize];
+
+            // Each blob is prefixed by a header containing its size.
+            // Extract the size and skip that header to find the start of the data.
+            // The size can be 1 or 4 bytes. The most significant bit is 0 in the
+            // 1 byte case and 1 in the 4 byte case.
+            let (size_length, blob_size) = if first_len_byte < 0x80 {
+                (1, first_len_byte as u64)
+            } else {
+                let mut blob_size_buf = [0u8; 4];
+                let offset_in_buf = offset_in_buf as usize;
+
+                blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
+                blob_size_buf[0] &= 0x7f;
+                (4, u32::from_be_bytes(blob_size_buf) as u64)
+            };
+
+            let start = offset_in_buf + size_length;
+            let end = match next {
+                Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
+                None => start + blob_size,
+            };
+
+            assert_eq!(end - start, blob_size);
+
+            metas.push(VectoredBlob {
+                start: start as usize,
+                end: end as usize,
+                meta: *meta,
+            })
+        }
+
+        Ok(VectoredBlobsBuf { buf, blobs: metas })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
+        assert_eq!(read.start, offset_range.first().unwrap().2);
+
+        let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect();
+
+        let offsets_in_read: Vec<_> = read
+            .blobs_at
+            .as_slice()
+            .iter()
+            .map(|(offset, _)| *offset)
+            .collect();
+
+        assert_eq!(expected_offsets_in_read, offsets_in_read);
+    }
+
+    #[test]
+    fn planner_max_read_size_test() {
+        let max_read_size = 128 * 1024;
+        let key = Key::MIN;
+        let lsn = Lsn(0);
+
+        let blob_descriptions = vec![
+            (key, lsn, 0, BlobFlag::None),
+            (key, lsn, 32 * 1024, BlobFlag::None),
+            (key, lsn, 96 * 1024, BlobFlag::None), // Last in read 1
+            (key, lsn, 128 * 1024, BlobFlag::None), // Last in read 2
+            (key, lsn, 198 * 1024, BlobFlag::None), // Last in read 3
+            (key, lsn, 268 * 1024, BlobFlag::None), // Last in read 4
+            (key, lsn, 396 * 1024, BlobFlag::None), // Last in read 5
+            (key, lsn, 652 * 1024, BlobFlag::None), // Last in read 6
+        ];
+
+        let ranges = [
+            &blob_descriptions[0..3],
+            &blob_descriptions[3..4],
+            &blob_descriptions[4..5],
+            &blob_descriptions[5..6],
+            &blob_descriptions[6..7],
+            &blob_descriptions[7..],
+        ];
+
+        let mut planner = VectoredReadPlanner::new(max_read_size);
+        for (key, lsn, offset, flag) in blob_descriptions.clone() {
+            planner.handle(key, lsn, offset, flag);
+        }
+
+        planner.handle_range_end(652 * 1024);
+
+        let reads = planner.finish();
+        assert_eq!(reads.len(), 6);
+
+        for (idx, read) in reads.iter().enumerate() {
+            validate_read(read, ranges[idx]);
+        }
+    }
+
+    #[test]
+    fn planner_replacement_test() {
+        let max_read_size = 128 * 1024;
+        let first_key = Key::MIN;
+        let second_key = first_key.next();
+        let lsn = Lsn(0);
+
+        let blob_descriptions = vec![
+            (first_key, lsn, 0, BlobFlag::None),    // First in read 1
+            (first_key, lsn, 1024, BlobFlag::None), // Last in read 1
+            (second_key, lsn, 2 * 1024, BlobFlag::Replaces),
+            (second_key, lsn, 3 * 1024, BlobFlag::None),
+            (second_key, lsn, 4 * 1024, BlobFlag::Replaces), // First in read 2
+            (second_key, lsn, 5 * 1024, BlobFlag::None),     // Last in read 2
+        ];
+
+        let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]];
+
+        let mut planner = VectoredReadPlanner::new(max_read_size);
+        for (key, lsn, offset, flag) in blob_descriptions.clone() {
+            planner.handle(key, lsn, offset, flag);
+        }
+
+        planner.handle_range_end(6 * 1024);
+
+        let reads = planner.finish();
+        assert_eq!(reads.len(), 2);
+
+        for (idx, read) in reads.iter().enumerate() {
+            validate_read(read, ranges[idx]);
+        }
+    }
+}
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -548,7 +548,18 @@ impl VirtualFile {
        B: IoBufMut + Send,
    {
        let (buf, res) =
-            read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await;
+            read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await;
+        res.map(|()| buf)
+    }
+
+    pub async fn read_exact_at_n<B>(&self, buf: B, offset: u64, count: usize) -> Result<B, Error>
+    where
+        B: IoBufMut + Send,
+    {
+        let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| {
+            self.read_at(buf, offset)
+        })
+        .await;
        res.map(|()| buf)
    }

@@ -682,6 +693,7 @@ impl VirtualFile {
 pub async fn read_exact_at_impl<B, F, Fut>(
    buf: B,
    mut offset: u64,
+    count: Option<usize>,
    mut read_at: F,
 ) -> (B, std::io::Result<()>)
 where
@@ -689,7 +701,15 @@ where
    F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
    Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
 {
-    let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
+    let mut buf: tokio_epoll_uring::Slice<B> = match count {
+        Some(count) => {
+            assert!(count <= buf.bytes_total());
+            assert!(count > 0);
+            buf.slice(..count) // may include uninitialized memory
+        }
+        None => buf.slice_full(), // includes all the uninitialized memory
+    };
+
    while buf.bytes_total() != 0 {
        let res;
        (buf, res) = read_at(buf, offset).await;
@@ -779,7 +799,7 @@ mod test_read_exact_at_impl {
                result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
            }]),
        }));
-        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
@@ -788,13 +808,33 @@ mod test_read_exact_at_impl {
        assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
    }

+    #[tokio::test]
+    async fn test_with_count() {
+        let buf = Vec::with_capacity(5);
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::from(vec![Expectation {
+                offset: 0,
+                bytes_total: 3,
+                result: Ok(vec![b'a', b'b', b'c']),
+            }]),
+        }));
+
+        let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        assert!(res.is_ok());
+        assert_eq!(buf, vec![b'a', b'b', b'c']);
+    }
+
    #[tokio::test]
    async fn test_empty_buf_issues_no_syscall() {
        let buf = Vec::new();
        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
            expectations: VecDeque::new(),
        }));
-        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
@@ -819,7 +859,7 @@ mod test_read_exact_at_impl {
                },
            ]),
        }));
-        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
@@ -850,7 +890,7 @@ mod test_read_exact_at_impl {
                },
            ]),
        }));
-        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -533,6 +533,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	{
 		/* Page is not cached */
 		lfc_ctl->misses += 1;
+		pgBufferUsage.file_cache.misses += 1;
 		LWLockRelease(lfc_lock);
 		return false;
 	}
@@ -558,6 +559,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	{
 		Assert(LFC_ENABLED());
 		lfc_ctl->hits += 1;
+		pgBufferUsage.file_cache.hits += 1;
 		Assert(entry->access_count > 0);
 		if (--entry->access_count == 0)
 			dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -194,7 +194,6 @@ async fn auth_quirks(
            let res = hacks::password_hack_no_authentication(ctx, info, client).await?;

            ctx.set_endpoint_id(res.info.endpoint.clone());
-            tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint));
            let password = match res.keys {
                ComputeCredentialKeys::Password(p) => p,
                _ => unreachable!("password hack should return a password"),
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -44,7 +44,7 @@ pub(super) async fn authenticate(
            )
            .await
            .map_err(|e| {
-                warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs());
+                warn!("error processing scram messages error = authentication timed out, execution time exceeded {} seconds", config.scram_protocol_timeout.as_secs());
                auth::AuthError::user_timeout(e)
            })??;

--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -102,7 +102,6 @@ pub(super) async fn authenticate(

    ctx.set_user(db_info.user.into());
    ctx.set_project(db_info.aux.clone());
-    tracing::Span::current().record("ep", &tracing::field::display(&db_info.aux.endpoint_id));

    // Backwards compatibility. pg_sni_proxy uses "--" in domain names
    // while direct connections do not. Once we migrate to pg_sni_proxy
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -142,10 +142,9 @@ impl ComputeUserInfoMaybeEndpoint {

        if let Some(ep) = &endpoint {
            ctx.set_endpoint_id(ep.clone());
-            tracing::Span::current().record("ep", &tracing::field::display(ep));
        }

-        info!(%user, project = endpoint.as_deref(), "credentials");
+        info!(%user, "credentials");
        if sni.is_some() {
            info!("Connection with sni");
            NUM_CONNECTION_ACCEPTED_BY_SNI
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -98,6 +98,7 @@ pub struct MetricsAuxInfo {
    pub endpoint_id: EndpointId,
    pub project_id: ProjectId,
    pub branch_id: BranchId,
+    pub is_cold_start: Option<bool>,
 }

 #[cfg(test)]
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -5,6 +5,7 @@ use once_cell::sync::OnceCell;
 use smol_str::SmolStr;
 use std::net::IpAddr;
 use tokio::sync::mpsc;
+use tracing::{field::display, info_span, Span};
 use uuid::Uuid;

 use crate::{
@@ -29,6 +30,7 @@ pub struct RequestMonitoring {
    pub protocol: &'static str,
    first_packet: chrono::DateTime<Utc>,
    region: &'static str,
+    pub span: Span,

    // filled in as they are discovered
    project: Option<ProjectId>,
@@ -40,6 +42,7 @@ pub struct RequestMonitoring {
    error_kind: Option<ErrorKind>,
    pub(crate) auth_method: Option<AuthMethod>,
    success: bool,
+    is_cold_start: Option<bool>,

    // extra
    // This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -63,12 +66,21 @@ impl RequestMonitoring {
        protocol: &'static str,
        region: &'static str,
    ) -> Self {
+        let span = info_span!(
+            "connect_request",
+            %protocol,
+            ?session_id,
+            %peer_addr,
+            ep = tracing::field::Empty,
+        );
+
        Self {
            peer_addr,
            session_id,
            protocol,
            first_packet: Utc::now(),
            region,
+            span,

            project: None,
            branch: None,
@@ -79,6 +91,7 @@ impl RequestMonitoring {
            error_kind: None,
            auth_method: None,
            success: false,
+            is_cold_start: None,

            sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
            latency_timer: LatencyTimer::new(protocol),
@@ -99,9 +112,10 @@ impl RequestMonitoring {
    }

    pub fn set_project(&mut self, x: MetricsAuxInfo) {
+        self.set_endpoint_id(x.endpoint_id);
        self.branch = Some(x.branch_id);
-        self.endpoint_id = Some(x.endpoint_id);
        self.project = Some(x.project_id);
+        self.is_cold_start = x.is_cold_start;
    }

    pub fn set_project_id(&mut self, project_id: ProjectId) {
@@ -109,6 +123,7 @@ impl RequestMonitoring {
    }

    pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
+        self.span.record("ep", display(&endpoint_id));
        crate::metrics::CONNECTING_ENDPOINTS
            .with_label_values(&[self.protocol])
            .measure(&endpoint_id);
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -92,6 +92,8 @@ struct RequestData {
    /// Success is counted if we form a HTTP response with sql rows inside
    /// Or if we make it to proxy_pass
    success: bool,
+    /// Indicates if the cplane started the new compute node for this request.
+    is_cold_start: Option<bool>,
    /// Tracks time from session start (HTTP request/libpq TCP handshake)
    /// Through to success/failure
    duration_us: u64,
@@ -119,6 +121,7 @@ impl From<RequestMonitoring> for RequestData {
            region: value.region,
            error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
            success: value.success,
+            is_cold_start: value.is_cold_start,
            duration_us: SystemTime::from(value.first_packet)
                .elapsed()
                .unwrap_or_default()
@@ -452,6 +455,7 @@ mod tests {
            region: "us-east-1",
            error: None,
            success: rng.gen(),
+            is_cold_start: Some(true),
            duration_us: rng.gen_range(0..30_000_000),
        }
    }
@@ -521,15 +525,15 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1313727, 3, 6000),
-                (1313720, 3, 6000),
-                (1313780, 3, 6000),
-                (1313737, 3, 6000),
-                (1313867, 3, 6000),
-                (1313709, 3, 6000),
-                (1313501, 3, 6000),
-                (1313737, 3, 6000),
-                (438118, 1, 2000)
+                (1315032, 3, 6000),
+                (1315025, 3, 6000),
+                (1315085, 3, 6000),
+                (1315042, 3, 6000),
+                (1315172, 3, 6000),
+                (1315014, 3, 6000),
+                (1314806, 3, 6000),
+                (1315042, 3, 6000),
+                (438563, 1, 2000)
            ],
        );

@@ -559,11 +563,11 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1219459, 5, 10000),
-                (1225609, 5, 10000),
-                (1227403, 5, 10000),
-                (1226765, 5, 10000),
-                (1218043, 5, 10000)
+                (1220433, 5, 10000),
+                (1226583, 5, 10000),
+                (1228377, 5, 10000),
+                (1227739, 5, 10000),
+                (1219017, 5, 10000)
            ],
        );

@@ -595,11 +599,11 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1205106, 5, 10000),
-                (1204837, 5, 10000),
-                (1205130, 5, 10000),
-                (1205118, 5, 10000),
-                (1205373, 5, 10000)
+                (1206080, 5, 10000),
+                (1205811, 5, 10000),
+                (1206104, 5, 10000),
+                (1206092, 5, 10000),
+                (1206347, 5, 10000)
            ],
        );

@@ -624,15 +628,15 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1313727, 3, 6000),
-                (1313720, 3, 6000),
-                (1313780, 3, 6000),
-                (1313737, 3, 6000),
-                (1313867, 3, 6000),
-                (1313709, 3, 6000),
-                (1313501, 3, 6000),
-                (1313737, 3, 6000),
-                (438118, 1, 2000)
+                (1315032, 3, 6000),
+                (1315025, 3, 6000),
+                (1315085, 3, 6000),
+                (1315042, 3, 6000),
+                (1315172, 3, 6000),
+                (1315014, 3, 6000),
+                (1314806, 3, 6000),
+                (1315042, 3, 6000),
+                (438563, 1, 2000)
            ],
        );

@@ -669,7 +673,7 @@ mod tests {
        // files are smaller than the size threshold, but they took too long to fill so were flushed early
        assert_eq!(
            file_stats,
-            [(658383, 2, 3001), (658097, 2, 3000), (657893, 2, 2999)],
+            [(659129, 2, 3001), (658842, 2, 3000), (658638, 2, 2999)],
        );

        tmpdir.close().unwrap();
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -22,7 +22,6 @@ use crate::{
    stream::{PqStream, Stream},
    EndpointCacheKey,
 };
-use anyhow::{bail, Context};
 use futures::TryFutureExt;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
@@ -33,7 +32,7 @@ use std::sync::Arc;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, info_span, Instrument};
+use tracing::{error, info, Instrument};

 use self::{
    connect_compute::{connect_to_compute, TcpMechanism},
@@ -83,68 +82,67 @@ pub async fn task_main(
        let cancellation_handler = Arc::clone(&cancellation_handler);
        let endpoint_rate_limiter = endpoint_rate_limiter.clone();

-        let session_span = info_span!(
-            "handle_client",
-            ?session_id,
-            peer_addr = tracing::field::Empty,
-            ep = tracing::field::Empty,
-        );
-
-        connections.spawn(
-            async move {
-                info!("accepted postgres client connection");
-
-                let mut socket = WithClientIp::new(socket);
-                let mut peer_addr = peer_addr.ip();
-                if let Some(addr) = socket.wait_for_addr().await? {
-                    peer_addr = addr.ip();
-                    tracing::Span::current().record("peer_addr", &tracing::field::display(addr));
-                } else if config.require_client_ip {
-                    bail!("missing required client IP");
+        connections.spawn(async move {
+            let mut socket = WithClientIp::new(socket);
+            let mut peer_addr = peer_addr.ip();
+            match socket.wait_for_addr().await {
+                Ok(Some(addr)) => peer_addr = addr.ip(),
+                Err(e) => {
+                    error!("per-client task finished with an error: {e:#}");
+                    return;
                }
+                Ok(None) if config.require_client_ip => {
+                    error!("missing required client IP");
+                    return;
+                }
+                Ok(None) => {}
+            }

-                socket
-                    .inner
-                    .set_nodelay(true)
-                    .context("failed to set socket option")?;
+            match socket.inner.set_nodelay(true) {
+                Ok(()) => {},
+                Err(e) => {
+                    error!("per-client task finished with an error: failed to set socket option: {e:#}");
+                    return;
+                },
+            };

-                let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region);
+            let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region);
+            let span = ctx.span.clone();

-                let res = handle_client(
-                    config,
-                    &mut ctx,
-                    cancellation_handler,
-                    socket,
-                    ClientMode::Tcp,
-                    endpoint_rate_limiter,
-                )
-                .await;
+            let res = handle_client(
+                config,
+                &mut ctx,
+                cancellation_handler,
+                socket,
+                ClientMode::Tcp,
+                endpoint_rate_limiter,
+            )
+            .instrument(span.clone())
+            .await;

-                match res {
-                    Err(e) => {
-                        // todo: log and push to ctx the error kind
-                        ctx.set_error_kind(e.get_error_kind());
-                        ctx.log();
-                        Err(e.into())
-                    }
-                    Ok(None) => {
-                        ctx.set_success();
-                        ctx.log();
-                        Ok(())
-                    }
-                    Ok(Some(p)) => {
-                        ctx.set_success();
-                        ctx.log();
-                        p.proxy_pass().await
+            match res {
+                Err(e) => {
+                    // todo: log and push to ctx the error kind
+                    ctx.set_error_kind(e.get_error_kind());
+                    ctx.log();
+                    error!(parent: &span, "per-client task finished with an error: {e:#}");
+                }
+                Ok(None) => {
+                    ctx.set_success();
+                    ctx.log();
+                }
+                Ok(Some(p)) => {
+                    ctx.set_success();
+                    ctx.log();
+                    match p.proxy_pass().instrument(span.clone()).await {
+                        Ok(()) => {}
+                        Err(e) => {
+                            error!(parent: &span, "per-client task finished with an error: {e:#}");
+                        }
                    }
                }
            }
-            .unwrap_or_else(move |e| {
-                // Acknowledge that the task has finished with an error.
-                error!("per-client task finished with an error: {e:#}");
-            })
-            .instrument(session_span),
-        );
+        });
    }

    connections.close();
@@ -232,10 +230,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    mode: ClientMode,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> Result<Option<ProxyPassthrough<S>>, ClientRequestError> {
-    info!(
-        protocol = ctx.protocol,
-        "handling interactive connection from client"
-    );
+    info!("handling interactive connection from client");

    let proto = ctx.protocol;
    let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -17,6 +17,7 @@ use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::error::ErrorKind;
 use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
 use crate::{auth, http, sasl, scram};
+use anyhow::{bail, Context};
 use async_trait::async_trait;
 use rstest::rstest;
 use tokio_postgres::config::SslMode;
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -34,13 +34,14 @@ use hyper::{
    Body, Method, Request, Response,
 };

+use std::convert::Infallible;
 use std::net::IpAddr;
 use std::task::Poll;
 use std::{future::ready, sync::Arc};
 use tls_listener::TlsListener;
 use tokio::net::TcpListener;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, info_span, warn, Instrument};
+use tracing::{error, info, warn, Instrument};
 use utils::http::{error::ApiError, json::json_response};

 pub const SERVERLESS_DRIVER_SNI: &str = "api";
@@ -134,24 +135,19 @@ pub async fn task_main(
                        let cancellation_handler = cancellation_handler.clone();

                        async move {
-                            let session_id = uuid::Uuid::new_v4();
-
-                            request_handler(
-                                req,
-                                config,
-                                backend,
-                                ws_connections,
-                                cancellation_handler,
-                                session_id,
-                                peer_addr.ip(),
-                                endpoint_rate_limiter,
+                            Ok::<_, Infallible>(
+                                request_handler(
+                                    req,
+                                    config,
+                                    backend,
+                                    ws_connections,
+                                    cancellation_handler,
+                                    peer_addr.ip(),
+                                    endpoint_rate_limiter,
+                                )
+                                .await
+                                .map_or_else(|e| e.into_response(), |r| r),
                            )
-                            .instrument(info_span!(
-                                "serverless",
-                                session = %session_id,
-                                %peer_addr,
-                            ))
-                            .await
                        }
                    },
                )))
@@ -210,10 +206,11 @@ async fn request_handler(
    backend: Arc<PoolingBackend>,
    ws_connections: TaskTracker,
    cancellation_handler: Arc<CancellationHandler>,
-    session_id: uuid::Uuid,
    peer_addr: IpAddr,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> Result<Response<Body>, ApiError> {
+    let session_id = uuid::Uuid::new_v4();
+
    let host = request
        .headers()
        .get("host")
@@ -223,15 +220,15 @@ async fn request_handler(

    // Check if the request is a websocket upgrade request.
    if hyper_tungstenite::is_upgrade_request(&request) {
-        info!(session_id = ?session_id, "performing websocket upgrade");
+        let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region);
+        let span = ctx.span.clone();
+        info!(parent: &span, "performing websocket upgrade");

        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
            .map_err(|e| ApiError::BadRequest(e.into()))?;

        ws_connections.spawn(
            async move {
-                let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region);
-
                if let Err(e) = websocket::serve_websocket(
                    config,
                    ctx,
@@ -242,18 +239,21 @@ async fn request_handler(
                )
                .await
                {
-                    error!(session_id = ?session_id, "error in websocket connection: {e:#}");
+                    error!("error in websocket connection: {e:#}");
                }
            }
-            .in_current_span(),
+            .instrument(span),
        );

        // Return the response so the spawned future can continue.
        Ok(response)
    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
        let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
+        let span = ctx.span.clone();

-        sql_over_http::handle(config, ctx, request, backend).await
+        sql_over_http::handle(config, ctx, request, backend)
+            .instrument(span)
+            .await
    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
        Response::builder()
            .header("Allow", "OPTIONS, POST")
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -21,7 +21,6 @@ use tokio_postgres::ReadyForQueryStatus;
 use tokio_postgres::Transaction;
 use tracing::error;
 use tracing::info;
-use tracing::instrument;
 use url::Url;
 use utils::http::error::ApiError;
 use utils::http::json::json_response;
@@ -291,7 +290,7 @@ pub async fn handle(
            // ctx.set_error_kind(crate::error::ErrorKind::RateLimit);

            let message = format!(
-                "HTTP-Connection timed out, execution time exeeded {} seconds",
+                "HTTP-Connection timed out, execution time exceeded {} seconds",
                config.http_config.request_timeout.as_secs()
            );
            error!(message);
@@ -309,14 +308,6 @@ pub async fn handle(
    Ok(response)
 }

-#[instrument(
-    name = "sql-over-http",
-    skip_all,
-    fields(
-        pid = tracing::field::Empty,
-        conn_id = tracing::field::Empty
-    )
-)]
 async fn handle_inner(
    config: &'static ProxyConfig,
    ctx: &mut RequestMonitoring,
@@ -326,10 +317,7 @@ async fn handle_inner(
    let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
        .with_label_values(&[ctx.protocol])
        .guard();
-    info!(
-        protocol = ctx.protocol,
-        "handling interactive connection from client"
-    );
+    info!("handling interactive connection from client");

    //
    // Determine the destination and connection params
@@ -337,11 +325,7 @@ async fn handle_inner(
    let headers = request.headers();
    // TLS config should be there.
    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
-    info!(
-        user = conn_info.user_info.user.as_str(),
-        project = conn_info.user_info.endpoint.as_str(),
-        "credentials"
-    );
+    info!(user = conn_info.user_info.user.as_str(), "credentials");

    // Determine the output options. Default behaviour is 'false'. Anything that is not
    // strictly 'true' assumed to be false.
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -12,7 +12,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
            }
            Ok(())
        }
-        (Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError(
+        (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError(
            format!(
                "JWT scope '{:?}' is ineligible for Safekeeper auth",
                claims.scope
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -17,6 +17,7 @@ import uuid
 from contextlib import closing, contextmanager
 from dataclasses import dataclass, field
 from datetime import datetime
+from enum import Enum
 from fcntl import LOCK_EX, LOCK_UN, flock
 from functools import cached_property
 from itertools import chain, product
@@ -388,7 +389,8 @@ class PgProtocol:
 class AuthKeys:
    priv: str

-    def generate_token(self, *, scope: str, **token_data: str) -> str:
+    def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str:
+        token_data = {key: str(val) for key, val in token_data.items()}
        token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA")
        # cast(Any, self.priv)

@@ -401,14 +403,23 @@ class AuthKeys:
        return token

    def generate_pageserver_token(self) -> str:
-        return self.generate_token(scope="pageserverapi")
+        return self.generate_token(scope=TokenScope.PAGE_SERVER_API)

    def generate_safekeeper_token(self) -> str:
-        return self.generate_token(scope="safekeeperdata")
+        return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA)

    # generate token giving access to only one tenant
    def generate_tenant_token(self, tenant_id: TenantId) -> str:
-        return self.generate_token(scope="tenant", tenant_id=str(tenant_id))
+        return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id))
+
+
+# TODO: Replace with `StrEnum` when we upgrade to python 3.11
+class TokenScope(str, Enum):
+    ADMIN = "admin"
+    PAGE_SERVER_API = "pageserverapi"
+    GENERATIONS_API = "generations_api"
+    SAFEKEEPER_DATA = "safekeeperdata"
+    TENANT = "tenant"


 class NeonEnvBuilder:
@@ -1929,6 +1940,13 @@ class Pagectl(AbstractNeonCli):
        return IndexPartDump.from_json(parsed)


+class AttachmentServiceApiException(Exception):
+    def __init__(self, message, status_code: int):
+        super().__init__(message)
+        self.message = message
+        self.status_code = status_code
+
+
 class NeonAttachmentService(MetricsGetter):
    def __init__(self, env: NeonEnv, auth_enabled: bool):
        self.env = env
@@ -1947,39 +1965,67 @@ class NeonAttachmentService(MetricsGetter):
            self.running = False
        return self

+    @staticmethod
+    def raise_api_exception(res: requests.Response):
+        """
+        Raise AttachmentServiceApiException if `res` carries an HTTP error status.
+
+        The exception message is the "msg" field of the JSON response body, or an empty
+        string when the body cannot be read that way.
+        """
+        try:
+            res.raise_for_status()
+        except requests.RequestException as e:
+            try:
+                msg = res.json()["msg"]
+            except Exception:
+                # Best effort: the body may not be JSON, or may lack "msg".
+                msg = ""
+            raise AttachmentServiceApiException(msg, res.status_code) from e
+
    def pageserver_api(self) -> PageserverHttpClient:
        """
        The attachment service implements a subset of the pageserver REST API, for mapping
        per-tenant actions into per-shard actions (e.g. timeline creation).  Tests should invoke those
        functions via the HttpClient, as an implicit check that these APIs remain compatible.
        """
-        return PageserverHttpClient(self.env.attachment_service_port, lambda: True)
+        auth_token = None
+        if self.auth_enabled:
+            auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
+        return PageserverHttpClient(self.env.attachment_service_port, lambda: True, auth_token)

    def request(self, method, *args, **kwargs) -> requests.Response:
-        kwargs["headers"] = self.headers()
-        return requests.request(method, *args, **kwargs)
+        resp = requests.request(method, *args, **kwargs)
+        NeonAttachmentService.raise_api_exception(resp)

-    def headers(self) -> Dict[str, str]:
+        return resp
+
+    def headers(self, scope: Optional[TokenScope]) -> Dict[str, str]:
        headers = {}
-        if self.auth_enabled:
-            jwt_token = self.env.auth_keys.generate_pageserver_token()
+        if self.auth_enabled and scope is not None:
+            jwt_token = self.env.auth_keys.generate_token(scope=scope)
            headers["Authorization"] = f"Bearer {jwt_token}"

        return headers

    def get_metrics(self) -> Metrics:
        res = self.request("GET", f"{self.env.attachment_service_api}/metrics")
-        res.raise_for_status()
        return parse_metrics(res.text)

    def ready(self) -> bool:
-        resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
-        if resp.status_code == 503:
+        status = None
+        try:
+            resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
+            status = resp.status_code
+        except AttachmentServiceApiException as e:
+            status = e.status_code
+
+        if status == 503:
            return False
-        elif resp.status_code == 200:
+        elif status == 200:
            return True
        else:
-            raise RuntimeError(f"Unexpected status {resp.status_code} from readiness endpoint")
+            raise RuntimeError(f"Unexpected status {status} from readiness endpoint")

    def attach_hook_issue(
        self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
@@ -1988,21 +2027,19 @@ class NeonAttachmentService(MetricsGetter):
            "POST",
            f"{self.env.attachment_service_api}/debug/v1/attach-hook",
            json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id},
-            headers=self.headers(),
+            headers=self.headers(TokenScope.ADMIN),
        )
-        response.raise_for_status()
        gen = response.json()["gen"]
        assert isinstance(gen, int)
        return gen

    def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
-        response = self.request(
+        self.request(
            "POST",
            f"{self.env.attachment_service_api}/debug/v1/attach-hook",
            json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
-            headers=self.headers(),
+            headers=self.headers(TokenScope.ADMIN),
        )
-        response.raise_for_status()

    def inspect(self, tenant_shard_id: Union[TenantId, TenantShardId]) -> Optional[tuple[int, int]]:
        """
@@ -2012,9 +2049,8 @@ class NeonAttachmentService(MetricsGetter):
            "POST",
            f"{self.env.attachment_service_api}/debug/v1/inspect",
            json={"tenant_shard_id": str(tenant_shard_id)},
-            headers=self.headers(),
+            headers=self.headers(TokenScope.ADMIN),
        )
-        response.raise_for_status()
        json = response.json()
        log.info(f"Response: {json}")
        if json["attachment"]:
@@ -2034,14 +2070,15 @@ class NeonAttachmentService(MetricsGetter):
            "POST",
            f"{self.env.attachment_service_api}/control/v1/node",
            json=body,
-            headers=self.headers(),
-        ).raise_for_status()
+            headers=self.headers(TokenScope.ADMIN),
+        )

    def node_list(self):
        response = self.request(
-            "GET", f"{self.env.attachment_service_api}/control/v1/node", headers=self.headers()
+            "GET",
+            f"{self.env.attachment_service_api}/control/v1/node",
+            headers=self.headers(TokenScope.ADMIN),
        )
-        response.raise_for_status()
        return response.json()

    def node_configure(self, node_id, body: dict[str, Any]):
@@ -2051,8 +2088,8 @@ class NeonAttachmentService(MetricsGetter):
            "PUT",
            f"{self.env.attachment_service_api}/control/v1/node/{node_id}/config",
            json=body,
-            headers=self.headers(),
-        ).raise_for_status()
+            headers=self.headers(TokenScope.ADMIN),
+        )

    def tenant_create(
        self,
@@ -2077,8 +2114,12 @@ class NeonAttachmentService(MetricsGetter):
            for k, v in tenant_config.items():
                body[k] = v

-        response = self.request("POST", f"{self.env.attachment_service_api}/v1/tenant", json=body)
-        response.raise_for_status()
+        response = self.request(
+            "POST",
+            f"{self.env.attachment_service_api}/v1/tenant",
+            json=body,
+            headers=self.headers(TokenScope.PAGE_SERVER_API),
+        )
        log.info(f"tenant_create success: {response.json()}")

    def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
@@ -2086,9 +2127,10 @@ class NeonAttachmentService(MetricsGetter):
        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int}
        """
        response = self.request(
-            "GET", f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate"
+            "GET",
+            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate",
+            headers=self.headers(TokenScope.ADMIN),
        )
-        response.raise_for_status()
        body = response.json()
        shards: list[dict[str, Any]] = body["shards"]
        return shards
@@ -2098,20 +2140,20 @@ class NeonAttachmentService(MetricsGetter):
            "PUT",
            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/shard_split",
            json={"new_shard_count": shard_count},
+            headers=self.headers(TokenScope.ADMIN),
        )
-        response.raise_for_status()
        body = response.json()
        log.info(f"tenant_shard_split success: {body}")
        shards: list[TenantShardId] = body["new_shards"]
        return shards

    def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
-        response = self.request(
+        self.request(
            "PUT",
            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_shard_id}/migrate",
            json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
+            headers=self.headers(TokenScope.ADMIN),
        )
-        response.raise_for_status()
        log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
        assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id

@@ -2119,11 +2161,11 @@ class NeonAttachmentService(MetricsGetter):
        """
        Throw an exception if the service finds any inconsistencies in its state
        """
-        response = self.request(
+        self.request(
            "POST",
            f"{self.env.attachment_service_api}/debug/v1/consistency_check",
+            headers=self.headers(TokenScope.ADMIN),
        )
-        response.raise_for_status()
        log.info("Attachment service passed consistency check")

    def __enter__(self) -> "NeonAttachmentService":
@@ -2901,7 +2943,9 @@ class NeonProxy(PgProtocol):

    def get_metrics(self) -> str:
+        """Fetch this proxy's /metrics endpoint and return the response body as text."""
        request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics")
+        # requests.get() does not raise on HTTP error statuses; fail here instead of
+        # returning an error body as if it were metrics.
        request_result.raise_for_status()
        return request_result.text

    @staticmethod
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -549,11 +549,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        tenant_id: Union[TenantId, TenantShardId],
        timeline_id: TimelineId,
        force_repartition=False,
+        force_image_layer_creation=False,
    ):
        self.is_testing_enabled_or_skip()
        query = {}
        if force_repartition:
            query["force_repartition"] = "true"
+        if force_image_layer_creation:
+            query["force_image_layer_creation"] = "true"

        log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
        res = self.put(
@@ -608,11 +611,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        tenant_id: Union[TenantId, TenantShardId],
        timeline_id: TimelineId,
        force_repartition=False,
+        force_image_layer_creation=False,
    ):
        self.is_testing_enabled_or_skip()
        query = {}
        if force_repartition:
            query["force_repartition"] = "true"
+        if force_image_layer_creation:
+            query["force_image_layer_creation"] = "true"

        log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
        res = self.put(
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -0,0 +1,217 @@
+import asyncio
+import json
+from pathlib import Path
+from typing import Any, Dict, Tuple
+
+import pytest
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
+from fixtures.utils import get_scale_for_db, humantime_to_ms
+
+from performance.pageserver.util import (
+    setup_pageserver_with_tenants,
+)
+
+
+@pytest.mark.parametrize("duration", [30])
+@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
+@pytest.mark.parametrize("n_tenants", [10])
+@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
+@pytest.mark.timeout(1000)
+def test_basebackup_with_high_slru_count(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    get_vectored_impl: str,
+    n_tenants: int,
+    pgbench_scale: int,
+    duration: int,
+):
+    """
+    Benchmark `pagebench basebackup` against the tenants that `setup_pageserver_with_tenants`
+    sets up from the template built by `setup_tenant_template`.
+    """
+
+    def record(metric, **kwargs):
+        zenbenchmark.record(metric_name=f"pageserver_basebackup.{metric}", **kwargs)
+
+    params: Dict[str, Tuple[Any, Dict[str, Any]]] = {}
+
+    # params from fixtures
+    params.update(
+        {
+            "n_tenants": (n_tenants, {"unit": ""}),
+            "pgbench_scale": (pgbench_scale, {"unit": ""}),
+            "duration": (duration, {"unit": "s"}),
+        }
+    )
+
+    # configure cache sizes like in prod
+    page_cache_size = 16384
+    max_file_descriptors = 500000
+    neon_env_builder.pageserver_config_override = (
+        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; "
+        f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false"
+    )
+    params.update(
+        {
+            "pageserver_config_override.page_cache_size": (
+                page_cache_size * 8192,
+                {"unit": "byte"},
+            ),
+            "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
+        }
+    )
+
+    for param, (value, kwargs) in params.items():
+        record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
+
+    n_txns = 500000
+
+    def setup_wrapper(env: NeonEnv):
+        return setup_tenant_template(env, n_txns)
+
+    env = setup_pageserver_with_tenants(
+        neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper
+    )
+    run_benchmark(env, pg_bin, record, duration)
+
+
+def setup_tenant_template(env: NeonEnv, n_txns: int):
+    """
+    Build the template tenant: re-attach it with a config that disables periodic gc and
+    compaction, run the update workload against it, then checkpoint and compact it once.
+
+    Returns (template_tenant, template_timeline, config).
+    """
+    config = {
+        "gc_period": "0s",  # disable periodic gc
+        "checkpoint_timeout": "10 years",
+        "compaction_period": "0s",  # disable periodic compaction
+        "compaction_threshold": 10,
+        "compaction_target_size": 134217728,
+        "checkpoint_distance": 268435456,
+        "image_creation_threshold": 3,
+    }
+
+    template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
+    env.pageserver.tenant_detach(template_tenant)
+    env.pageserver.allowed_errors.append(
+        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        ".*Dropped remote consistent LSN updates.*",
+    )
+    env.pageserver.tenant_attach(template_tenant, config)
+
+    ps_http = env.pageserver.http_client()
+
+    with env.endpoints.create_start(
+        "main", tenant_id=template_tenant, config_lines=["shared_buffers=1MB"]
+    ) as ep:
+        # number of concurrent update workers; each one works on its own table
+        rels = 10
+
+        asyncio.run(run_updates(ep, n_txns, rels))
+
+        wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+        ps_http.timeline_checkpoint(template_tenant, template_timeline)
+        ps_http.timeline_compact(template_tenant, template_timeline)
+
+    return (template_tenant, template_timeline, config)
+
+
+# Takes about 5 minutes and produces tenants with around 300 SLRU blocks
+# of 8 KiB each.
+async def run_updates(ep: Endpoint, n_txns: int, workers_count: int):
+    """Run `workers_count` update workers concurrently and wait for all of them to finish."""
+    workers = [
+        asyncio.create_task(run_update_loop_worker(ep, n_txns, i)) for i in range(workers_count)
+    ]
+    await asyncio.gather(*workers)
+
+
+async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int):
+    """
+    Create table `t_{idx}` holding a single row, then update that row `n_txns` times,
+    committing after every update. The connection is closed when the worker finishes or fails.
+    """
+    table = f"t_{idx}"
+    create_procedure = """
+         CREATE PROCEDURE updating{0}() as
+         $$
+             DECLARE
+             i integer;
+             BEGIN
+             FOR i IN 1..{1} LOOP
+                 UPDATE {0} SET x = x + 1 WHERE pk=1;
+                 COMMIT;
+             END LOOP;
+             END
+         $$ LANGUAGE plpgsql
+         """.format(table, n_txns)
+    conn = await ep.connect_async()
+    try:
+        await conn.execute(f"CREATE TABLE {table} (pk integer PRIMARY KEY, x integer)")
+        await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)")
+        await conn.execute(f"INSERT INTO {table} VALUES (1, 0)")
+        await conn.execute(create_procedure)
+        await conn.execute("SET statement_timeout=0")
+        await conn.execute(f"call updating{table}()")
+    finally:
+        await conn.close()
+
+
+def run_benchmark(env: NeonEnv, pg_bin: PgBin, record, duration_secs: int):
+    """
+    Run `pagebench basebackup` for `duration_secs` seconds, then record the request count,
+    mean latency and latency percentiles found in its JSON output.
+    """
+    ps_http = env.pageserver.http_client()
+    cmd = [
+        str(env.neon_binpath / "pagebench"),
+        "basebackup",
+        "--mgmt-api-endpoint",
+        ps_http.base_url,
+        "--page-service-connstring",
+        env.pageserver.connstr(password=None),
+        "--gzip-probability",
+        "1",
+        "--runtime",
+        f"{duration_secs}s",
+        # don't specify the targets explicitly, let pagebench auto-discover them
+    ]
+
+    log.info(f"command: {' '.join(cmd)}")
+    basepath = pg_bin.run_capture(cmd, with_command_header=False)
+    results_path = Path(basepath + ".stdout")
+    log.info(f"Benchmark results at: {results_path}")
+
+    with open(results_path, "r") as f:
+        results = json.load(f)
+    log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
+
+    total = results["total"]
+    metric = "request_count"
+    record(
+        metric,
+        metric_value=total[metric],
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+    metric = "latency_mean"
+    record(
+        metric,
+        metric_value=humantime_to_ms(total[metric]),
+        unit="ms",
+        report=MetricReport.LOWER_IS_BETTER,
+    )
+
+    metric = "latency_percentiles"
+    for k, v in total[metric].items():
+        record(
+            f"{metric}.{k}",
+            metric_value=humantime_to_ms(v),
+            unit="ms",
+            report=MetricReport.LOWER_IS_BETTER,
+        )
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -3,7 +3,6 @@ import os
 from pathlib import Path
 from typing import Any, Dict, Tuple

-import fixtures.pageserver.many_tenants as many_tenants
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.log_helper import log
@@ -15,7 +14,9 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.utils import get_scale_for_db, humantime_to_ms

-from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
+from performance.pageserver.util import (
+    setup_pageserver_with_tenants,
+)


 # For reference, the space usage of the snapshots:
@@ -80,10 +81,77 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(

    for param, (value, kwargs) in params.items():
        record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
-    env = setup_pageserver_with_pgbench_tenants(neon_env_builder, pg_bin, n_tenants, pgbench_scale)
+
+    def setup_wrapper(env: NeonEnv):
+        return setup_tenant_template(env, pg_bin, pgbench_scale)
+
+    env = setup_pageserver_with_tenants(
+        neon_env_builder,
+        f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}",
+        n_tenants,
+        setup_wrapper,
+    )
    run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration)


+def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
+    """
+    Set up a template tenant which will be replicated by the test infra.
+    It's a pgbench tenant, initialized to a certain scale, and afterwards treated
+    with repeated applications of (pgbench simple-update workload, checkpoint, compact).
+    """
+    # use a config that makes production of on-disk state timing-insensitive
+    # as we ingest data into the tenant.
+    config = {
+        "gc_period": "0s",  # disable periodic gc
+        "checkpoint_timeout": "10 years",
+        "compaction_period": "0s",  # disable periodic compaction
+        "compaction_threshold": 10,
+        "compaction_target_size": 134217728,
+        "checkpoint_distance": 268435456,
+        "image_creation_threshold": 3,
+    }
+    template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
+    env.pageserver.tenant_detach(template_tenant)
+    env.pageserver.allowed_errors.append(
+        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        ".*Dropped remote consistent LSN updates.*",
+    )
+    env.pageserver.tenant_attach(template_tenant, config)
+    ps_http = env.pageserver.http_client()
+    with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
+        def flush_checkpoint_compact():
+            wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+            ps_http.timeline_checkpoint(template_tenant, template_timeline)
+            ps_http.timeline_compact(template_tenant, template_timeline)
+
+        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()])
+        flush_checkpoint_compact()
+        # some prime number to avoid potential resonances with the "_threshold" variables
+        # from the config
+        for _ in range(17):
+            # the L0s produced by this appear to have size ~5MiB
+            num_txns = 10_000
+            cmd = ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()]
+            pg_bin.run_capture(cmd)
+            flush_checkpoint_compact()
+    # for reference, the output at scale=6 looked like so (306M total)
+    # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59
+    # total 306M
+    # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829
+    # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919
+    #  33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71
+    #  36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791
+    #  16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1
+    # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9
+    # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639
+    # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799
+    # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19
+    # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021
+
+    return (template_tenant, template_timeline, config)
+
+
 def run_benchmark_max_throughput_latest_lsn(
    env: NeonEnv, pg_bin: PgBin, record, duration_secs: int
 ):
@@ -138,78 +206,3 @@ def run_benchmark_max_throughput_latest_lsn(
            unit="ms",
            report=MetricReport.LOWER_IS_BETTER,
        )
-
-
-def setup_pageserver_with_pgbench_tenants(
-    neon_env_builder: NeonEnvBuilder,
-    pg_bin: PgBin,
-    n_tenants: int,
-    scale: int,
-) -> NeonEnv:
-    """
-    Utility function to set up a pageserver with a given number of identical tenants.
-    Each tenant is a pgbench tenant, initialize to a certain scale, and treated afterwards
-    with a repeat application of (pgbench simple-update workload, checkpoint, compact).
-    """
-
-    def setup_template(env: NeonEnv):
-        # use a config that makes production of on-disk state timing-insensitive
-        # as we ingest data into the tenant.
-        config = {
-            "gc_period": "0s",  # disable periodic gc
-            "checkpoint_timeout": "10 years",
-            "compaction_period": "0s",  # disable periodic compaction
-            "compaction_threshold": 10,
-            "compaction_target_size": 134217728,
-            "checkpoint_distance": 268435456,
-            "image_creation_threshold": 3,
-        }
-        template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
-        env.pageserver.tenant_detach(template_tenant)
-        env.pageserver.allowed_errors.append(
-            # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
-            ".*Dropped remote consistent LSN updates.*",
-        )
-        env.pageserver.tenant_attach(template_tenant, config)
-        ps_http = env.pageserver.http_client()
-        with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
-            pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()])
-            wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
-            ps_http.timeline_checkpoint(template_tenant, template_timeline)
-            ps_http.timeline_compact(template_tenant, template_timeline)
-            for _ in range(
-                0, 17
-            ):  # some prime number to avoid potential resonances with the "_threshold" variables from the config
-                # the L0s produced by this appear to have size ~5MiB
-                num_txns = 10_000
-                pg_bin.run_capture(
-                    ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()]
-                )
-                wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
-                ps_http.timeline_checkpoint(template_tenant, template_timeline)
-                ps_http.timeline_compact(template_tenant, template_timeline)
-        # for reference, the output at scale=6 looked like so (306M total)
-        # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59
-        # total 306M
-        # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829
-        # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919
-        #  33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71
-        #  36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791
-        #  16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1
-        # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9
-        # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639
-        # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799
-        # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19
-        # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021
-
-        return (template_tenant, template_timeline, config)
-
-    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
-        return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants)
-
-    env = neon_env_builder.build_and_use_snapshot(
-        f"max_throughput_latest_lsn-{n_tenants}-{scale}", doit
-    )
-    env.start()
-    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
-    return env
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -2,9 +2,16 @@
 Utilities used by all code in this sub-directory
 """

+from typing import Any, Callable, Dict, Tuple
+
+import fixtures.pageserver.many_tenants as many_tenants
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+)
 from fixtures.pageserver.utils import wait_until_all_tenants_state
+from fixtures.types import TenantId, TimelineId


 def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
@@ -27,3 +34,22 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
                assert not layer.remote

    log.info("ready")
+
+
+def setup_pageserver_with_tenants(
+    neon_env_builder: NeonEnvBuilder,
+    name: str,  # passed to build_and_use_snapshot() as its first argument
+    n_tenants: int,  # forwarded to single_timeline() and to the readiness check
+    setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],  # forwarded as-is
+) -> NeonEnv:
+    """
+    Utility function to set up a pageserver with a given number of identical tenants.
+    """
+
+    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:  # callback for build_and_use_snapshot()
+        return many_tenants.single_timeline(neon_env_builder, setup, n_tenants)
+
+    env = neon_env_builder.build_and_use_snapshot(name, doit)
+    env.start()
+    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
+    return env  # handed back after start() and the readiness check above
--- a/test_runner/performance/test_layer_map.py
+++ b/test_runner/performance/test_layer_map.py
@@ -17,10 +17,10 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
    tenant, _ = env.neon_cli.create_tenant(
        conf={
            "gc_period": "0s",
-            "checkpoint_distance": "16384",
+            "checkpoint_distance": "8192",
            "compaction_period": "1 s",
            "compaction_threshold": "1",
-            "compaction_target_size": "16384",
+            "compaction_target_size": "8192",
        }
    )

--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -160,6 +160,9 @@ def test_fully_custom_config(positive_env: NeonEnv):
        "compaction_target_size": 1048576,
        "checkpoint_distance": 10000,
        "checkpoint_timeout": "13m",
+        "compaction_algorithm": {
+            "kind": "Tiered",
+        },
        "eviction_policy": {
            "kind": "LayerAccessThreshold",
            "period": "20s",
--- a/test_runner/regress/test_layer_bloating.py
+++ b/test_runner/regress/test_layer_bloating.py
@@ -6,6 +6,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnv,
    logical_replication_sync,
+    wait_for_last_flush_lsn,
 )
 from fixtures.pg_version import PgVersion

@@ -52,6 +53,7 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg):
    cur.execute("select create_snapshots(10000)")
    # Wait logical replication to sync
    logical_replication_sync(vanilla_pg, endpoint)
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline)
    time.sleep(10)

    # Check layer file sizes
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -382,6 +382,7 @@ def test_remote_timeline_client_calls_started_metric(
        initial_tenant_conf={
            # small checkpointing and compaction targets to ensure we generate many upload operations
            "checkpoint_distance": f"{128 * 1024}",
+            # ensure each timeline_checkpoint() call creates L1s
            "compaction_threshold": "1",
            "compaction_target_size": f"{128 * 1024}",
            # no PITR horizon, we specify the horizon when we request on-demand GC
@@ -389,8 +390,6 @@ def test_remote_timeline_client_calls_started_metric(
            # disable background compaction and GC. We invoke it manually when we want it to happen.
            "gc_period": "0s",
            "compaction_period": "0s",
-            # create image layers eagerly, so that GC can remove some layers
-            "image_creation_threshold": "1",
        }
    )

@@ -449,12 +448,17 @@ def test_remote_timeline_client_calls_started_metric(
            ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}"

    def churn(data_pass1, data_pass2):
+        # overwrite the same data in place, vacuum in between,
+        # and create image layers; then run a gc().
+        # this should
+        # - create new layers
+        # - delete some layers
        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
-        client.timeline_checkpoint(tenant_id, timeline_id)
-        client.timeline_compact(tenant_id, timeline_id)
        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
-        client.timeline_checkpoint(tenant_id, timeline_id)
-        client.timeline_compact(tenant_id, timeline_id)
+        client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
+        client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
        gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
        print_gc_result(gc_result)
        assert gc_result["layers_removed"] > 0
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -1,13 +1,16 @@
 import time
 from collections import defaultdict
 from datetime import datetime, timezone
-from typing import List
+from typing import Any, Dict, List

+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    AttachmentServiceApiException,
    NeonEnv,
    NeonEnvBuilder,
    PgBin,
+    TokenScope,
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
@@ -176,9 +179,6 @@ def test_node_status_after_restart(
    env.attachment_service.stop()
    env.attachment_service.start()

-    # Initially readiness check should fail because we're trying to connect to the offline node
-    assert env.attachment_service.ready() is False
-
    def is_ready():
        assert env.attachment_service.ready() is True

@@ -457,37 +457,40 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):

    # Initial tenant (1 shard) and the one we just created (2 shards) should be visible
    response = env.attachment_service.request(
-        "GET", f"{env.attachment_service_api}/debug/v1/tenant"
+        "GET",
+        f"{env.attachment_service_api}/debug/v1/tenant",
+        headers=env.attachment_service.headers(TokenScope.ADMIN),
    )
-    response.raise_for_status()
    assert len(response.json()) == 3

    # Scheduler should report the expected nodes and shard counts
    response = env.attachment_service.request(
        "GET", f"{env.attachment_service_api}/debug/v1/scheduler"
    )
-    response.raise_for_status()
    # Two nodes, in a dict of node_id->node
    assert len(response.json()["nodes"]) == 2
    assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
    assert all(v["may_schedule"] for v in response.json()["nodes"].values())

    response = env.attachment_service.request(
-        "POST", f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop"
+        "POST",
+        f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop",
+        headers=env.attachment_service.headers(TokenScope.ADMIN),
    )
-    response.raise_for_status()
    assert len(env.attachment_service.node_list()) == 1

    response = env.attachment_service.request(
-        "POST", f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop"
+        "POST",
+        f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop",
+        headers=env.attachment_service.headers(TokenScope.ADMIN),
    )
-    response.raise_for_status()

    # Tenant drop should be reflected in dump output
    response = env.attachment_service.request(
-        "GET", f"{env.attachment_service_api}/debug/v1/tenant"
+        "GET",
+        f"{env.attachment_service_api}/debug/v1/tenant",
+        headers=env.attachment_service.headers(TokenScope.ADMIN),
    )
-    response.raise_for_status()
    assert len(response.json()) == 1

    # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
@@ -603,3 +606,69 @@ def test_sharding_service_s3_time_travel_recovery(
        endpoint.safe_psql("SELECT * FROM created_foo;")

    env.attachment_service.consistency_check()
+
+
+def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
+    """
+    With auth enabled, check that the tenant-creation, debug and upcall APIs
+    raise AttachmentServiceApiException for requests without a token or with a
+    wrongly scoped token, and that tenant creation passes with the right scope.
+    """
+    neon_env_builder.auth_enabled = True
+    env = neon_env_builder.init_start()
+    svc = env.attachment_service
+    api = env.attachment_service_api
+
+    tenant_id = TenantId.generate()
+    body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}
+
+    # Tenant creation API: no token
+    with pytest.raises(
+        AttachmentServiceApiException,
+        match="Unauthorized: missing authorization header",
+    ):
+        svc.request("POST", f"{api}/v1/tenant", json=body)
+
+    # Tenant creation API: token with incorrect scope
+    with pytest.raises(
+        AttachmentServiceApiException,
+        match="Forbidden: JWT authentication error",
+    ):
+        svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN))
+
+    # Tenant creation API: token with correct scope
+    svc.request(
+        "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API)
+    )
+
+    # Debug API: no token
+    with pytest.raises(
+        AttachmentServiceApiException,
+        match="Unauthorized: missing authorization header",
+    ):
+        svc.request("GET", f"{api}/debug/v1/tenant")
+
+    # Debug API: token with incorrect scope
+    with pytest.raises(
+        AttachmentServiceApiException,
+        match="Forbidden: JWT authentication error",
+    ):
+        svc.request(
+            "GET", f"{api}/debug/v1/tenant", headers=svc.headers(TokenScope.GENERATIONS_API)
+        )
+
+    # Upcall API: no token
+    with pytest.raises(
+        AttachmentServiceApiException,
+        match="Unauthorized: missing authorization header",
+    ):
+        svc.request("POST", f"{api}/upcall/v1/re-attach")
+
+    # Upcall API: token with incorrect scope
+    with pytest.raises(
+        AttachmentServiceApiException,
+        match="Forbidden: JWT authentication error",
+    ):
+        svc.request(
+            "POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API)
+        )
--- a/Show More
+++ b/Show More