Merge pull request #6997 from neondatabase/rc/proxy/2024-03-04

Proxy release 2024-03-04
Merge branch 'release-proxy' into rc/proxy/2024-03-04
2026-06-02 04:50:38 +00:00 · 2024-03-04 17:36:11 +04:00 · 2024-03-04 16:41:46 +04:00 · 2024-03-04 12:33:42 +01:00 · 2024-03-04 10:31:28 +01:00 · 2024-03-04 09:10:04 +00:00
200 changed files with 9626 additions and 2969 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -39,7 +39,7 @@ runs:
        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
        if [ "${PR_NUMBER}" != "null" ]; then
          BRANCH_OR_PR=pr-${PR_NUMBER}
-        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
          # Shortcut for special branches
          BRANCH_OR_PR=${GITHUB_REF_NAME}
        else
--- a/.github/actions/allure-report-store/action.yml
+++ b/.github/actions/allure-report-store/action.yml
@@ -19,7 +19,7 @@ runs:
        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
        if [ "${PR_NUMBER}" != "null" ]; then
          BRANCH_OR_PR=pr-${PR_NUMBER}
-        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
          # Shortcut for special branches
          BRANCH_OR_PR=${GITHUB_REF_NAME}
        else
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -62,7 +62,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
@@ -214,7 +214,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    # Increase timeout to 8h, default timeout is 6h
@@ -362,7 +362,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
@@ -461,7 +461,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
@@ -558,7 +558,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -0,0 +1,110 @@
+# Reusable workflow: builds the build-tools image for x64 and arm64 and combines both into one multi-arch tag.
+# The build job runs only when the check-image job outputs found == 'false'.
+name: Build build-tools image
+
+on:
+  workflow_call:
+    inputs:
+      image-tag:
+        description: "build-tools image tag"
+        required: true
+        type: string
+    outputs:
+      image-tag:
+        description: "build-tools tag"
+        value: ${{ inputs.image-tag }}
+      image:
+        description: "build-tools image"
+        value: neondatabase/build-tools:${{ inputs.image-tag }}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+concurrency:
+  group: build-build-tools-image-${{ inputs.image-tag }}
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+jobs:
+  check-image:
+    uses: ./.github/workflows/check-build-tools-image.yml
+
+  # This job uses older versions of GitHub Actions because it runs on gen2 runners, which don't support Node 20 (required by the newer versions)
+  build-image:
+    needs: [ check-image ]
+    if: needs.check-image.outputs.found == 'false'
+
+    strategy:
+      matrix:
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
+
+    env:
+      IMAGE_TAG: ${{ inputs.image-tag }}
+
+    steps:
+      # Fail early if the tag passed by the caller differs from the tag output by check-image.
+      - name: Check `inputs.image-tag` is correct
+        env:
+          INPUTS_IMAGE_TAG: ${{ inputs.image-tag }}
+          CHECK_IMAGE_TAG: ${{ needs.check-image.outputs.image-tag }}
+        run: |
+          if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then
+            echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the tag of the latest build-tools image 'needs.check-image.outputs.image-tag' (${CHECK_IMAGE_TAG})"
+            exit 1
+          fi
+
+      - uses: actions/checkout@v3
+
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p /tmp/.docker-custom
+          echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
+
+      - uses: docker/setup-buildx-action@v2
+
+      - uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - uses: docker/build-push-action@v4
+        with:
+          context: .
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.build-tools
+          cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }}
+          cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max
+          tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
+
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf /tmp/.docker-custom
+
+  # Publish the per-arch tags pushed by build-image under a single multi-arch tag.
+  merge-images:
+    needs: [ build-image ]
+    runs-on: ubuntu-latest
+
+    env:
+      IMAGE_TAG: ${{ inputs.image-tag }}
+
+    steps:
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Create multi-arch image
+        run: |
+          docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \
+                                             neondatabase/build-tools:${IMAGE_TAG}-x64 \
+                                             neondatabase/build-tools:${IMAGE_TAG}-arm64
--- a/.github/workflows/build_and_push_docker_image.yml
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -1,124 +0,0 @@
-name: Build and Push Docker Image
-
-on:
-  workflow_call:
-    inputs:
-      dockerfile-path:
-        required: true
-        type: string
-      image-name:
-        required: true
-        type: string
-    outputs:
-      build-tools-tag:
-        description: "tag generated for build tools"
-        value: ${{ jobs.tag.outputs.build-tools-tag }}
-
-jobs:
-  check-if-build-tools-dockerfile-changed:
-    runs-on: ubuntu-latest
-    outputs:
-      docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
-    steps:
-      - name: Check if Dockerfile.buildtools has changed
-        id: dockerfile
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
-            echo "docker_file_changed=false" >> $GITHUB_OUTPUT
-            exit
-          fi
-          updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
-          if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
-            echo "docker_file_changed=true" >> $GITHUB_OUTPUT
-          fi
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-  tag:
-    runs-on: ubuntu-latest
-    needs: [ check-if-build-tools-dockerfile-changed ]
-    outputs:
-      build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
-
-    steps:
-      - name: Get buildtools tag
-        env:
-          DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
-            IMAGE_TAG=$GITHUB_RUN_ID
-          else
-            IMAGE_TAG=pinned
-          fi
-
-          echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
-        shell: bash
-        id: buildtools-tag
-
-  kaniko:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    needs: [ tag, check-if-build-tools-dockerfile-changed ]
-    runs-on: [ self-hosted, dev, x64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: |
-          /kaniko/executor \
-            --reproducible \
-            --snapshotMode=redo \
-            --skip-unused-stages \
-            --dockerfile ${{ inputs.dockerfile-path }} \
-            --cache=true \
-            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
-
-  kaniko-arm:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    needs: [ tag, check-if-build-tools-dockerfile-changed ]
-    runs-on: [ self-hosted, dev, arm64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: |
-          /kaniko/executor \
-            --reproducible \
-            --snapshotMode=redo \
-            --skip-unused-stages \
-            --dockerfile ${{ inputs.dockerfile-path }} \
-            --cache=true \
-            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
-
-  manifest:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    name: 'manifest'
-    runs-on: [ self-hosted, dev, x64 ]
-    needs:
-      - tag
-      - kaniko
-      - kaniko-arm
-      - check-if-build-tools-dockerfile-changed
-
-    steps:
-      - name: Create manifest
-        run: |
-          docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
-                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
-                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
-
-      - name: Push manifest
-        run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -5,6 +5,7 @@ on:
    branches:
      - main
      - release
+      - release-proxy
  pull_request:

 defaults:
@@ -67,6 +68,8 @@ jobs:
            echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
            echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
            echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
@@ -74,19 +77,25 @@ jobs:
        shell: bash
        id: build-tag

-  build-buildtools-image:
+  check-build-tools-image:
    needs: [ check-permissions ]
-    uses: ./.github/workflows/build_and_push_docker_image.yml
+    uses: ./.github/workflows/check-build-tools-image.yml
+
+  build-build-tools-image:
+    needs: [ check-build-tools-image ]
+    uses: ./.github/workflows/build-build-tools-image.yml
    with:
-      dockerfile-path: Dockerfile.buildtools
-      image-name: build-tools
+      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
    secrets: inherit

  check-codestyle-python:
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions, build-build-tools-image ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
@@ -115,10 +124,13 @@ jobs:
        run: poetry run mypy .

  check-codestyle-rust:
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions, build-build-tools-image ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
@@ -182,10 +194,13 @@ jobs:
        run: cargo deny check --hide-inclusion-graph

  build-neon:
-    needs: [ check-permissions, tag, build-buildtools-image ]
+    needs: [ check-permissions, tag, build-build-tools-image ]
    runs-on: [ self-hosted, gen3, large ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      # Raise locked memory limit for tokio-epoll-uring.
      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
      # io_uring will account the memory of the CQ and SQ as locked.
@@ -423,10 +438,13 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
+    needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      # for changed limits, see comments on `options:` earlier in this file
      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
    strategy:
@@ -470,10 +488,13 @@ jobs:
  get-benchmarks-durations:
    outputs:
      json: ${{ steps.get-benchmark-durations.outputs.json }}
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions, build-build-tools-image ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    steps:
@@ -500,10 +521,13 @@ jobs:
          echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT

  benchmarks:
-    needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
+    needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      # for changed limits, see comments on `options:` earlier in this file
      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -535,12 +559,15 @@ jobs:
      # while coverage is currently collected for the debug ones

  create-test-report:
-    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}

    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
@@ -581,10 +608,13 @@ jobs:
            })

  coverage-report:
-    needs: [ check-permissions, regress-tests, build-buildtools-image ]
+    needs: [ check-permissions, regress-tests, build-build-tools-image ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init
    strategy:
      fail-fast: false
@@ -682,166 +712,146 @@ jobs:
            })

  trigger-e2e-tests:
-    if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }}
+    if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }}
    needs: [ check-permissions, promote-images, tag ]
    uses: ./.github/workflows/trigger-e2e-tests.yml
    secrets: inherit

  neon-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, build-build-tools-image, tag ]
    runs-on: [ self-hosted, gen3, large ]
-    container: gcr.io/kaniko-project/executor:v1.9.2-debug
-    defaults:
-      run:
-        shell: sh -eu {0}

    steps:
      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 0

-      - name: Configure ECR and Docker Hub login
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
        run: |
-          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
-          echo "::add-mask::${DOCKERHUB_AUTH}"
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+      - uses: docker/setup-buildx-action@v3

-          cat <<-EOF > /kaniko/.docker/config.json
-            {
-              "auths": {
-                "https://index.docker.io/v1/": {
-                  "auth": "${DOCKERHUB_AUTH}"
-                }
-              },
-              "credHelpers": {
-                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
-              }
-            }
-          EOF
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - name: Kaniko build neon
-        run:
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
-                           --context .
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-                           --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
-                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}

-      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
+      - uses: docker/build-push-action@v5
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile
+          cache-from: type=registry,ref=neondatabase/neon:cache
+          cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
+          tags: |
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+            neondatabase/neon:${{needs.tag.outputs.build-tag}}

-  compute-tools-image:
-    runs-on: [ self-hosted, gen3, large ]
-    needs: [ check-permissions, build-buildtools-image, tag ]
-    container: gcr.io/kaniko-project/executor:v1.9.2-debug
-    defaults:
-      run:
-        shell: sh -eu {0}
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
-
-      - name: Configure ECR and Docker Hub login
+      - name: Remove custom docker config directory
+        if: always()
        run: |
-          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
-          echo "::add-mask::${DOCKERHUB_AUTH}"
-
-          cat <<-EOF > /kaniko/.docker/config.json
-            {
-              "auths": {
-                "https://index.docker.io/v1/": {
-                  "auth": "${DOCKERHUB_AUTH}"
-                }
-              },
-              "credHelpers": {
-                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
-              }
-            }
-          EOF
-
-      - name: Kaniko build compute tools
-        run:
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
-                           --context .
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-                           --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-                           --dockerfile Dockerfile.compute-tools
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
-                           --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
-
-      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
+          rm -rf .docker-custom

  compute-node-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, build-build-tools-image, tag ]
    runs-on: [ self-hosted, gen3, large ]
-    container:
-      image: gcr.io/kaniko-project/executor:v1.9.2-debug
-      # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
-      # Should be prevented by https://github.com/neondatabase/neon/issues/4281
-      options: --add-host=download.osgeo.org:140.211.15.30
+
    strategy:
      fail-fast: false
      matrix:
        version: [ v14, v15, v16 ]
-    defaults:
-      run:
-        shell: sh -eu {0}

    steps:
      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 0

-      - name: Configure ECR and Docker Hub login
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
        run: |
-          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
-          echo "::add-mask::${DOCKERHUB_AUTH}"
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+      - uses: docker/setup-buildx-action@v3
+        with:
+          # Disable parallelism for docker buildkit.
+          # As we already build everything with `make -j$(nproc)`, adding another level of parallelism blows up the Runner.
+          config-inline: |
+            [worker.oci]
+              max-parallelism = 1

-          cat <<-EOF > /kaniko/.docker/config.json
-            {
-              "auths": {
-                "https://index.docker.io/v1/": {
-                  "auth": "${DOCKERHUB_AUTH}"
-                }
-              },
-              "credHelpers": {
-                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
-              }
-            }
-          EOF
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - name: Kaniko build compute node with extensions
-        run:
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
-                           --context .
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-                           --build-arg PG_VERSION=${{ matrix.version }}
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-                           --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-                           --dockerfile Dockerfile.compute-node
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
-                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
-                           --cleanup
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}

-      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
+      - name: Build compute-node image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            PG_VERSION=${{ matrix.version }}
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.compute-node
+          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
+          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
+          tags: |
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+            neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+
+      - name: Build compute-tools image
+        # compute-tools are Postgres independent, so build it only once
+        if: ${{ matrix.version == 'v16' }}
+        uses: docker/build-push-action@v5
+        with:
+          target: compute-tools-image
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.compute-node
+          tags: |
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
+            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
+
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom

  vm-compute-node-image:
    needs: [ check-permissions, tag, compute-node-image ]
@@ -885,7 +895,7 @@ jobs:
          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

  test-images:
-    needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
+    needs: [ check-permissions, tag, neon-image, compute-node-image ]
    runs-on: [ self-hosted, gen3, small ]

    steps:
@@ -919,7 +929,8 @@ jobs:
          fi

      - name: Verify docker-compose example
-        run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
+        timeout-minutes: 20
+        run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh

      - name: Print logs and clean up
        if: always()
@@ -952,9 +963,7 @@ jobs:
          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16

      - name: Add latest tag to images
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-           github.event_name != 'workflow_dispatch'
+        if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
        run: |
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -966,9 +975,7 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest

      - name: Push images to production ECR
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-           github.event_name != 'workflow_dispatch'
+        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
        run: |
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
@@ -992,9 +999,7 @@ jobs:
          crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}

      - name: Push latest tags to Docker Hub
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
+        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
        run: |
          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -1084,7 +1089,7 @@ jobs:

  deploy:
    needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
-    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
+    if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'

    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -1119,14 +1124,28 @@ jobs:
            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
+            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
+              -f deployPgSniRouter=false \
+              -f deployProxy=false \
+              -f deployStorage=true \
+              -f deployStorageBroker=true \
+              -f branch=main \
+              -f dockerTag=${{needs.tag.outputs.build-tag}}
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
+              -f deployPgSniRouter=true \
+              -f deployProxy=true \
+              -f deployStorage=false \
+              -f deployStorageBroker=false \
+              -f branch=main \
+              -f dockerTag=${{needs.tag.outputs.build-tag}}
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
            exit 1
          fi

      - name: Create git tag
-        if: github.ref_name == 'release'
+        if: github.ref_name == 'release' || github.ref_name == 'release-proxy'
        uses: actions/github-script@v7
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
@@ -1139,6 +1158,7 @@ jobs:
              sha: context.sha,
            })

+      # TODO: check how GitHub releases look for proxy releases and enable this step for them if it's ok
      - name: Create GitHub release
        if: github.ref_name == 'release'
        uses: actions/github-script@v7
@@ -1190,3 +1210,11 @@ jobs:

            time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
          done
+
+  pin-build-tools-image: # calls the reusable pin-build-tools-image.yml workflow with the image tag output by build-build-tools-image
+    needs: [ build-build-tools-image, promote-images, regress-tests ]
+    if: github.ref_name == 'main' # only runs for the main branch
+    uses: ./.github/workflows/pin-build-tools-image.yml
+    with:
+      from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
+    secrets: inherit
--- a/.github/workflows/check-build-tools-image.yml
+++ b/.github/workflows/check-build-tools-image.yml
@@ -0,0 +1,58 @@
+name: Check build-tools image
+
+on:
+  workflow_call:
+    outputs:
+      image-tag:
+        description: "build-tools image tag"
+        value: ${{ jobs.check-image.outputs.tag }}
+      found:
+        description: "Whether the image is found in the registry"
+        value: ${{ jobs.check-image.outputs.found }}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+jobs:
+  check-image: # resolves the build-tools image tag for this commit and reports whether that tag already exists in the registry
+    runs-on: ubuntu-latest
+    outputs:
+      tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
+      found: ${{ steps.check-image.outputs.found }}
+
+    steps:
+      - name: Get build-tools image tag for the current commit
+        id: get-build-tools-tag # image-tag = SHA of the first commit the GitHub commits API returns for path Dockerfile.build-tools, starting from COMMIT_SHA
+        env:
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} # PR head SHA when the event carries one, otherwise github.sha
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          LAST_BUILD_TOOLS_SHA=$(
+            gh api \
+              -H "Accept: application/vnd.github+json" \
+              -H "X-GitHub-Api-Version: 2022-11-28" \
+              --method GET \
+              --field path=Dockerfile.build-tools \
+              --field sha=${COMMIT_SHA} \
+              --field per_page=1 \
+              --jq ".[0].sha" \
+              "/repos/${GITHUB_REPOSITORY}/commits"
+          )
+          echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT
+
+      - name: Check if such tag found in the registry
+        id: check-image # found=true when `docker manifest inspect` succeeds for neondatabase/build-tools:IMAGE_TAG, false otherwise
+        env:
+          IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}
+        run: |
+          if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then
+            found=true
+          else
+            found=false
+          fi
+
+          echo "found=${found}" | tee -a $GITHUB_OUTPUT
--- a/.github/workflows/cleanup-caches-by-a-branch.yml
+++ b/.github/workflows/cleanup-caches-by-a-branch.yml
@@ -0,0 +1,32 @@
+# A workflow from
+# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries
+
+name: cleanup caches by a branch
+on:
+  pull_request:
+    types:
+      - closed
+
+jobs:
+  cleanup: # lists up to 100 Actions caches for the closed pull request's merge ref and deletes each of them
+    runs-on: ubuntu-latest
+    steps:
+      - name: Cleanup
+        run: |
+          gh extension install actions/gh-actions-cache
+
+          echo "Fetching list of cache key"
+          cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 )
+
+          ## Setting this to not fail the workflow while deleting cache keys.
+          set +e
+          echo "Deleting caches..."
+          for cacheKey in $cacheKeysForPR
+          do
+              gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm
+          done
+          echo "Done"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
+          BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge # ref passed to -B when listing and deleting caches
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -26,6 +26,17 @@ jobs:
    with:
      github-event-name: ${{ github.event_name}}

+  check-build-tools-image:
+    needs: [ check-permissions ]
+    uses: ./.github/workflows/check-build-tools-image.yml
+
+  build-build-tools-image:
+    needs: [ check-build-tools-image ]
+    uses: ./.github/workflows/build-build-tools-image.yml
+    with:
+      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
+    secrets: inherit
+
  check-macos-build:
    needs: [ check-permissions ]
    if: |
@@ -123,7 +134,7 @@ jobs:
        run: ./run_clippy.sh

  check-linux-arm-build:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, build-build-tools-image ]
    timeout-minutes: 90
    runs-on: [ self-hosted, dev, arm64 ]

@@ -137,7 +148,10 @@ jobs:
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
@@ -244,12 +258,15 @@ jobs:
          cargo nextest run --package remote_storage --test test_real_azure

  check-codestyle-rust-arm:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, build-build-tools-image ]
    timeout-minutes: 90
    runs-on: [ self-hosted, dev, arm64 ]

    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
@@ -316,14 +333,17 @@ jobs:
        run: cargo deny check

  gather-rust-build-stats:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, build-build-tools-image ]
    if: |
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
      github.ref_name == 'main'
    runs-on: [ self-hosted, gen3, large ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    env:
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -0,0 +1,72 @@
+name: 'Pin build-tools image'
+
+on:
+  workflow_dispatch:
+    inputs:
+      from-tag:
+        description: 'Source tag'
+        required: true
+        type: string
+  workflow_call:
+    inputs:
+      from-tag:
+        description: 'Source tag'
+        required: true
+        type: string
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+concurrency:
+  group: pin-build-tools-image-${{ inputs.from-tag }}
+
+permissions: {}
+
+jobs:
+  tag-image: # tags the build-tools image FROM_TAG as TO_TAG (`pinned`) in Docker Hub and in ECR
+    runs-on: ubuntu-latest
+
+    env:
+      FROM_TAG: ${{ inputs.from-tag }}
+      TO_TAG: pinned
+
+    steps:
+      - name: Check if we really need to pin the image
+        id: check-manifests # skip=true when the neondatabase/build-tools manifests for FROM_TAG and TO_TAG are identical; every later step runs only when skip == 'false'
+        run: |
+          docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json
+          docker manifest inspect neondatabase/build-tools:${TO_TAG}   > ${TO_TAG}.json
+
+          if diff ${FROM_TAG}.json ${TO_TAG}.json; then
+            skip=true
+          else
+            skip=false
+          fi
+
+          echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
+
+      - uses: docker/login-action@v3
+        if: steps.check-manifests.outputs.skip == 'false'
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
+        if: steps.check-manifests.outputs.skip == 'false'
+        run: |
+          docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
+                                             neondatabase/build-tools:${FROM_TAG}
+
+      - uses: docker/login-action@v3
+        if: steps.check-manifests.outputs.skip == 'false'
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
+        if: steps.check-manifests.outputs.skip == 'false' # the ECR tag is created from the neondatabase/build-tools:FROM_TAG image, not from an ECR source
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
+                                             neondatabase/build-tools:${FROM_TAG}
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,12 +2,31 @@ name: Create Release Branch

 on:
  schedule:
-    - cron: '0 6 * * 1'
+    # These cron expressions must be kept in sync with the if-conditions of the jobs below
+    - cron: '0 6 * * MON' # Storage release
+    - cron: '0 6 * * THU' # Proxy release
  workflow_dispatch:
+    inputs:
+      create-storage-release-branch:
+        type: boolean
+        description: 'Create Storage release PR'
+        required: false
+      create-proxy-release-branch:
+        type: boolean
+        description: 'Create Proxy release PR'
+        required: false
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}

 jobs:
-  create_release_branch:
-    runs-on: [ ubuntu-latest ]
+  create-storage-release-branch:
+    if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
+    runs-on: ubuntu-latest

    permissions:
      contents: write # for `git push`
@@ -18,27 +37,67 @@ jobs:
      with:
        ref: main

-    - name: Get current date
-      id: date
-      run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+    - name: Set environment variables
+      run: |
+        echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
+        echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV

    - name: Create release branch
-      run: git checkout -b releases/${{ steps.date.outputs.date }}
+      run: git checkout -b $RELEASE_BRANCH

    - name: Push new branch
-      run: git push origin releases/${{ steps.date.outputs.date }}
+      run: git push origin $RELEASE_BRANCH

    - name: Create pull request into release
      env:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
        cat << EOF > body.md
-          ## Release ${{ steps.date.outputs.date }}
+          ## Release ${RELEASE_DATE}

-          **Please merge this PR using 'Create a merge commit'!**
+          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF

-        gh pr create --title "Release ${{ steps.date.outputs.date }}" \
+        gh pr create --title "Release ${RELEASE_DATE}" \
                     --body-file "body.md" \
-                     --head "releases/${{ steps.date.outputs.date }}" \
+                     --head "${RELEASE_BRANCH}" \
                     --base "release"
+
+  create-proxy-release-branch:
+    if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: write # for `git push`
+
+    steps:
+    - name: Check out code
+      uses: actions/checkout@v4
+      with:
+        ref: main
+
+    - name: Set environment variables
+      run: |
+        echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
+        echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
+
+    - name: Create release branch
+      run: git checkout -b $RELEASE_BRANCH
+
+    - name: Push new branch
+      run: git push origin $RELEASE_BRANCH
+
+    - name: Create pull request into release # opens a PR from ${RELEASE_BRANCH} into the release-proxy branch
+      env:
+        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} # token the gh CLI uses to create the PR
+      run: |
+        cat << EOF > body.md
+          ## Proxy release ${RELEASE_DATE}
+
+          **Please merge this Pull Request using 'Create a merge commit' button**
+        EOF
+
+        gh pr create --title "Proxy release ${RELEASE_DATE}" \
+                     --body-file "body.md" \
+                     --head "${RELEASE_BRANCH}" \
+                     --base "release-proxy"
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -51,6 +51,8 @@ jobs:
            echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
            echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then # proxy release builds get a release-proxy- prefixed tag
+            echo "tag=release-proxy-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
            BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
--- a/.github/workflows/update_build_tools_image.yml
+++ b/.github/workflows/update_build_tools_image.yml
@@ -1,70 +0,0 @@
-name: 'Update build tools image tag'
-
-# This workflow it used to update tag of build tools in ECR.
-# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
-
-on:
-  workflow_dispatch:
-    inputs:
-      from-tag:
-        description: 'Source tag'
-        required: true
-        type: string
-      to-tag:
-        description: 'Destination tag'
-        required: true
-        type: string
-        default: 'pinned'
-
-defaults:
-  run:
-    shell: bash -euo pipefail {0}
-
-permissions: {}
-
-jobs:
-  tag-image:
-    runs-on: [ self-hosted, gen3, small ]
-
-    env:
-      ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-
-    steps:
-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-
-      - uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
-      - uses: docker/login-action@v2
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-      - uses: actions/setup-go@v5
-        with:
-          go-version: '1.21'
-
-      - name: Install crane
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
-
-      - name: Copy images
-        run: |
-          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
-          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
-
-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ test_output/
 neon.iml
 /.neon
 /integration_tests/.neon
+compaction-suite-results.*

 # Coverage
 *.profraw
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -74,16 +74,11 @@ We're using the following approach to make it work:

 For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)

-## How do I add the "pinned" tag to an buildtools image?
-We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
+## How do I make the build-tools image "pinned"?

-You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
-or using GitHub CLI:
+It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow.

 ```bash
-gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-            -f from-tag=6254913013 \
-            -f to-tag=pinned \
-
-# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
-```
+gh workflow -R neondatabase/neon run pin-build-tools-image.yml \
+            -f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e
+```
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"

 [[package]]
 name = "ahash"
-version = "0.8.5"
+version = "0.8.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
+checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
 dependencies = [
 "cfg-if",
 "const-random",
@@ -1389,9 +1389,9 @@ dependencies = [

 [[package]]
 name = "crc32c"
-version = "0.6.3"
+version = "0.6.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e"
+checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
 dependencies = [
 "rustc_version",
 ]
@@ -3498,6 +3498,7 @@ dependencies = [
 "num_cpus",
 "once_cell",
 "pageserver_api",
+ "pageserver_compaction",
 "pin-project-lite",
 "postgres",
 "postgres-protocol",
@@ -3588,6 +3589,53 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "pageserver_compaction"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-compression",
+ "async-stream",
+ "async-trait",
+ "byteorder",
+ "bytes",
+ "chrono",
+ "clap",
+ "const_format",
+ "consumption_metrics",
+ "criterion",
+ "crossbeam-utils",
+ "either",
+ "fail",
+ "flate2",
+ "futures",
+ "git-version",
+ "hex",
+ "hex-literal",
+ "humantime",
+ "humantime-serde",
+ "itertools",
+ "metrics",
+ "once_cell",
+ "pageserver_api",
+ "pin-project-lite",
+ "rand 0.8.5",
+ "smallvec",
+ "svg_fmt",
+ "sync_wrapper",
+ "thiserror",
+ "tokio",
+ "tokio-io-timeout",
+ "tokio-util",
+ "tracing",
+ "tracing-error",
+ "tracing-subscriber",
+ "url",
+ "utils",
+ "walkdir",
+ "workspace_hack",
+]
+
 [[package]]
 name = "parking"
 version = "2.1.1"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,6 +5,7 @@ members = [
    "control_plane",
    "control_plane/attachment_service",
    "pageserver",
+    "pageserver/compaction",
    "pageserver/ctl",
    "pageserver/client",
    "pageserver/pagebench",
@@ -199,6 +200,7 @@ consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
+pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -786,6 +786,22 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control

+#########################################################################################
+#
+# Layer "pg_partman"
+# compile pg_partman extension
+#
+#########################################################################################
+FROM build-deps AS pg-partman-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
+    echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
+    mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control

 #########################################################################################
 #
@@ -829,6 +845,7 @@ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -874,7 +891,17 @@ ENV BUILD_TAG=$BUILD_TAG
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
-RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
+RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
+
+#########################################################################################
+#
+# Final compute-tools image
+#
+#########################################################################################
+
+FROM debian:bullseye-slim AS compute-tools-image
+
+COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

 #########################################################################################
 #
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -1,32 +0,0 @@
-# First transient image to build compute_tools binaries
-# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
-ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
-ARG TAG=pinned
-ARG BUILD_TAG
-
-FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
-WORKDIR /home/nonroot
-
-# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
-# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
-# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
-ARG RUSTC_WRAPPER=cachepot
-ENV AWS_REGION=eu-central-1
-ENV CACHEPOT_S3_KEY_PREFIX=cachepot
-ARG CACHEPOT_BUCKET=neon-github-dev
-#ARG AWS_ACCESS_KEY_ID
-#ARG AWS_SECRET_ACCESS_KEY
-ARG BUILD_TAG
-ENV BUILD_TAG=$BUILD_TAG
-
-COPY . .
-
-RUN set -e \
-    && mold -run cargo build -p compute_tools --locked --release \
-    && cachepot -s
-
-# Final image that only has one binary
-FROM debian:bullseye-slim
-
-COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.

 ## Quick start
-Try the [Neon Free Tier](https://neon.tech) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
+Try the [Neon Free Tier](https://neon.tech/github) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.

 Alternatively, compile and run the project [locally](#running-local-installation).

@@ -230,6 +230,12 @@ postgres=# select * from t;
 > cargo neon stop
 ```

+More advanced usage is described in [Control Plane and Neon Local](./control_plane/README.md).
+
+#### Handling build failures
+
+If you encounter errors while setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again.
+
 ## Running tests

 Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
@@ -259,6 +265,12 @@ You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or th
 > It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
 > See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).

+## Cleanup
+
+To clean build artifacts out of the source tree, run `make clean` in the source directory.
+
+For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned!
+
 ## Documentation

 [docs](/docs) Contains a top-level overview of all available markdown documentation.
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -18,8 +18,6 @@ use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use postgres::{Client, NoTls};
-use tokio;
-use tokio_postgres;
 use tracing::{debug, error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -82,6 +82,12 @@ pub fn write_postgres_conf(
        ComputeMode::Replica => {
            // hot_standby is 'on' by default, but let's be explicit
            writeln!(file, "hot_standby=on")?;
+
+            // Inform the replica about the primary state
+            // Default is 'false'
+            if let Some(primary_is_running) = spec.primary_is_running {
+                writeln!(file, "neon.primary_is_running={}", primary_is_running)?;
+            }
        }
    }

--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json
    }
 }
 */
-use anyhow::{self, Result};
+use anyhow::Result;
 use anyhow::{bail, Context};
 use bytes::Bytes;
 use compute_api::spec::RemoteExtSpec;
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -13,8 +13,6 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr
 use anyhow::Result;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
-use num_cpus;
-use serde_json;
 use tokio::task;
 use tracing::{error, info, warn};
 use tracing_utils::http::OtelName;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -655,6 +655,9 @@ pub fn handle_grants(
        // remove this code if possible. The worst thing that could happen is that
        // user won't be able to use public schema in NEW databases created in the
        // very OLD project.
+        //
+        // Also, alter default permissions so that relations created by extensions can be
+        // used by neon_superuser without permission issues.
        let grant_query = "DO $$\n\
                BEGIN\n\
                    IF EXISTS(\n\
@@ -673,6 +676,15 @@ pub fn handle_grants(
                            GRANT CREATE ON SCHEMA public TO web_access;\n\
                        END IF;\n\
                    END IF;\n\
+                    IF EXISTS(\n\
+                        SELECT nspname\n\
+                        FROM pg_catalog.pg_namespace\n\
+                        WHERE nspname = 'public'\n\
+                    )\n\
+                    THEN\n\
+                        ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
+                        ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
+                    END IF;\n\
                END\n\
            $$;"
        .to_string();
@@ -777,9 +789,12 @@ BEGIN
 END
 $$;"#,
        "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
-        // ensure tables created by superusers (i.e., when creating extensions) can be used by neon_superuser.
-        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser",
-        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser",
+        // Don't remove: these empty placeholders stand in for SQL statements that were originally applied as migrations but turned out to be executed elsewhere. Keeping them preserves the index of every later migration.
+        "",
+        "",
+        "",
+        "",
+        // Add new migrations below.
    ];

    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
@@ -806,8 +821,13 @@ $$;"#,
    client.simple_query(query)?;

    while current_migration < migrations.len() {
-        info!("Running migration:\n{}\n", migrations[current_migration]);
-        client.simple_query(migrations[current_migration])?;
+        let migration = &migrations[current_migration];
+        if migration.is_empty() {
+            info!("Skip migration id={}", current_migration);
+        } else {
+            info!("Running migration:\n{}\n", migration);
+            client.simple_query(migration)?;
+        }
        current_migration += 1;
    }
    let setval = format!(
--- a/control_plane/README.md
+++ b/control_plane/README.md
@@ -0,0 +1,26 @@
+# Control Plane and Neon Local
+
+This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command.
+
+## Example: Start with Postgres 16
+
+To create and start a local development environment with Postgres 16, you will need to provide the `--pg-version` flag to 3 of the start-up commands.
+
+```shell
+cargo neon init --pg-version 16
+cargo neon start
+cargo neon tenant create --set-default --pg-version 16
+cargo neon endpoint create main --pg-version 16
+cargo neon endpoint start main
+```
+
+## Example: Create Test User and Database
+
+By default, `cargo neon` starts an endpoint with the `cloud_admin` role and the `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint.
+
+```shell
+cargo neon endpoint create main --pg-version 16 --update-catalog true
+cargo neon endpoint start main --create-test-user true
+```
+
+The first command creates `neon_superuser` and the necessary roles. The second command creates the `test` user and the `neondb` database. You will see a connection string that connects you as the test user after running the second command.
--- a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
+++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
@@ -0,0 +1,2 @@
+ALTER TABLE tenant_shards ALTER generation SET NOT NULL;
+ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL;
--- a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql
+++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql
@@ -0,0 +1,4 @@
+
+
+ALTER TABLE tenant_shards ALTER generation DROP NOT NULL;
+ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL;
--- a/control_plane/attachment_service/src/auth.rs
+++ b/control_plane/attachment_service/src/auth.rs
@@ -0,0 +1,9 @@
+use utils::auth::{AuthError, Claims, Scope};
+
+pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> {
+    if claims.scope != required_scope {
+        return Err(AuthError("Scope mismatch. Permission denied".into()));
+    }
+
+    Ok(())
+}
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,17 +1,18 @@
 use crate::reconciler::ReconcileError;
 use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
+use crate::PlacementPolicy;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
-    TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
+    TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
    TenantTimeTravelRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
-use utils::auth::SwappableJwtAuth;
-use utils::http::endpoint::{auth_middleware, request_span};
+use utils::auth::{Scope, SwappableJwtAuth};
+use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
 use utils::http::request::{must_get_query_param, parse_request_param};
 use utils::id::{TenantId, TimelineId};

@@ -25,12 +26,12 @@ use utils::{
    id::NodeId,
 };

-use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};
-
-use control_plane::attachment_service::{
-    AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
-    TenantShardMigrateRequest,
+use pageserver_api::controller_api::{
+    NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
 };
+use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
+
+use control_plane::attachment_service::{AttachHookRequest, InspectRequest};

 /// State available to HTTP request handlers
 #[derive(Clone)]
@@ -64,6 +65,8 @@ fn get_state(request: &Request<Body>) -> &HttpState {

 /// Pageserver calls into this on startup, to learn which tenants it should attach
 async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::GenerationsApi)?;
+
    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
@@ -72,6 +75,8 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
 /// Pageserver calls into this before doing deletions, to confirm that it still
 /// holds the latest generation for the tenants with deletions enqueued
 async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::GenerationsApi)?;
+
    let validate_req = json_request::<ValidateRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(StatusCode::OK, state.service.validate(validate_req))
@@ -81,6 +86,8 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
 /// (in the real control plane this is unnecessary, because the same program is managing
 ///  generation numbers and doing attachments).
 async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
    let state = get_state(&req);

@@ -95,6 +102,8 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
 }

 async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let inspect_req = json_request::<InspectRequest>(&mut req).await?;

    let state = get_state(&req);
@@ -106,10 +115,17 @@ async fn handle_tenant_create(
    service: Arc<Service>,
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::PageServerApi)?;
+
    let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
+
+    // TODO: enable specifying this.  Using Single as a default helps legacy tests to work (they
+    // have no expectation of HA).
+    let placement_policy = PlacementPolicy::Single;
+
    json_response(
        StatusCode::CREATED,
-        service.tenant_create(create_req).await?,
+        service.tenant_create(create_req, placement_policy).await?,
    )
 }

@@ -164,6 +180,8 @@ async fn handle_tenant_location_config(
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
    let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
    json_response(
        StatusCode::OK,
@@ -173,11 +191,34 @@ async fn handle_tenant_location_config(
    )
 }

+async fn handle_tenant_config_set(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
+
+    json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
+}
+
+async fn handle_tenant_config_get(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
+}
+
 async fn handle_tenant_time_travel_remote_storage(
    service: Arc<Service>,
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
    let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;

    let timestamp_raw = must_get_query_param(&req, "travel_to")?;
@@ -202,7 +243,22 @@ async fn handle_tenant_time_travel_remote_storage(
            done_if_after_raw,
        )
        .await?;
+    json_response(StatusCode::OK, ())
+}
 
+/// Calls `Service::tenant_secondary_download` for the tenant named in the request path.
+///
+/// Requires the `PageServerApi` scope, like the other `/v1/tenant/:tenant_id/...` handlers
+/// in this file. Responds with `200 OK` and an empty JSON body once the service call returns.
+async fn handle_tenant_secondary_download(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    // Reject callers without the required scope before doing any work on their behalf.
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    service.tenant_secondary_download(tenant_id).await?;
     json_response(StatusCode::OK, ())
 }

@@ -211,6 +260,7 @@ async fn handle_tenant_delete(
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;

    deletion_wrapper(service, move |service| async move {
        service.tenant_delete(tenant_id).await
@@ -223,6 +273,8 @@ async fn handle_tenant_timeline_create(
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
    let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
    json_response(
        StatusCode::CREATED,
@@ -237,6 +289,8 @@ async fn handle_tenant_timeline_delete(
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;

    deletion_wrapper(service, move |service| async move {
@@ -250,6 +304,7 @@ async fn handle_tenant_timeline_passthrough(
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;

    let Some(path) = req.uri().path_and_query() else {
        // This should never happen, our request router only calls us if there is a path
@@ -293,11 +348,15 @@ async fn handle_tenant_locate(
    service: Arc<Service>,
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }

 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
    let state = get_state(&req);
    state.service.node_register(register_req).await?;
@@ -305,17 +364,23 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
 }

 async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let state = get_state(&req);
    json_response(StatusCode::OK, state.service.node_list().await?)
 }

 async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let state = get_state(&req);
    let node_id: NodeId = parse_request_param(&req, "node_id")?;
    json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
 }

 async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let node_id: NodeId = parse_request_param(&req, "node_id")?;
    let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
    if node_id != config_req.node_id {
@@ -335,6 +400,8 @@ async fn handle_tenant_shard_split(
    service: Arc<Service>,
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;

@@ -348,6 +415,8 @@ async fn handle_tenant_shard_migrate(
    service: Arc<Service>,
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
    let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
    json_response(
@@ -360,22 +429,30 @@ async fn handle_tenant_shard_migrate(

 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
    let state = get_state(&req);

    json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
 }

 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let state = get_state(&req);
    state.service.tenants_dump()
 }

 async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let state = get_state(&req);
    state.service.scheduler_dump()
 }

 async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
    let state = get_state(&req);

    json_response(StatusCode::OK, state.service.consistency_check().await?)
@@ -432,6 +509,12 @@ where
    .await
 }

+fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
+    check_permission_with(request, |claims| {
+        crate::auth::check_permission(claims, required_scope)
+    })
+}
+
 pub fn make_router(
    service: Arc<Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
@@ -503,12 +586,21 @@ pub fn make_router(
        .delete("/v1/tenant/:tenant_id", |r| {
            tenant_service_handler(r, handle_tenant_delete)
        })
+        .put("/v1/tenant/config", |r| {
+            tenant_service_handler(r, handle_tenant_config_set)
+        })
+        .get("/v1/tenant/:tenant_id/config", |r| {
+            tenant_service_handler(r, handle_tenant_config_get)
+        })
        .put("/v1/tenant/:tenant_id/location_config", |r| {
            tenant_service_handler(r, handle_tenant_location_config)
        })
        .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
            tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
        })
+        .post("/v1/tenant/:tenant_id/secondary/download", |r| {
+            tenant_service_handler(r, handle_tenant_secondary_download)
+        })
        // Timeline operations
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
            tenant_service_handler(r, handle_tenant_timeline_delete)
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -1,6 +1,7 @@
 use serde::{Deserialize, Serialize};
 use utils::seqwait::MonotonicCounter;

+mod auth;
 mod compute_hook;
 pub mod http;
 pub mod metrics;
@@ -12,14 +13,20 @@ mod schema;
 pub mod service;
 mod tenant_state;

-#[derive(Clone, Serialize, Deserialize, Debug)]
+#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
 enum PlacementPolicy {
    /// Cheapest way to attach a tenant: just one pageserver, no secondary
    Single,
    /// Production-ready way to attach a tenant: one attached pageserver and
    /// some number of secondaries.
    Double(usize),
-    /// Do not attach to any pageservers
+    /// Create one secondary mode location. This is useful when onboarding
+    /// a tenant, or for an idle tenant that we might want to bring online quickly.
+    Secondary,
+
+    /// Do not attach to any pageservers.  This is appropriate for tenants that
+    /// have been idle for a long time, where we do not mind some delay in making
+    /// them available in future.
    Detached,
 }

--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -9,7 +9,7 @@ use attachment_service::http::make_router;
 use attachment_service::metrics::preinitialize_metrics;
 use attachment_service::persistence::Persistence;
 use attachment_service::service::{Config, Service};
-use aws_config::{self, BehaviorVersion, Region};
+use aws_config::{BehaviorVersion, Region};
 use camino::Utf8PathBuf;
 use clap::Parser;
 use diesel::Connection;
@@ -79,13 +79,38 @@ impl Secrets {
        "neon-storage-controller-control-plane-jwt-token";
    const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";

+    const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
+    const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
+    const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
+    const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
+
+    /// Load secrets from, in order of preference:
+    /// - CLI args if database URL is provided on the CLI
+    /// - Environment variables if DATABASE_URL is set.
+    /// - AWS Secrets Manager secrets
    async fn load(args: &Cli) -> anyhow::Result<Self> {
        match &args.database_url {
            Some(url) => Self::load_cli(url, args),
-            None => Self::load_aws_sm().await,
+            None => match std::env::var(Self::DATABASE_URL_ENV) {
+                Ok(database_url) => Self::load_env(database_url),
+                Err(_) => Self::load_aws_sm().await,
+            },
        }
    }

+    fn load_env(database_url: String) -> anyhow::Result<Self> {
+        let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
+            Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
+            Err(_) => None,
+        };
+        Ok(Self {
+            database_url,
+            public_key,
+            jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
+            control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
+        })
+    }
+
    async fn load_aws_sm() -> anyhow::Result<Self> {
        let Ok(region) = std::env::var("AWS_REGION") else {
            anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -1,4 +1,4 @@
-use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
 use serde::Serialize;
 use utils::id::NodeId;

--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -6,10 +6,12 @@ use std::time::Duration;
 use self::split_state::SplitState;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
-use control_plane::attachment_service::NodeSchedulingPolicy;
 use diesel::pg::PgConnection;
-use diesel::prelude::*;
-use diesel::Connection;
+use diesel::{
+    Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl,
+    Selectable, SelectableHelper,
+};
+use pageserver_api::controller_api::NodeSchedulingPolicy;
 use pageserver_api::models::TenantConfig;
 use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
 use serde::{Deserialize, Serialize};
@@ -331,7 +333,15 @@ impl Persistence {
                shard_number: ShardNumber(tsp.shard_number as u8),
                shard_count: ShardCount::new(tsp.shard_count as u8),
            };
-            result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
+
+            let Some(g) = tsp.generation else {
+                // If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL:
+                // we only set generation_pageserver when setting generation.
+                return Err(DatabaseError::Logical(
+                    "Generation should always be set after incrementing".to_string(),
+                ));
+            };
+            result.insert(tenant_shard_id, Generation::new(g as u32));
        }

        Ok(result)
@@ -364,7 +374,85 @@ impl Persistence {
            })
            .await?;

-        Ok(Generation::new(updated.generation as u32))
+        // Generation is always non-null in the result: if the generation column had been NULL, then we
+        // should have experienced an SQL conflict error while executing a query that tries to increment it.
+        debug_assert!(updated.generation.is_some());
+        let Some(g) = updated.generation else {
+            return Err(DatabaseError::Logical(
+                "Generation should always be set after incrementing".to_string(),
+            )
+            .into());
+        };
+
+        Ok(Generation::new(g as u32))
+    }
+
+    /// For use when updating a persistent property of a tenant, such as its config or placement_policy.
+    ///
+    /// Do not use this for setting generation, unless in the special onboarding code path (the /location_config
+    /// API): use [`Self::increment_generation`] instead.  Setting the generation via this route is a one-time thing
+    /// that we only do the first time a tenant is set to an attached policy via /location_config.
+    pub(crate) async fn update_tenant_shard(
+        &self,
+        tenant_shard_id: TenantShardId,
+        input_placement_policy: PlacementPolicy,
+        input_config: TenantConfig,
+        input_generation: Option<Generation>,
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+
+        self.with_conn(move |conn| {
+            let query = diesel::update(tenant_shards)
+                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
+
+            if let Some(input_generation) = input_generation {
+                // Update includes generation column
+                query
+                    .set((
+                        generation.eq(Some(input_generation.into().unwrap() as i32)),
+                        placement_policy
+                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
+                        config.eq(serde_json::to_string(&input_config).unwrap()),
+                    ))
+                    .execute(conn)?;
+            } else {
+                // Update does not include generation column
+                query
+                    .set((
+                        placement_policy
+                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
+                        config.eq(serde_json::to_string(&input_config).unwrap()),
+                    ))
+                    .execute(conn)?;
+            }
+
+            Ok(())
+        })
+        .await?;
+
+        Ok(())
+    }
+
+    pub(crate) async fn update_tenant_config(
+        &self,
+        input_tenant_id: TenantId,
+        input_config: TenantConfig,
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+
+        self.with_conn(move |conn| {
+            diesel::update(tenant_shards)
+                .filter(tenant_id.eq(input_tenant_id.to_string()))
+                .set((config.eq(serde_json::to_string(&input_config).unwrap()),))
+                .execute(conn)?;
+
+            Ok(())
+        })
+        .await?;
+
+        Ok(())
    }

    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
@@ -375,7 +463,7 @@ impl Persistence {
                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
                .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
                .set((
-                    generation_pageserver.eq(i64::MAX),
+                    generation_pageserver.eq(Option::<i64>::None),
                    placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
                ))
                .execute(conn)?;
@@ -501,12 +589,15 @@ pub(crate) struct TenantShardPersistence {
    pub(crate) shard_stripe_size: i32,

    // Latest generation number: next time we attach, increment this
-    // and use the incremented number when attaching
-    pub(crate) generation: i32,
+    // and use the incremented number when attaching.
+    //
+    // Generation is only None when first onboarding a tenant, where it may
+    // be in PlacementPolicy::Secondary and therefore have no valid generation state.
+    pub(crate) generation: Option<i32>,

    // Currently attached pageserver
    #[serde(rename = "pageserver")]
-    pub(crate) generation_pageserver: i64,
+    pub(crate) generation_pageserver: Option<i64>,

    #[serde(default)]
    pub(crate) placement_policy: String,
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,6 +1,6 @@
 use crate::persistence::Persistence;
 use crate::service;
-use control_plane::attachment_service::NodeAvailability;
+use pageserver_api::controller_api::NodeAvailability;
 use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
@@ -26,7 +26,7 @@ pub(super) struct Reconciler {
    /// of a tenant's state from when we spawned a reconcile task.
    pub(super) tenant_shard_id: TenantShardId,
    pub(crate) shard: ShardIdentity,
-    pub(crate) generation: Generation,
+    pub(crate) generation: Option<Generation>,
    pub(crate) intent: TargetState,
    pub(crate) config: TenantConfig,
    pub(crate) observed: ObservedState,
@@ -312,7 +312,7 @@ impl Reconciler {
            &self.shard,
            &self.config,
            LocationConfigMode::AttachedStale,
-            Some(self.generation),
+            self.generation,
            None,
        );
        self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
@@ -335,16 +335,17 @@ impl Reconciler {
        }

        // Increment generation before attaching to new pageserver
-        self.generation = self
-            .persistence
-            .increment_generation(self.tenant_shard_id, dest_ps_id)
-            .await?;
+        self.generation = Some(
+            self.persistence
+                .increment_generation(self.tenant_shard_id, dest_ps_id)
+                .await?,
+        );

        let dest_conf = build_location_config(
            &self.shard,
            &self.config,
            LocationConfigMode::AttachedMulti,
-            Some(self.generation),
+            self.generation,
            None,
        );

@@ -401,7 +402,7 @@ impl Reconciler {
            &self.shard,
            &self.config,
            LocationConfigMode::AttachedSingle,
-            Some(self.generation),
+            self.generation,
            None,
        );
        self.location_config(dest_ps_id, dest_final_conf.clone(), None)
@@ -433,22 +434,62 @@ impl Reconciler {

        // If the attached pageserver is not attached, do so now.
        if let Some(node_id) = self.intent.attached {
-            let mut wanted_conf =
-                attached_location_conf(self.generation, &self.shard, &self.config);
+            // If we are in an attached policy, then generation must have been set (null generations
+            // are only present when a tenant is initially loaded with a secondary policy)
+            debug_assert!(self.generation.is_some());
+            let Some(generation) = self.generation else {
+                return Err(ReconcileError::Other(anyhow::anyhow!(
+                    "Attempted to attach with NULL generation"
+                )));
+            };
+
+            let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
            match self.observed.locations.get(&node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                    // Nothing to do
                    tracing::info!(%node_id, "Observed configuration already correct.")
                }
-                _ => {
+                observed => {
                    // In all cases other than a matching observed configuration, we will
                    // reconcile this location.  This includes locations with different configurations, as well
                    // as locations with unknown (None) observed state.
-                    self.generation = self
-                        .persistence
-                        .increment_generation(self.tenant_shard_id, node_id)
-                        .await?;
-                    wanted_conf.generation = self.generation.into();
+
+                    // The general case is to increment the generation.  However, there are cases
+                    // where this is not necessary:
+                    // - if we are only updating the TenantConf part of the location
+                    // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
+                    //   and the location was already in the correct generation
+                    let increment_generation = match observed {
+                        None => true,
+                        Some(ObservedStateLocation { conf: None }) => true,
+                        Some(ObservedStateLocation {
+                            conf: Some(observed),
+                        }) => {
+                            let generations_match = observed.generation == wanted_conf.generation;
+
+                            use LocationConfigMode::*;
+                            let mode_transition_requires_gen_inc =
+                                match (observed.mode, wanted_conf.mode) {
+                                    // Usually the short-lived attachment modes (multi and stale) are only used
+                                    // in the case of [`Self::live_migrate`], but it is simple to handle them correctly
+                                    // here too.  Locations are allowed to go Single->Stale and Multi->Single within the same generation.
+                                    (AttachedSingle, AttachedStale) => false,
+                                    (AttachedMulti, AttachedSingle) => false,
+                                    (lhs, rhs) => lhs != rhs,
+                                };
+
+                            !generations_match || mode_transition_requires_gen_inc
+                        }
+                    };
+
+                    if increment_generation {
+                        let generation = self
+                            .persistence
+                            .increment_generation(self.tenant_shard_id, node_id)
+                            .await?;
+                        self.generation = Some(generation);
+                        wanted_conf.generation = generation.into();
+                    }
                    tracing::info!(%node_id, "Observed configuration requires update.");
                    self.location_config(node_id, wanted_conf, None).await?;
                    self.compute_notify().await?;
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -255,7 +255,7 @@ impl Scheduler {
 pub(crate) mod test_utils {

    use crate::node::Node;
-    use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+    use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
    use std::collections::HashMap;
    use utils::id::NodeId;
    /// Test helper: synthesize the requested number of nodes, all in active state.
@@ -284,7 +284,6 @@ pub(crate) mod test_utils {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use utils::id::NodeId;

    use crate::tenant_state::IntentState;
    #[test]
--- a/control_plane/attachment_service/src/schema.rs
+++ b/control_plane/attachment_service/src/schema.rs
@@ -17,8 +17,8 @@ diesel::table! {
        shard_number -> Int4,
        shard_count -> Int4,
        shard_stripe_size -> Int4,
-        generation -> Int4,
-        generation_pageserver -> Int8,
+        generation -> Nullable<Int4>,
+        generation_pageserver -> Nullable<Int8>,
        placement_policy -> Varchar,
        splitting -> Int2,
        config -> Text,
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -9,19 +9,20 @@ use std::{

 use anyhow::Context;
 use control_plane::attachment_service::{
-    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, NodeAvailability,
-    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, TenantCreateResponse,
-    TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
-    TenantShardMigrateRequest, TenantShardMigrateResponse,
+    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
 };
 use diesel::result::DatabaseErrorKind;
 use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
 use pageserver_api::{
-    control_api::{
-        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
-        ValidateResponse, ValidateResponseTenant,
+    controller_api::{
+        NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
+        TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
+        TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
    },
+    models::TenantConfigRequest,
+};
+use pageserver_api::{
    models::{
        self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters,
        TenantConfig, TenantCreateRequest, TenantLocationConfigRequest,
@@ -29,6 +30,10 @@ use pageserver_api::{
        TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo,
    },
    shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
+    upcall_api::{
+        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
+        ValidateResponse, ValidateResponseTenant,
+    },
 };
 use pageserver_client::mgmt_api;
 use tokio_util::sync::CancellationToken;
@@ -63,6 +68,11 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
 // some data in it.
 const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);

+// If we receive a call using Secondary mode initially, it will omit generation.  We will initialize
+// tenant shards into this generation, and as long as it remains in this generation, we will accept
+// input generation from future requests as authoritative.
+const INITIAL_GENERATION: Generation = Generation::new(0);
+
 /// How long [`Service::startup_reconcile`] is allowed to take before it should give
 /// up on unresponsive pageservers and proceed.
 pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
@@ -165,6 +175,21 @@ impl From<ReconcileWaitError> for ApiError {
    }
 }

+#[allow(clippy::large_enum_variant)]
+enum TenantCreateOrUpdate {
+    Create((TenantCreateRequest, PlacementPolicy)),
+    Update(Vec<ShardUpdate>),
+}
+
+struct ShardUpdate {
+    tenant_shard_id: TenantShardId,
+    placement_policy: PlacementPolicy,
+    tenant_config: TenantConfig,
+
+    /// If this is None, generation is not updated.
+    generation: Option<Generation>,
+}
+
 impl Service {
    pub fn get_config(&self) -> &Config {
        &self.config
@@ -569,6 +594,9 @@ impl Service {
        // the shard so that a future [`TenantState::maybe_reconcile`] will try again.
        tenant.pending_compute_notification = result.pending_compute_notification;

+        // Let the TenantState know it is idle.
+        tenant.reconcile_complete(result.sequence);
+
        match result.result {
            Ok(()) => {
                for (node_id, loc) in &result.observed.locations {
@@ -659,8 +687,8 @@ impl Service {
            // after when pageservers start up and register.
            let mut node_ids = HashSet::new();
            for tsp in &tenant_shard_persistence {
-                if tsp.generation_pageserver != i64::MAX {
-                    node_ids.insert(tsp.generation_pageserver);
+                if let Some(node_id) = tsp.generation_pageserver {
+                    node_ids.insert(node_id);
                }
            }
            for node_id in node_ids {
@@ -697,18 +725,15 @@ impl Service {
            // We will populate intent properly later in [`Self::startup_reconcile`], initially populate
            // it with what we can infer: the node for which a generation was most recently issued.
            let mut intent = IntentState::new();
-            if tsp.generation_pageserver != i64::MAX {
-                intent.set_attached(
-                    &mut scheduler,
-                    Some(NodeId(tsp.generation_pageserver as u64)),
-                );
+            if let Some(generation_pageserver) = tsp.generation_pageserver {
+                intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
            }

            let new_tenant = TenantState {
                tenant_shard_id,
                shard: shard_identity,
                sequence: Sequence::initial(),
-                generation: Generation::new(tsp.generation as u32),
+                generation: tsp.generation.map(|g| Generation::new(g as u32)),
                policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
                intent,
                observed: ObservedState::new(),
@@ -788,8 +813,8 @@ impl Service {
                shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
                shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
                shard_stripe_size: 0,
-                generation: 0,
-                generation_pageserver: i64::MAX,
+                generation: Some(0),
+                generation_pageserver: None,
                placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
                splitting: SplitState::default(),
@@ -844,7 +869,7 @@ impl Service {
            .expect("Checked for existence above");

        if let Some(new_generation) = new_generation {
-            tenant_state.generation = new_generation;
+            tenant_state.generation = Some(new_generation);
        } else {
            // This is a detach notification.  We must update placement policy to avoid re-attaching
            // during background scheduling/reconciliation, or during attachment service restart.
@@ -894,7 +919,7 @@ impl Service {
                    node_id,
                    ObservedStateLocation {
                        conf: Some(attached_location_conf(
-                            tenant_state.generation,
+                            tenant_state.generation.unwrap(),
                            &tenant_state.shard,
                            &tenant_state.config,
                        )),
@@ -908,7 +933,7 @@ impl Service {
        Ok(AttachHookResponse {
            gen: attach_req
                .node_id
-                .map(|_| tenant_state.generation.into().unwrap()),
+                .map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
        })
    }

@@ -921,7 +946,7 @@ impl Service {
            attachment: tenant_state.and_then(|s| {
                s.intent
                    .get_attached()
-                    .map(|ps| (s.generation.into().unwrap(), ps))
+                    .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps))
            }),
        }
    }
@@ -936,7 +961,8 @@ impl Service {
            node_id: reattach_req.node_id,
            availability: Some(NodeAvailability::Active),
            scheduling: None,
-        })?;
+        })
+        .await?;

        // Ordering: we must persist generation number updates before making them visible in the in-memory state
        let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?;
@@ -970,7 +996,17 @@ impl Service {
                continue;
            };

-            shard_state.generation = std::cmp::max(shard_state.generation, new_gen);
+            // If [`Persistence::re_attach`] selected this shard, it must have already
+            // had a generation set.
+            debug_assert!(shard_state.generation.is_some());
+            let Some(old_gen) = shard_state.generation else {
+                // Should never happen: [`Persistence::re_attach`] would only return an
+                // incremented generation for a tenant that already had a non-null generation.
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                    "Generation must be set while re-attaching"
+                )));
+            };
+            shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
            if let Some(observed) = shard_state
                .observed
                .locations
@@ -1000,7 +1036,7 @@ impl Service {

        for req_tenant in validate_req.tenants {
            if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
-                let valid = tenant_state.generation == Generation::new(req_tenant.gen);
+                let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen));
                tracing::info!(
                    "handle_validate: {}(gen {}): valid={valid} (latest {:?})",
                    req_tenant.id,
@@ -1027,8 +1063,9 @@ impl Service {
    pub(crate) async fn tenant_create(
        &self,
        create_req: TenantCreateRequest,
+        placement_policy: PlacementPolicy,
    ) -> Result<TenantCreateResponse, ApiError> {
-        let (response, waiters) = self.do_tenant_create(create_req).await?;
+        let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?;

        self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
        Ok(response)
@@ -1037,6 +1074,7 @@ impl Service {
    pub(crate) async fn do_tenant_create(
        &self,
        create_req: TenantCreateRequest,
+        placement_policy: PlacementPolicy,
    ) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
        // This service expects to handle sharding itself: it is an error to try and directly create
        // a particular shard here.
@@ -1062,9 +1100,27 @@ impl Service {
            })
            .collect::<Vec<_>>();

-        // TODO: enable specifying this.  Using Single as a default helps legacy tests to work (they
-        // have no expectation of HA).
-        let placement_policy: PlacementPolicy = PlacementPolicy::Single;
+        // If the caller specifies a None generation, it means "start from default".  This is different
+        // to [`Self::tenant_location_config`], where a None generation is used to represent
+        // an incompletely-onboarded tenant.
+        let initial_generation = if matches!(placement_policy, PlacementPolicy::Secondary) {
+            tracing::info!(
+                "tenant_create: secondary mode, generation is_some={}",
+                create_req.generation.is_some()
+            );
+            create_req.generation.map(Generation::new)
+        } else {
+            tracing::info!(
+                "tenant_create: not secondary mode, generation is_some={}",
+                create_req.generation.is_some()
+            );
+            Some(
+                create_req
+                    .generation
+                    .map(Generation::new)
+                    .unwrap_or(INITIAL_GENERATION),
+            )
+        };

        // Ordering: we persist tenant shards before creating them on the pageserver.  This enables a caller
        // to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart
@@ -1076,8 +1132,10 @@ impl Service {
                shard_number: tenant_shard_id.shard_number.0 as i32,
                shard_count: tenant_shard_id.shard_count.literal() as i32,
                shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
-                generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
-                generation_pageserver: i64::MAX,
+                generation: initial_generation.map(|g| g.into().unwrap() as i32),
+                // The pageserver is not known until scheduling happens: we will set this column when
+                // incrementing the generation the first time we attach to a pageserver.
+                generation_pageserver: None,
                placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                config: serde_json::to_string(&create_req.config).unwrap(),
                splitting: SplitState::default(),
@@ -1117,15 +1175,17 @@ impl Service {
                            ))
                        })?;

-                        response_shards.push(TenantCreateResponseShard {
-                            shard_id: tenant_shard_id,
-                            node_id: entry
+                        if let Some(node_id) = entry.get().intent.get_attached() {
+                            let generation = entry
                                .get()
-                                .intent
-                                .get_attached()
-                                .expect("We just set pageserver if it was None"),
-                            generation: entry.get().generation.into().unwrap(),
-                        });
+                                .generation
+                                .expect("Generation is set when in attached mode");
+                            response_shards.push(TenantCreateResponseShard {
+                                shard_id: tenant_shard_id,
+                                node_id: *node_id,
+                                generation: generation.into().unwrap(),
+                            });
+                        }

                        continue;
                    }
@@ -1139,9 +1199,7 @@ impl Service {
                            placement_policy.clone(),
                        );

-                        if let Some(create_gen) = create_req.generation {
-                            state.generation = Generation::new(create_gen);
-                        }
+                        state.generation = initial_generation;
                        state.config = create_req.config.clone();

                        state.schedule(scheduler).map_err(|e| {
@@ -1150,14 +1208,18 @@ impl Service {
                            ))
                        })?;

-                        response_shards.push(TenantCreateResponseShard {
-                            shard_id: tenant_shard_id,
-                            node_id: state
-                                .intent
-                                .get_attached()
-                                .expect("We just set pageserver if it was None"),
-                            generation: state.generation.into().unwrap(),
-                        });
+                        // Only include shards in result if we are attaching: the purpose
+                        // of the response is to tell the caller where the shards are attached.
+                        if let Some(node_id) = state.intent.get_attached() {
+                            let generation = state
+                                .generation
+                                .expect("Generation is set when in attached mode");
+                            response_shards.push(TenantCreateResponseShard {
+                                shard_id: tenant_shard_id,
+                                node_id: *node_id,
+                                generation: generation.into().unwrap(),
+                            });
+                        }
                        entry.insert(state)
                    }
                };
@@ -1211,12 +1273,114 @@ impl Service {
        Ok(())
    }

-    /// This API is used by the cloud control plane to do coarse-grained control of tenants:
-    /// - Call with mode Attached* to upsert the tenant.
-    /// - Call with mode Detached to switch to PolicyMode::Detached
+    /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
+    /// and transform it into either a tenant creation or a series of shard updates.
+    fn tenant_location_config_prepare(
+        &self,
+        tenant_id: TenantId,
+        req: TenantLocationConfigRequest,
+    ) -> TenantCreateOrUpdate {
+        let mut updates = Vec::new();
+        let mut locked = self.inner.write().unwrap();
+        let (nodes, tenants, _scheduler) = locked.parts_mut();
+
+        // Use location config mode as an indicator of policy.
+        let placement_policy = match req.config.mode {
+            LocationConfigMode::Detached => PlacementPolicy::Detached,
+            LocationConfigMode::Secondary => PlacementPolicy::Secondary,
+            LocationConfigMode::AttachedMulti
+            | LocationConfigMode::AttachedSingle
+            | LocationConfigMode::AttachedStale => {
+                if nodes.len() > 1 {
+                    PlacementPolicy::Double(1)
+                } else {
+                    // Convenience for dev/test: if we just have one pageserver, import
+                    // tenants into Single mode so that scheduling will succeed.
+                    PlacementPolicy::Single
+                }
+            }
+        };
+
+        let mut create = true;
+        for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
+            // Saw an existing shard: this is not a creation
+            create = false;
+
+            // Shards may have initially been created by a Secondary request, where we
+            // would have left generation as None.
+            //
+            // We only update generation the first time we see an attached-mode request,
+            // and if there is no existing generation set. The caller is responsible for
+            // ensuring that no non-storage-controller pageserver ever uses a higher
+            // generation than they passed in here.
+            use LocationConfigMode::*;
+            let set_generation = match req.config.mode {
+                AttachedMulti | AttachedSingle | AttachedStale if shard.generation.is_none() => {
+                    req.config.generation.map(Generation::new)
+                }
+                _ => None,
+            };
+
+            if shard.policy != placement_policy
+                || shard.config != req.config.tenant_conf
+                || set_generation.is_some()
+            {
+                updates.push(ShardUpdate {
+                    tenant_shard_id: *shard_id,
+                    placement_policy: placement_policy.clone(),
+                    tenant_config: req.config.tenant_conf.clone(),
+                    generation: set_generation,
+                });
+            }
+        }
+
+        if create {
+            use LocationConfigMode::*;
+            let generation = match req.config.mode {
+                AttachedMulti | AttachedSingle | AttachedStale => req.config.generation,
+                // If a caller provided a generation in a non-attached request, ignore it
+                // and leave our generation as None: this enables a subsequent update to set
+                // the generation when setting an attached mode for the first time.
+                _ => None,
+            };
+
+            TenantCreateOrUpdate::Create(
+                // Synthesize a creation request
+                (
+                    TenantCreateRequest {
+                        new_tenant_id: TenantShardId::unsharded(tenant_id),
+                        generation,
+                        shard_parameters: ShardParameters {
+                            // Must preserve the incoming shard_count to distinguish unsharded (0)
+                            // from single-sharded (1): this distinction appears in the S3 keys of the tenant.
+                            count: req.tenant_id.shard_count,
+                            // We only import un-sharded or single-sharded tenants, so stripe
+                            // size can be made up arbitrarily here.
+                            stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
+                        },
+                        config: req.config.tenant_conf,
+                    },
+                    placement_policy,
+                ),
+            )
+        } else {
+            TenantCreateOrUpdate::Update(updates)
+        }
+    }
+
+    /// This API is used by the cloud control plane to migrate unsharded tenants that it created
+    /// directly with pageservers into this service.
    ///
-    /// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
-    /// secondary locations.
+    /// Cloud control plane MUST NOT continue issuing GENERATION NUMBERS for this tenant once it
+    /// has attempted to call this API. Failure to comply with this rule may lead to S3 corruption.
+    /// Think of the first attempt to call this API as a transfer of absolute authority over the
+    /// tenant's source of generation numbers.
+    ///
+    /// The mode in this request provides coarse-grained control of tenants:
+    /// - Call with mode Attached* to upsert the tenant.
+    /// - Call with mode Secondary to either onboard a tenant without attaching it, or
+    ///   to set an existing tenant to PolicyMode::Secondary
+    /// - Call with mode Detached to switch to PolicyMode::Detached
    pub(crate) async fn tenant_location_config(
        &self,
        tenant_id: TenantId,
@@ -1228,131 +1392,96 @@ impl Service {
            )));
        }

-        let mut waiters = Vec::new();
+        // First check if this is a creation or an update
+        let create_or_update = self.tenant_location_config_prepare(tenant_id, req);
+
        let mut result = TenantLocationConfigResponse { shards: Vec::new() };
-        let maybe_create = {
-            let mut locked = self.inner.write().unwrap();
-            let result_tx = locked.result_tx.clone();
-            let compute_hook = locked.compute_hook.clone();
-            let (nodes, tenants, scheduler) = locked.parts_mut();
+        let waiters = match create_or_update {
+            TenantCreateOrUpdate::Create((create_req, placement_policy)) => {
+                let (create_resp, waiters) =
+                    self.do_tenant_create(create_req, placement_policy).await?;
+                result.shards = create_resp
+                    .shards
+                    .into_iter()
+                    .map(|s| TenantShardLocation {
+                        node_id: s.node_id,
+                        shard_id: s.shard_id,
+                    })
+                    .collect();
+                waiters
+            }
+            TenantCreateOrUpdate::Update(updates) => {
+                // Persist updates
+                // Ordering: write to the database before applying changes in-memory, so that
+                // we will not appear to time-travel backwards on a restart.
+                for ShardUpdate {
+                    tenant_shard_id,
+                    placement_policy,
+                    tenant_config,
+                    generation,
+                } in &updates
+                {
+                    self.persistence
+                        .update_tenant_shard(
+                            *tenant_shard_id,
+                            placement_policy.clone(),
+                            tenant_config.clone(),
+                            *generation,
+                        )
+                        .await?;
+                }

-            // Maybe we have existing shards
-            let mut create = true;
-            for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
-                // Saw an existing shard: this is not a creation
-                create = false;
+                // Apply updates in-memory
+                let mut waiters = Vec::new();
+                {
+                    let mut locked = self.inner.write().unwrap();
+                    let result_tx = locked.result_tx.clone();
+                    let compute_hook = locked.compute_hook.clone();
+                    let (nodes, tenants, scheduler) = locked.parts_mut();

-                // Note that for existing tenants we do _not_ respect the generation in the request: this is likely
-                // to be stale.  Once a tenant is created in this service, our view of generation is authoritative, and
-                // callers' generations may be ignored.  This represents a one-way migration of tenants from the outer
-                // cloud control plane into this service.
+                    for ShardUpdate {
+                        tenant_shard_id,
+                        placement_policy,
+                        tenant_config,
+                        generation: update_generation,
+                    } in updates
+                    {
+                        let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
+                            tracing::warn!("Shard {tenant_shard_id} removed while updating");
+                            continue;
+                        };

-                // Use location config mode as an indicator of policy: if they ask for
-                // attached we go to default HA attached mode.  If they ask for secondary
-                // we go to secondary-only mode.  If they ask for detached we detach.
-                match req.config.mode {
-                    LocationConfigMode::Detached => {
-                        shard.policy = PlacementPolicy::Detached;
-                    }
-                    LocationConfigMode::Secondary => {
-                        // TODO: implement secondary-only mode.
-                        todo!();
-                    }
-                    LocationConfigMode::AttachedMulti
-                    | LocationConfigMode::AttachedSingle
-                    | LocationConfigMode::AttachedStale => {
-                        // TODO: persistence for changes in policy
-                        if nodes.len() > 1 {
-                            shard.policy = PlacementPolicy::Double(1)
-                        } else {
-                            // Convenience for dev/test: if we just have one pageserver, import
-                            // tenants into Single mode so that scheduling will succeed.
-                            shard.policy = PlacementPolicy::Single
+                        shard.policy = placement_policy;
+                        shard.config = tenant_config;
+                        if let Some(generation) = update_generation {
+                            shard.generation = Some(generation);
+                        }
+
+                        shard.schedule(scheduler)?;
+
+                        let maybe_waiter = shard.maybe_reconcile(
+                            result_tx.clone(),
+                            nodes,
+                            &compute_hook,
+                            &self.config,
+                            &self.persistence,
+                            &self.gate,
+                            &self.cancel,
+                        );
+                        if let Some(waiter) = maybe_waiter {
+                            waiters.push(waiter);
+                        }
+
+                        if let Some(node_id) = shard.intent.get_attached() {
+                            result.shards.push(TenantShardLocation {
+                                shard_id: tenant_shard_id,
+                                node_id: *node_id,
+                            })
                        }
                    }
                }
-
-                shard.schedule(scheduler)?;
-
-                let maybe_waiter = shard.maybe_reconcile(
-                    result_tx.clone(),
-                    nodes,
-                    &compute_hook,
-                    &self.config,
-                    &self.persistence,
-                    &self.gate,
-                    &self.cancel,
-                );
-                if let Some(waiter) = maybe_waiter {
-                    waiters.push(waiter);
-                }
-
-                if let Some(node_id) = shard.intent.get_attached() {
-                    result.shards.push(TenantShardLocation {
-                        shard_id: *shard_id,
-                        node_id: *node_id,
-                    })
-                }
+                waiters
            }
-
-            if create {
-                // Validate request mode
-                match req.config.mode {
-                    LocationConfigMode::Detached | LocationConfigMode::Secondary => {
-                        // When using this API to onboard an existing tenant to this service, it must start in
-                        // an attached state, because we need the request to come with a generation
-                        return Err(ApiError::BadRequest(anyhow::anyhow!(
-                            "Imported tenant must be in attached mode"
-                        )));
-                    }
-
-                    LocationConfigMode::AttachedMulti
-                    | LocationConfigMode::AttachedSingle
-                    | LocationConfigMode::AttachedStale => {
-                        // Pass
-                    }
-                }
-
-                // Validate request generation
-                let Some(generation) = req.config.generation else {
-                    // We can only import attached tenants, because we need the request to come with a generation
-                    return Err(ApiError::BadRequest(anyhow::anyhow!(
-                        "Generation is mandatory when importing tenant"
-                    )));
-                };
-
-                // Synthesize a creation request
-                Some(TenantCreateRequest {
-                    new_tenant_id: TenantShardId::unsharded(tenant_id),
-                    generation: Some(generation),
-                    shard_parameters: ShardParameters {
-                        // Must preserve the incoming shard_count do distinguish unsharded (0)
-                        // from single-sharded (1): this distinction appears in the S3 keys of the tenant.
-                        count: req.tenant_id.shard_count,
-                        // We only import un-sharded or single-sharded tenants, so stripe
-                        // size can be made up arbitrarily here.
-                        stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
-                    },
-                    config: req.config.tenant_conf,
-                })
-            } else {
-                None
-            }
-        };
-
-        let waiters = if let Some(create_req) = maybe_create {
-            let (create_resp, waiters) = self.do_tenant_create(create_req).await?;
-            result.shards = create_resp
-                .shards
-                .into_iter()
-                .map(|s| TenantShardLocation {
-                    node_id: s.node_id,
-                    shard_id: s.shard_id,
-                })
-                .collect();
-            waiters
-        } else {
-            waiters
        };

        if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
@@ -1372,6 +1501,91 @@ impl Service {
        Ok(result)
    }

+    pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
+        let tenant_id = req.tenant_id;
+        let config = req.config;
+
+        self.persistence
+            .update_tenant_config(req.tenant_id, config.clone())
+            .await?;
+
+        let waiters = {
+            let mut waiters = Vec::new();
+            let mut locked = self.inner.write().unwrap();
+            let result_tx = locked.result_tx.clone();
+            let compute_hook = locked.compute_hook.clone();
+            let (nodes, tenants, _scheduler) = locked.parts_mut();
+            for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
+                shard.config = config.clone();
+                if let Some(waiter) = shard.maybe_reconcile(
+                    result_tx.clone(),
+                    nodes,
+                    &compute_hook,
+                    &self.config,
+                    &self.persistence,
+                    &self.gate,
+                    &self.cancel,
+                ) {
+                    waiters.push(waiter);
+                }
+            }
+            waiters
+        };
+
+        if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
+            // Treat this as success because we have stored the configuration.  If e.g.
+            // a node was unavailable at this time, it should not stop us accepting a
+            // configuration change.
+            tracing::warn!(%tenant_id, "Accepted configuration update but reconciliation failed: {e}");
+        }
+
+        Ok(())
+    }
+
+    pub(crate) fn tenant_config_get(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<HashMap<&str, serde_json::Value>, ApiError> {
+        let config = {
+            let locked = self.inner.read().unwrap();
+
+            match locked
+                .tenants
+                .range(TenantShardId::tenant_range(tenant_id))
+                .next()
+            {
+                Some((_tenant_shard_id, shard)) => shard.config.clone(),
+                None => {
+                    return Err(ApiError::NotFound(
+                        anyhow::anyhow!("Tenant not found").into(),
+                    ))
+                }
+            }
+        };
+
+        // Unlike the pageserver, we do not have a set of global defaults: the config is
+        // entirely per-tenant.  Therefore the distinction between `tenant_specific_overrides`
+        // and `effective_config` in the response is meaningless, but we retain that syntax
+        // in order to remain compatible with the pageserver API.
+
+        let response = HashMap::from([
+            (
+                "tenant_specific_overrides",
+                serde_json::to_value(&config)
+                    .context("serializing tenant specific overrides")
+                    .map_err(ApiError::InternalServerError)?,
+            ),
+            (
+                "effective_config",
+                serde_json::to_value(&config)
+                    .context("serializing effective config")
+                    .map_err(ApiError::InternalServerError)?,
+            ),
+        ]);
+
+        Ok(response)
+    }
+
    pub(crate) async fn tenant_time_travel_remote_storage(
        &self,
        time_travel_req: &TenantTimeTravelRequest,
@@ -1457,6 +1671,60 @@ impl Service {
                        })?;
            }
        }
+        Ok(())
+    }
+
+    pub(crate) async fn tenant_secondary_download(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<(), ApiError> {
+        // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to
+        let targets = {
+            let locked = self.inner.read().unwrap();
+            let mut targets = Vec::new();
+
+            for (tenant_shard_id, shard) in
+                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+            {
+                for node_id in shard.intent.get_secondary() {
+                    let node = locked
+                        .nodes
+                        .get(node_id)
+                        .expect("Pageservers may not be deleted while referenced");
+
+                    targets.push((*tenant_shard_id, node.clone()));
+                }
+            }
+            targets
+        };
+
+        // TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running
+        // downloads, they can return a clean 202 response instead of the HTTP client timing out.
+
+        // Issue concurrent requests to all shards' locations
+        let mut futs = FuturesUnordered::new();
+        for (tenant_shard_id, node) in targets {
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            futs.push(async move {
+                let result = client.tenant_secondary_download(tenant_shard_id).await;
+                (result, node)
+            })
+        }
+
+        // Handle any errors returned by pageservers.  This includes cases like this request racing with
+        // a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as
+        // well as more general cases like 503s, 500s, or timeouts.
+        while let Some((result, node)) = futs.next().await {
+            let Err(e) = result else { continue };
+
+            // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever
+            // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache
+            // than they had hoped for.
+            tracing::warn!(
+                "Ignoring tenant secondary download error from pageserver {}: {e}",
+                node.id,
+            );
+        }

        Ok(())
    }
@@ -2036,8 +2304,8 @@ impl Service {
                    // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
                    // populate the correct generation as part of its transaction, to protect us
                    // against racing with changes in the state of the parent.
-                    generation: 0,
-                    generation_pageserver: target.node.id.0 as i64,
+                    generation: None,
+                    generation_pageserver: Some(target.node.id.0 as i64),
                    placement_policy: serde_json::to_string(&policy).unwrap(),
                    // TODO: get the config out of the map
                    config: serde_json::to_string(&TenantConfig::default()).unwrap(),
@@ -2158,7 +2426,8 @@ impl Service {
                        .expect("It was present, we just split it");
                    let old_attached = old_state.intent.get_attached().unwrap();
                    old_state.intent.clear(scheduler);
-                    (old_attached, old_state.generation, old_state.config.clone())
+                    let generation = old_state.generation.expect("Shard must have been attached");
+                    (old_attached, generation, old_state.config.clone())
                };

                for child in child_ids {
@@ -2179,7 +2448,7 @@ impl Service {
                    child_state.observed = ObservedState {
                        locations: child_observed,
                    };
-                    child_state.generation = generation;
+                    child_state.generation = Some(generation);
                    child_state.config = config.clone();

                    // The child's TenantState::splitting is intentionally left at the default value of Idle,
@@ -2244,6 +2513,7 @@ impl Service {
                match shard.policy {
                    PlacementPolicy::Single => {
                        shard.intent.clear_secondary(scheduler);
+                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
                    }
                    PlacementPolicy::Double(_n) => {
                        // If our new attached node was a secondary, it no longer should be.
@@ -2253,6 +2523,12 @@ impl Service {
                        if let Some(old_attached) = old_attached {
                            shard.intent.push_secondary(scheduler, old_attached);
                        }
+
+                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
+                    }
+                    PlacementPolicy::Secondary => {
+                        shard.intent.clear(scheduler);
+                        shard.intent.push_secondary(scheduler, migrate_req.node_id);
                    }
                    PlacementPolicy::Detached => {
                        return Err(ApiError::BadRequest(anyhow::anyhow!(
@@ -2260,9 +2536,6 @@ impl Service {
                        )))
                    }
                }
-                shard
-                    .intent
-                    .set_attached(scheduler, Some(migrate_req.node_id));

                tracing::info!("Migrating: new intent {:?}", shard.intent);
                shard.sequence = shard.sequence.next();
@@ -2590,7 +2863,7 @@ impl Service {
                    observed_loc.conf = None;
                }

-                if tenant_state.intent.notify_offline(config_req.node_id) {
+                if tenant_state.intent.demote_attached(config_req.node_id) {
                    tenant_state.sequence = tenant_state.sequence.next();
                    match tenant_state.schedule(scheduler) {
                        Err(e) => {
@@ -2657,6 +2930,9 @@ impl Service {
    /// Helper for methods that will try and call pageserver APIs for
    /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
    /// is attached somewhere.
+    ///
+    /// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
+    /// an attached policy.  We should error out if it isn't.
    fn ensure_attached_schedule(
        &self,
        mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -1,7 +1,7 @@
 use std::{collections::HashMap, sync::Arc, time::Duration};

 use crate::{metrics, persistence::TenantShardPersistence};
-use control_plane::attachment_service::NodeAvailability;
+use pageserver_api::controller_api::NodeAvailability;
 use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
    shard::{ShardIdentity, TenantShardId},
@@ -53,8 +53,11 @@ pub(crate) struct TenantState {
    pub(crate) sequence: Sequence,

    // Latest generation number: next time we attach, increment this
-    // and use the incremented number when attaching
-    pub(crate) generation: Generation,
+    // and use the incremented number when attaching.
+    //
+    // None represents an incompletely onboarded tenant via the [`Service::tenant_location_config`]
+    // API, where this tenant may only run in PlacementPolicy::Secondary.
+    pub(crate) generation: Option<Generation>,

    // High level description of how the tenant should be set up.  Provided
    // externally.
@@ -181,6 +184,13 @@ impl IntentState {
        }
    }

+    /// Remove the last secondary node from the list of secondaries
+    pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
+        if let Some(node_id) = self.secondary.pop() {
+            scheduler.node_dec_ref(node_id);
+        }
+    }
+
    pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
        if let Some(old_attached) = self.attached.take() {
            scheduler.node_dec_ref(old_attached);
@@ -208,11 +218,13 @@ impl IntentState {
        &self.secondary
    }

-    /// When a node goes offline, we update intents to avoid using it
-    /// as their attached pageserver.
+    /// If the node is in use as the attached location, demote it into
+    /// the list of secondary locations.  This is used when a node goes offline,
+    /// and we want to use a different node for attachment, but not permanently
+    /// forget the location on the offline node.
    ///
    /// Returns true if a change was made
-    pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
+    pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
        if self.attached == Some(node_id) {
            // TODO: when scheduler starts tracking attached + secondary counts separately, we will
            // need to call into it here.
@@ -315,7 +327,7 @@ pub(crate) struct ReconcileResult {
    pub(crate) result: Result<(), ReconcileError>,

    pub(crate) tenant_shard_id: TenantShardId,
-    pub(crate) generation: Generation,
+    pub(crate) generation: Option<Generation>,
    pub(crate) observed: ObservedState,

    /// Set [`TenantState::pending_compute_notification`] from this flag
@@ -340,7 +352,7 @@ impl TenantState {
            tenant_shard_id,
            policy,
            intent: IntentState::default(),
-            generation: Generation::new(0),
+            generation: Some(Generation::new(0)),
            shard,
            observed: ObservedState::default(),
            config: TenantConfig::default(),
@@ -438,10 +450,16 @@ impl TenantState {
        // more work on the same pageservers we're already using.
        let mut modified = false;

+        // Add/remove nodes to fulfil policy
        use PlacementPolicy::*;
        match self.policy {
            Single => {
                // Should have exactly one attached, and zero secondaries
+                if !self.intent.secondary.is_empty() {
+                    self.intent.clear_secondary(scheduler);
+                    modified = true;
+                }
+
                let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
                modified |= modified_attached;

@@ -451,6 +469,23 @@ impl TenantState {
                }
            }
            Double(secondary_count) => {
+                let retain_secondaries = if self.intent.attached.is_none()
+                    && scheduler.node_preferred(&self.intent.secondary).is_some()
+                {
+                    // If we have no attached, and one of the secondaries is eligible to be promoted, retain
+                    // one more secondary than we usually would, as one of them will become attached further down this function.
+                    secondary_count + 1
+                } else {
+                    secondary_count
+                };
+
+                while self.intent.secondary.len() > retain_secondaries {
+                    // We have no particular preference for one secondary location over another: just
+                    // arbitrarily drop from the end
+                    self.intent.pop_secondary(scheduler);
+                    modified = true;
+                }
+
                // Should have exactly one attached, and N secondaries
                let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
                modified |= modified_attached;
@@ -463,15 +498,28 @@ impl TenantState {
                    modified = true;
                }
            }
-            Detached => {
-                // Should have no attached or secondary pageservers
-                if self.intent.attached.is_some() {
-                    self.intent.set_attached(scheduler, None);
+            Secondary => {
+                if let Some(node_id) = self.intent.get_attached() {
+                    // Populate secondary by demoting the attached node
+                    self.intent.demote_attached(*node_id);
+                    modified = true;
+                } else if self.intent.secondary.is_empty() {
+                    // Populate secondary by scheduling a fresh node
+                    let node_id = scheduler.schedule_shard(&[])?;
+                    self.intent.push_secondary(scheduler, node_id);
                    modified = true;
                }
-
-                if !self.intent.secondary.is_empty() {
-                    self.intent.clear_secondary(scheduler);
+                while self.intent.secondary.len() > 1 {
+                    // We have no particular preference for one secondary location over another: just
+                    // arbitrarily drop from the end
+                    self.intent.pop_secondary(scheduler);
+                    modified = true;
+                }
+            }
+            Detached => {
+                // Never add locations in this mode
+                if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() {
+                    self.intent.clear(scheduler);
                    modified = true;
                }
            }
@@ -518,7 +566,12 @@ impl TenantState {

    fn dirty(&self) -> bool {
        if let Some(node_id) = self.intent.attached {
-            let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
+            // Maybe panic: it is a severe bug if we try to attach while generation is null.
+            let generation = self
+                .generation
+                .expect("Attempted to enter attached state without a generation");
+
+            let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
            match self.observed.locations.get(&node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                Some(_) | None => {
@@ -596,6 +649,10 @@ impl TenantState {
        // Reconcile already in flight for the current sequence?
        if let Some(handle) = &self.reconciler {
            if handle.sequence == self.sequence {
+                tracing::info!(
+                    "Reconciliation already in progress for sequence {:?}",
+                    self.sequence,
+                );
                return Some(ReconcilerWaiter {
                    tenant_shard_id: self.tenant_shard_id,
                    seq_wait: self.waiter.clone(),
@@ -615,6 +672,10 @@ impl TenantState {
            return None;
        };

+        // Advance the sequence before spawning a reconciler, so that sequence waiters
+        // can distinguish between before+after the reconcile completes.
+        self.sequence = self.sequence.next();
+
        let reconciler_cancel = cancel.child_token();
        let mut reconciler = Reconciler {
            tenant_shard_id: self.tenant_shard_id,
@@ -716,6 +777,17 @@ impl TenantState {
        })
    }

+    /// Called when a ReconcileResult has been emitted and the service is updating
+    /// our state: if the result is from a sequence >= my ReconcileHandle, then drop
+    /// the handle to indicate there is no longer a reconciliation in progress.
+    pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) {
+        if let Some(reconcile_handle) = &self.reconciler {
+            if reconcile_handle.sequence <= sequence {
+                self.reconciler = None;
+            }
+        }
+    }
+
    // If we had any state at all referring to this node ID, drop it.  Does not
    // attempt to reschedule.
    pub(crate) fn deref_node(&mut self, node_id: NodeId) {
@@ -736,13 +808,8 @@ impl TenantState {
            shard_number: self.tenant_shard_id.shard_number.0 as i32,
            shard_count: self.tenant_shard_id.shard_count.literal() as i32,
            shard_stripe_size: self.shard.stripe_size.0 as i32,
-            generation: self.generation.into().unwrap_or(0) as i32,
-            generation_pageserver: self
-                .intent
-                .get_attached()
-                .map(|n| n.0 as i64)
-                .unwrap_or(i64::MAX),
-
+            generation: self.generation.map(|g| g.into().unwrap_or(0) as i32),
+            generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64),
            placement_policy: serde_json::to_string(&self.policy).unwrap(),
            config: serde_json::to_string(&self.config).unwrap(),
            splitting: SplitState::default(),
@@ -805,8 +872,10 @@ pub(crate) mod tests {
        assert_ne!(attached_node_id, secondary_node_id);

        // Notifying the attached node is offline should demote it to a secondary
-        let changed = tenant_state.intent.notify_offline(attached_node_id);
+        let changed = tenant_state.intent.demote_attached(attached_node_id);
        assert!(changed);
+        assert!(tenant_state.intent.attached.is_none());
+        assert_eq!(tenant_state.intent.secondary.len(), 2);

        // Update the scheduler state to indicate the node is offline
        nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -2,8 +2,12 @@ use crate::{background_process, local_env::LocalEnv};
 use camino::{Utf8Path, Utf8PathBuf};
 use hyper::Method;
 use pageserver_api::{
+    controller_api::{
+        NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
+        TenantShardMigrateRequest, TenantShardMigrateResponse,
+    },
    models::{
-        ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
+        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
        TimelineCreateRequest, TimelineInfo,
    },
    shard::TenantShardId,
@@ -11,12 +15,12 @@ use pageserver_api::{
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::str::FromStr;
+use std::{fs, str::FromStr};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
 use utils::{
-    auth::{Claims, Scope},
+    auth::{encode_from_key_file, Claims, Scope},
    id::{NodeId, TenantId},
 };

@@ -24,7 +28,7 @@ pub struct AttachmentService {
    env: LocalEnv,
    listen: String,
    path: Utf8PathBuf,
-    jwt_token: Option<String>,
+    private_key: Option<Vec<u8>>,
    public_key: Option<String>,
    postgres_port: u16,
    client: reqwest::Client,
@@ -55,126 +59,6 @@ pub struct InspectResponse {
    pub attachment: Option<(u32, NodeId)>,
 }

-#[derive(Serialize, Deserialize)]
-pub struct TenantCreateResponseShard {
-    pub shard_id: TenantShardId,
-    pub node_id: NodeId,
-    pub generation: u32,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct TenantCreateResponse {
-    pub shards: Vec<TenantCreateResponseShard>,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct NodeRegisterRequest {
-    pub node_id: NodeId,
-
-    pub listen_pg_addr: String,
-    pub listen_pg_port: u16,
-
-    pub listen_http_addr: String,
-    pub listen_http_port: u16,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct NodeConfigureRequest {
-    pub node_id: NodeId,
-
-    pub availability: Option<NodeAvailability>,
-    pub scheduling: Option<NodeSchedulingPolicy>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantLocateResponseShard {
-    pub shard_id: TenantShardId,
-    pub node_id: NodeId,
-
-    pub listen_pg_addr: String,
-    pub listen_pg_port: u16,
-
-    pub listen_http_addr: String,
-    pub listen_http_port: u16,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct TenantLocateResponse {
-    pub shards: Vec<TenantLocateResponseShard>,
-    pub shard_params: ShardParameters,
-}
-
-/// Explicitly migrating a particular shard is a low level operation
-/// TODO: higher level "Reschedule tenant" operation where the request
-/// specifies some constraints, e.g. asking it to get off particular node(s)
-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantShardMigrateRequest {
-    pub tenant_shard_id: TenantShardId,
-    pub node_id: NodeId,
-}
-
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
-pub enum NodeAvailability {
-    // Normal, happy state
-    Active,
-    // Offline: Tenants shouldn't try to attach here, but they may assume that their
-    // secondary locations on this node still exist.  Newly added nodes are in this
-    // state until we successfully contact them.
-    Offline,
-}
-
-impl FromStr for NodeAvailability {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self::Active),
-            "offline" => Ok(Self::Offline),
-            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
-        }
-    }
-}
-
-/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
-/// type needs to be defined with diesel traits in there.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
-pub enum NodeSchedulingPolicy {
-    Active,
-    Filling,
-    Pause,
-    Draining,
-}
-
-impl FromStr for NodeSchedulingPolicy {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self::Active),
-            "filling" => Ok(Self::Filling),
-            "pause" => Ok(Self::Pause),
-            "draining" => Ok(Self::Draining),
-            _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
-        }
-    }
-}
-
-impl From<NodeSchedulingPolicy> for String {
-    fn from(value: NodeSchedulingPolicy) -> String {
-        use NodeSchedulingPolicy::*;
-        match value {
-            Active => "active",
-            Filling => "filling",
-            Pause => "pause",
-            Draining => "draining",
-        }
-        .to_string()
-    }
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantShardMigrateResponse {}
-
 impl AttachmentService {
    pub fn from_env(env: &LocalEnv) -> Self {
        let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
@@ -204,12 +88,11 @@ impl AttachmentService {
            .pageservers
            .first()
            .expect("Config is validated to contain at least one pageserver");
-        let (jwt_token, public_key) = match ps_conf.http_auth_type {
+        let (private_key, public_key) = match ps_conf.http_auth_type {
            AuthType::Trust => (None, None),
            AuthType::NeonJWT => {
-                let jwt_token = env
-                    .generate_auth_token(&Claims::new(None, Scope::PageServerApi))
-                    .unwrap();
+                let private_key_path = env.get_private_key_path();
+                let private_key = fs::read(private_key_path).expect("failed to read private key");

                // If pageserver auth is enabled, this implicitly enables auth for this service,
                // using the same credentials.
@@ -235,7 +118,7 @@ impl AttachmentService {
                } else {
                    std::fs::read_to_string(&public_key_path).expect("Can't read public key")
                };
-                (Some(jwt_token), Some(public_key))
+                (Some(private_key), Some(public_key))
            }
        };

@@ -243,7 +126,7 @@ impl AttachmentService {
            env: env.clone(),
            path,
            listen,
-            jwt_token,
+            private_key,
            public_key,
            postgres_port,
            client: reqwest::ClientBuilder::new()
@@ -317,7 +200,7 @@ impl AttachmentService {
                "localhost",
                "-p",
                &format!("{}", self.postgres_port),
-                &DB_NAME,
+                DB_NAME,
            ])
            .output()
            .await
@@ -397,7 +280,10 @@ impl AttachmentService {
        .into_iter()
        .map(|s| s.to_string())
        .collect::<Vec<_>>();
-        if let Some(jwt_token) = &self.jwt_token {
+        if let Some(private_key) = &self.private_key {
+            let claims = Claims::new(None, Scope::PageServerApi);
+            let jwt_token =
+                encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
            args.push(format!("--jwt-token={jwt_token}"));
        }

@@ -422,7 +308,7 @@ impl AttachmentService {
            )],
            background_process::InitialPidFile::Create(self.pid_file()),
            || async {
-                match self.status().await {
+                match self.ready().await {
                    Ok(_) => Ok(true),
                    Err(_) => Ok(false),
                }
@@ -468,6 +354,20 @@ impl AttachmentService {
        Ok(())
    }

+    /// Picks the JWT claims a request needs from the first segment of `path`.
+    ///
+    /// `Ok(None)` means the endpoint needs no claims; an unknown prefix is an error.
+    fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
+        let prefix = path.split('/').next().unwrap_or(path);
+        let scope = match prefix {
+            "status" | "ready" => return Ok(None),
+            "control" | "debug" => Scope::Admin,
+            "v1" => Scope::PageServerApi,
+            _ => anyhow::bail!("Failed to determine claims for {}", path),
+        };
+        Ok(Some(Claims::new(None, scope)))
+    }
+
    /// Simple HTTP request wrapper for calling into attachment service
    async fn dispatch<RQ, RS>(
        &self,
@@ -493,11 +393,16 @@ impl AttachmentService {
        if let Some(body) = body {
            builder = builder.json(&body)
        }
-        if let Some(jwt_token) = &self.jwt_token {
-            builder = builder.header(
-                reqwest::header::AUTHORIZATION,
-                format!("Bearer {jwt_token}"),
-            );
+        if let Some(private_key) = &self.private_key {
+            // The token is minted per request because the required scope depends on the path.
+            if let Some(required_claims) = Self::get_claims_for_path(&path)? {
+                tracing::debug!("Got claims {:?} for path {}", required_claims, path);
+                let jwt_token = encode_from_key_file(&required_claims, private_key)?;
+                builder = builder.header(
+                    reqwest::header::AUTHORIZATION,
+                    format!("Bearer {jwt_token}"),
+                );
+            }
        }

        let response = builder.send().await?;
@@ -617,8 +522,8 @@ impl AttachmentService {
    }

    #[instrument(skip(self))]
-    pub async fn status(&self) -> anyhow::Result<()> {
-        self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
+    pub async fn ready(&self) -> anyhow::Result<()> {
+        self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None)
            .await
    }

--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,14 +8,15 @@
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
 use compute_api::spec::ComputeMode;
-use control_plane::attachment_service::{
-    AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
-};
+use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::{InitForceMode, LocalEnv};
 use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::{broker, local_env};
+use pageserver_api::controller_api::{
+    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
+};
 use pageserver_api::models::{
    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -590,6 +590,7 @@ impl Endpoint {
            remote_extensions,
            pgbouncer_settings: None,
            shard_stripe_size: Some(shard_stripe_size),
+            primary_is_running: None,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -604,7 +605,7 @@ impl Endpoint {
        let conn_str = self.connstr("cloud_admin", "postgres");
        println!("Starting postgres node at '{}'", conn_str);
        if create_test_user {
-            let conn_str = self.connstr("user", "neondb");
+            let conn_str = self.connstr("test", "neondb");
            println!("Also at '{}'", conn_str);
        }
        let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -412,14 +412,17 @@ impl LocalEnv {

    // this function is used only for testing purposes in CLI e g generate tokens during init
    pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
-        let private_key_path = if self.private_key_path.is_absolute() {
+        let private_key_path = self.get_private_key_path();
+        let key_data = fs::read(private_key_path)?;
+        encode_from_key_file(claims, &key_data)
+    }
+
+    pub fn get_private_key_path(&self) -> PathBuf {
+        if self.private_key_path.is_absolute() {
            self.private_key_path.to_path_buf()
        } else {
            self.base_data_dir.join(&self.private_key_path)
-        };
-
-        let key_data = fs::read(private_key_path)?;
-        encode_from_key_file(claims, &key_data)
+        }
    }

    //
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -17,6 +17,7 @@ use std::time::Duration;
 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
 use futures::SinkExt;
+use pageserver_api::controller_api::NodeRegisterRequest;
 use pageserver_api::models::{
    self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
 };
@@ -30,7 +31,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
+use crate::attachment_service::AttachmentService;
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

@@ -115,7 +116,7 @@ impl PageServerNode {
            if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
                let jwt_token = self
                    .env
-                    .generate_auth_token(&Claims::new(None, Scope::PageServerApi))
+                    .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
                    .unwrap();
                overrides.push(format!("control_plane_api_token='{}'", jwt_token));
            }
@@ -352,6 +353,11 @@ impl PageServerNode {
                .remove("compaction_threshold")
                .map(|x| x.parse::<usize>())
                .transpose()?,
+            compaction_algorithm: settings
+                .remove("compaction_algorithm")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("Failed to parse 'compaction_algorithm' json")?,
            gc_horizon: settings
                .remove("gc_horizon")
                .map(|x| x.parse::<u64>())
@@ -391,11 +397,6 @@ impl PageServerNode {
            evictions_low_residence_duration_metric_threshold: settings
                .remove("evictions_low_residence_duration_metric_threshold")
                .map(|x| x.to_string()),
-            gc_feedback: settings
-                .remove("gc_feedback")
-                .map(|x| x.parse::<bool>())
-                .transpose()
-                .context("Failed to parse 'gc_feedback' as bool")?,
            heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
            lazy_slru_download: settings
                .remove("lazy_slru_download")
@@ -460,6 +461,11 @@ impl PageServerNode {
                    .map(|x| x.parse::<usize>())
                    .transpose()
                    .context("Failed to parse 'compaction_threshold' as an integer")?,
+                compaction_algorithm: settings
+                    .remove("compaction_algorithm")
+                    .map(serde_json::from_str)
+                    .transpose()
+                    .context("Failed to parse 'compaction_algorithm' json")?,
                gc_horizon: settings
                    .remove("gc_horizon")
                    .map(|x| x.parse::<u64>())
@@ -501,11 +507,6 @@ impl PageServerNode {
                evictions_low_residence_duration_metric_threshold: settings
                    .remove("evictions_low_residence_duration_metric_threshold")
                    .map(|x| x.to_string()),
-                gc_feedback: settings
-                    .remove("gc_feedback")
-                    .map(|x| x.parse::<bool>())
-                    .transpose()
-                    .context("Failed to parse 'gc_feedback' as bool")?,
                heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
                lazy_slru_download: settings
                    .remove("lazy_slru_download")
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -70,6 +70,9 @@ Should only be used e.g. for status check/tenant creation/list.
 Should only be used e.g. for status check.
 Currently also used for connection from any pageserver to any safekeeper.

+"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
+
+"admin": Provides access to the control plane and admin APIs of the attachment service.

 ### CLI
 CLI generates a key pair during call to `neon_local init` with the following commands:
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -79,6 +79,12 @@ pub struct ComputeSpec {
    // Stripe size for pageserver sharding, in pages
    #[serde(default)]
    pub shard_stripe_size: Option<usize>,
+
+    // When we are starting a new replica in hot standby mode,
+    // we need to know if the primary is running.
+    // This is used to determine if replica should wait for
+    // RUNNING_XACTS from primary or not.
+    pub primary_is_running: Option<bool>,
 }

 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -0,0 +1,129 @@
+//! Request/response types for the storage controller API (`/control/v1` prefix).
+//! Implemented by the server in `attachment_service::http`.
+
+use std::str::FromStr;
+
+use serde::{Deserialize, Serialize};
+use utils::id::NodeId;
+
+use crate::{models::ShardParameters, shard::TenantShardId};
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantCreateResponseShard {
+    pub shard_id: TenantShardId,
+    pub node_id: NodeId,
+    pub generation: u32,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantCreateResponse {
+    pub shards: Vec<TenantCreateResponseShard>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct NodeRegisterRequest {
+    pub node_id: NodeId,
+
+    pub listen_pg_addr: String,
+    pub listen_pg_port: u16,
+
+    pub listen_http_addr: String,
+    pub listen_http_port: u16,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct NodeConfigureRequest {
+    pub node_id: NodeId,
+
+    pub availability: Option<NodeAvailability>,
+    pub scheduling: Option<NodeSchedulingPolicy>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantLocateResponseShard {
+    pub shard_id: TenantShardId,
+    pub node_id: NodeId,
+
+    pub listen_pg_addr: String,
+    pub listen_pg_port: u16,
+
+    pub listen_http_addr: String,
+    pub listen_http_port: u16,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantLocateResponse {
+    pub shards: Vec<TenantLocateResponseShard>,
+    pub shard_params: ShardParameters,
+}
+
+/// Explicitly migrating a particular shard is a low level operation
+/// TODO: higher level "Reschedule tenant" operation where the request
+/// specifies some constraints, e.g. asking it to get off particular node(s)
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantShardMigrateRequest {
+    pub tenant_shard_id: TenantShardId,
+    pub node_id: NodeId,
+}
+
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+pub enum NodeAvailability {
+    // Normal, happy state
+    Active,
+    // Offline: Tenants shouldn't try to attach here, but they may assume that their
+    // secondary locations on this node still exist.  Newly added nodes are in this
+    // state until we successfully contact them.
+    Offline,
+}
+
+impl FromStr for NodeAvailability {
+    type Err = anyhow::Error;
+    /// Accepts exactly `active` or `offline`; any other input is an error.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        Ok(match s {
+            "active" => Self::Active,
+            "offline" => Self::Offline,
+            _ => anyhow::bail!("Unknown availability state '{s}'"),
+        })
+    }
+}
+
+/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
+/// type needs to be defined with diesel traits in there.
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+pub enum NodeSchedulingPolicy {
+    Active,
+    Filling,
+    Pause,
+    Draining,
+}
+
+impl FromStr for NodeSchedulingPolicy {
+    type Err = anyhow::Error;
+    /// Accepts `active`, `filling`, `pause` or `draining`; any other input is an error.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        Ok(match s {
+            "active" => Self::Active,
+            "filling" => Self::Filling,
+            "pause" => Self::Pause,
+            "draining" => Self::Draining,
+            _ => anyhow::bail!("Unknown scheduling state '{s}'"),
+        })
+    }
+}
+
+impl From<NodeSchedulingPolicy> for String {
+    /// Renders the policy as its lowercase name.
+    fn from(value: NodeSchedulingPolicy) -> String {
+        let name = match value {
+            NodeSchedulingPolicy::Active => "active",
+            NodeSchedulingPolicy::Filling => "filling",
+            NodeSchedulingPolicy::Pause => "pause",
+            NodeSchedulingPolicy::Draining => "draining",
+        };
+        String::from(name)
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantShardMigrateResponse {}
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -307,6 +307,7 @@ impl KeySpaceRandomAccum {
    }
 }

+#[inline(always)]
 pub fn key_range_size(key_range: &Range<Key>) -> u32 {
    let start = key_range.start;
    let end = key_range.end;
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -2,13 +2,14 @@
 #![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

-/// Public API types
-pub mod control_api;
+pub mod controller_api;
 pub mod key;
 pub mod keyspace;
 pub mod models;
 pub mod reltag;
 pub mod shard;
+/// Public API types
+pub mod upcall_api;

 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -14,7 +14,6 @@ use byteorder::{BigEndian, ReadBytesExt};
 use postgres_ffi::BLCKSZ;
 use serde::{Deserialize, Serialize};
 use serde_with::serde_as;
-use strum_macros;
 use utils::{
    completion,
    history_buffer::HistoryBufferWithDropCounter,
@@ -272,6 +271,8 @@ pub struct TenantConfig {
    pub compaction_target_size: Option<u64>,
    pub compaction_period: Option<String>,
    pub compaction_threshold: Option<usize>,
+    // defer parsing compaction_algorithm, like eviction_policy
+    pub compaction_algorithm: Option<CompactionAlgorithm>,
    pub gc_horizon: Option<u64>,
    pub gc_period: Option<String>,
    pub image_creation_threshold: Option<usize>,
@@ -283,7 +284,6 @@ pub struct TenantConfig {
    pub eviction_policy: Option<EvictionPolicy>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
-    pub gc_feedback: Option<bool>,
    pub heatmap_period: Option<String>,
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
@@ -307,6 +307,13 @@ impl EvictionPolicy {
    }
 }

+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(tag = "kind")]
+pub enum CompactionAlgorithm {
+    Legacy,
+    Tiered,
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
    #[serde(with = "humantime_serde")]
@@ -1069,7 +1076,6 @@ impl PagestreamBeMessage {

 #[cfg(test)]
 mod tests {
-    use bytes::Buf;
    use serde_json::json;

    use super::*;
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -6,7 +6,6 @@ use crate::{
 };
 use hex::FromHex;
 use serde::{Deserialize, Serialize};
-use thiserror;
 use utils::id::TenantId;

 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
@@ -656,10 +655,7 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke

 #[cfg(test)]
 mod tests {
-    use std::str::FromStr;
-
-    use bincode;
-    use utils::{id::TenantId, Hex};
+    use utils::Hex;

    use super::*;

--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -80,6 +80,9 @@ pub const XLOG_XACT_ABORT: u8 = 0x20;
 pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
 pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;

+// From standbydefs.h
+pub const XLOG_RUNNING_XACTS: u8 = 0x10;
+
 // From srlu.h
 pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
 pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize;
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -119,11 +119,6 @@ pub fn generate_pg_control(
    // Generate new pg_control needed for bootstrap
    checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;

-    //reset some fields we don't want to preserve
-    //TODO Check this.
-    //We may need to determine the value from twophase data.
-    checkpoint.oldestActiveXid = 0;
-
    //save new values in pg_control
    pg_control.checkPoint = 0;
    pg_control.checkPointCopy = checkpoint;
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -623,9 +623,7 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
 mod fs_tests {
    use super::*;

-    use bytes::Bytes;
    use camino_tempfile::tempdir;
-    use futures_util::Stream;
    use std::{collections::HashMap, io::Write};

    async fn read_and_check_metadata(
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -1040,7 +1040,7 @@ mod tests {
            Some("test/prefix/"),
            Some("/test/prefix/"),
        ];
-        let expected_outputs = vec![
+        let expected_outputs = [
            vec!["", "some/path", "some/path"],
            vec!["/", "/some/path", "/some/path"],
            vec![
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,7 +1,6 @@
 // For details about authentication see docs/authentication.md

 use arc_swap::ArcSwap;
-use serde;
 use std::{borrow::Cow, fmt::Display, fs, sync::Arc};

 use anyhow::Result;
@@ -32,6 +31,8 @@ pub enum Scope {
    // The scope used by pageservers in upcalls to storage controller and cloud control plane
    #[serde(rename = "generations_api")]
    GenerationsApi,
+    // Allows access to control plane management API and some storage controller endpoints.
+    Admin,
 }

 /// JWT payload. See docs/authentication.md for the format
@@ -204,12 +205,11 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        //   "scope": "tenant",
        //   "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
        //   "iss": "neon.controlplane",
-        //   "exp": 1709200879,
        //   "iat": 1678442479
        // }
        // ```
        //
-        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
+        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw";

        // Check it can be validated with the public key
        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -4,7 +4,9 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
 ///
 /// Can be cloned, moved and kept around in futures as "guard objects".
 #[derive(Clone)]
-pub struct Completion(TaskTrackerToken);
+pub struct Completion {
+    _token: TaskTrackerToken,
+}

 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
@@ -49,5 +51,5 @@ pub fn channel() -> (Completion, Barrier) {
    tracker.close();

    let token = tracker.token();
-    (Completion(token), Barrier(tracker))
+    (Completion { _token: token }, Barrier(tracker))
 }
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,7 +1,7 @@
 use std::{
    borrow::Cow,
    fs::{self, File},
-    io,
+    io::{self, Write},
 };

 use camino::{Utf8Path, Utf8PathBuf};
@@ -161,6 +161,48 @@ pub async fn durable_rename(
    Ok(())
 }

+/// Writes a file to the specified `final_path` in a crash safe fashion, using [`std::fs`].
+///
+/// The file is first written to the specified `tmp_path`, and in a second
+/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync
+/// and atomic rename guarantee that, if we crash at any point, there will never
+/// be a partially written file at `final_path` (but maybe at `tmp_path`).
+///
+/// Callers are responsible for serializing calls of this function for a given `final_path`.
+/// If they don't, there may be an error due to conflicting `tmp_path`, or there will
+/// be no error and the content of `final_path` will be the "winner" caller's `content`.
+/// I.e., the atomicity guarantees still hold.
+pub fn overwrite(
+    final_path: &Utf8Path,
+    tmp_path: &Utf8Path,
+    content: &[u8],
+) -> std::io::Result<()> {
+    let Some(final_path_parent) = final_path.parent() else {
+        return Err(std::io::Error::from_raw_os_error(
+            nix::errno::Errno::EINVAL as i32,
+        ));
+    };
+    std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
+    let mut file = std::fs::OpenOptions::new()
+        .write(true)
+        // Use `create_new` so that, if we race with ourselves or something else,
+        // we bail out instead of causing damage.
+        .create_new(true)
+        .open(tmp_path)?;
+    file.write_all(content)?;
+    file.sync_all()?;
+    drop(file); // don't keep the fd open for longer than we have to
+    // Only now, with the content written and synced, move it to its final name.
+    std::fs::rename(tmp_path, final_path)?;
+
+    let final_parent_dirfd = std::fs::OpenOptions::new()
+        .read(true)
+        .open(final_path_parent)?;
+    // Sync the parent directory as well (presumably so the rename itself is durable).
+    final_parent_dirfd.sync_all()?;
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {

--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -45,7 +45,7 @@ impl Generation {
        Self::Broken
    }

-    pub fn new(v: u32) -> Self {
+    pub const fn new(v: u32) -> Self {
        Self::Valid(v)
    }

--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
-use tracing::{self, debug, info, info_span, warn, Instrument};
+use tracing::{debug, info, info_span, warn, Instrument};

 use std::future::Future;
 use std::str::FromStr;
@@ -156,6 +156,10 @@ pub struct ChannelWriter {
    buffer: BytesMut,
    pub tx: mpsc::Sender<std::io::Result<Bytes>>,
    written: usize,
+    /// Time spent waiting for the channel to make progress. It is not the same as time to upload a
+    /// buffer because we cannot know anything about that, but this should allow us to understand
+    /// the actual time taken without the time spent `std::thread::park`ed.
+    wait_time: std::time::Duration,
 }

 impl ChannelWriter {
@@ -168,6 +172,7 @@ impl ChannelWriter {
            buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
            tx,
            written: 0,
+            wait_time: std::time::Duration::ZERO,
        }
    }

@@ -180,6 +185,8 @@ impl ChannelWriter {
        tracing::trace!(n, "flushing");
        let ready = self.buffer.split().freeze();

+        let wait_started_at = std::time::Instant::now();
+
        // not ideal to call from blocking code to block_on, but we are sure that this
        // operation does not spawn_blocking other tasks
        let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
@@ -192,6 +199,9 @@ impl ChannelWriter {
            // sending it to the client.
            Ok(())
        });
+
+        self.wait_time += wait_started_at.elapsed();
+
        if res.is_err() {
            return Err(std::io::ErrorKind::BrokenPipe.into());
        }
@@ -202,6 +212,10 @@ impl ChannelWriter {
    pub fn flushed_bytes(&self) -> usize {
        self.written
    }
+
+    pub fn wait_time(&self) -> std::time::Duration {
+        self.wait_time
+    }
 }

 impl std::io::Write for ChannelWriter {
@@ -252,22 +266,52 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body

    let span = info_span!("blocking");
    tokio::task::spawn_blocking(move || {
+        // there are situations where we lose scraped metrics under load; try to gather some clues.
+        // since all nodes are queried for this, keep the message count low.
+        let spawned_at = std::time::Instant::now();
+
        let _span = span.entered();
+
        let metrics = metrics::gather();
+
+        let gathered_at = std::time::Instant::now();
+
        let res = encoder
            .encode(&metrics, &mut writer)
            .and_then(|_| writer.flush().map_err(|e| e.into()));

+        // this instant is not when we finally got the full response sent, sending is done by hyper
+        // in another task.
+        let encoded_at = std::time::Instant::now();
+
+        let spawned_in = spawned_at - started_at;
+        let collected_in = gathered_at - spawned_at;
+        // remove the wait time here in case the tcp connection was clogged
+        let encoded_in = encoded_at - gathered_at - writer.wait_time();
+        let total = encoded_at - started_at;
+
        match res {
            Ok(()) => {
                tracing::info!(
                    bytes = writer.flushed_bytes(),
-                    elapsed_ms = started_at.elapsed().as_millis(),
+                    total_ms = total.as_millis(),
+                    spawning_ms = spawned_in.as_millis(),
+                    collection_ms = collected_in.as_millis(),
+                    encoding_ms = encoded_in.as_millis(),
                    "responded /metrics"
                );
            }
            Err(e) => {
-                tracing::warn!("failed to write out /metrics response: {e:#}");
+                // there is a chance that this error is not the BrokenPipe we generate in the writer
+                // for "closed connection", but it is highly unlikely.
+                tracing::warn!(
+                    after_bytes = writer.flushed_bytes(),
+                    total_ms = total.as_millis(),
+                    spawning_ms = spawned_in.as_millis(),
+                    collection_ms = collected_in.as_millis(),
+                    encoding_ms = encoded_in.as_millis(),
+                    "failed to write out /metrics response: {e:?}"
+                );
                // semantics of this error are quite... unclear. we want to error the stream out to
                // abort the response to somehow notify the client that we failed.
                //
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -415,7 +415,6 @@ mod tests {

    use super::*;

-    use serde::ser::Serialize;
    use serde_assert::{Deserializer, Serializer, Token, Tokens};

    #[test]
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -1,6 +1,6 @@
 #![warn(missing_docs)]

-use std::cmp::{Eq, Ordering, PartialOrd};
+use std::cmp::{Eq, Ordering};
 use std::collections::BinaryHeap;
 use std::fmt::Debug;
 use std::mem;
@@ -249,7 +249,6 @@ where
 mod tests {
    use super::*;
    use std::sync::Arc;
-    use std::time::Duration;

    impl MonotonicCounter<i32> for i32 {
        fn cnt_advance(&mut self, val: i32) {
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -221,7 +221,7 @@ impl RcuWaitList {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use std::sync::{Arc, Mutex};
+    use std::sync::Mutex;
    use std::time::Duration;

    #[tokio::test]
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -239,7 +239,6 @@ mod tests {
    use std::{
        convert::Infallible,
        pin::{pin, Pin},
-        sync::atomic::{AtomicUsize, Ordering},
        time::Duration,
    };

--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -73,6 +73,7 @@ url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
 pageserver_api.workspace = true
+pageserver_compaction.workspace = true
 postgres_connection.workspace = true
 postgres_ffi.workspace = true
 pq_proto.workspace = true
--- a/pageserver/compaction/Cargo.toml
+++ b/pageserver/compaction/Cargo.toml
@@ -0,0 +1,54 @@
+[package]
+name = "pageserver_compaction"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[features]
+default = []
+
+[dependencies]
+anyhow.workspace = true
+async-compression.workspace = true
+async-stream.workspace = true
+async-trait.workspace = true
+byteorder.workspace = true
+bytes.workspace = true
+chrono = { workspace = true, features = ["serde"] }
+clap = { workspace = true, features = ["string"] }
+const_format.workspace = true
+consumption_metrics.workspace = true
+crossbeam-utils.workspace = true
+either.workspace = true
+flate2.workspace = true
+fail.workspace = true
+futures.workspace = true
+git-version.workspace = true
+hex.workspace = true
+humantime.workspace = true
+humantime-serde.workspace = true
+itertools.workspace = true
+once_cell.workspace = true
+pageserver_api.workspace = true
+pin-project-lite.workspace = true
+rand.workspace = true
+smallvec = { workspace = true, features = ["write"] }
+svg_fmt.workspace = true
+sync_wrapper.workspace = true
+thiserror.workspace = true
+tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
+tokio-io-timeout.workspace = true
+tokio-util.workspace = true
+tracing.workspace = true
+tracing-error.workspace = true
+tracing-subscriber.workspace = true
+url.workspace = true
+walkdir.workspace = true
+metrics.workspace = true
+utils.workspace = true
+workspace_hack.workspace = true
+
+[dev-dependencies]
+criterion.workspace = true
+hex-literal.workspace = true
+tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
--- a/pageserver/compaction/TODO.md
+++ b/pageserver/compaction/TODO.md
@@ -0,0 +1,51 @@
+# TODO
+
+- If the key space can be perfectly partitioned at some key, perform planning on each
+  partition separately. For example, if we are compacting a level with layers like this:
+
+  ```
+              :
+  +--+ +----+ :  +------+
+  |  | |    | :  |      |
+  +--+ +----+ :  +------+
+              :
+  +-----+ +-+ : +--------+
+  |     | | | : |        |
+  +-----+ +-+ : +--------+
+              :
+  ```
+
+  At the dotted line, there is a natural split in the key space, such that all
+  layers are either on the left or the right of it. We can compact the
+  partitions separately.  We could choose to create image layers for one
+  partition but not the other one, for example.
+
+- All the layers don't have to be exactly the same size, we can choose to cut a
+  layer short or stretch it a little larger than the target size, if it helps
+  the overall system. We can help perfect partitions (see previous bullet point)
+  to happen more frequently, by choosing the cut points wisely. For example, try
+  to cut layers at boundaries of underlying image layers. And "snap to grid",
+  i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0.
+
+- Avoid rewriting layers when we'd just create an identical layer to an input
+  layer.
+
+- Parallelism. The code is already split up into planning and execution, so that
+  we first split up the compaction work into "Jobs", and then execute them.
+  It would be straightforward to execute multiple jobs in parallel.
+
+- Materialize extra pages in delta layers during compaction. This would reduce
+  read amplification. There has been the idea of partial image layers. Materializing
+  extra pages in the delta layers achieves the same goal, without introducing a new
+  concept.
+
+## Simulator
+
+- Expand the simulator for more workloads
+- Automate a test suite that runs the simulator with different workloads and
+  spits out a table of results
+- Model read amplification
+- More sanity checking. One idea is to keep a reference count of each
+  MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if
+  a MockRecord that is newer than PITR horizon is completely dropped. That would
+  indicate that the record was lost.
--- a/pageserver/compaction/src/bin/compaction-simulator.rs
+++ b/pageserver/compaction/src/bin/compaction-simulator.rs
@@ -0,0 +1,214 @@
+use clap::{Parser, Subcommand};
+use pageserver_compaction::simulator::MockTimeline;
+use rand::Rng;
+use std::io::Write;
+use std::path::{Path, PathBuf};
+use std::sync::OnceLock;
+
+use utils::project_git_version;
+
+project_git_version!(GIT_VERSION);
+
+// Top-level command-line arguments of the compaction simulator. The only field is the
+// subcommand, which selects what the binary does (see `main`).
+#[derive(Parser)]
+#[command(
+    version = GIT_VERSION,
+    about = "Neon Pageserver compaction simulator",
+    long_about = "A developer tool to visualize and test compaction"
+)]
+#[command(propagate_version = true)]
+struct CliOpts {
+    #[command(subcommand)]
+    command: Commands,
+}
+
+// Subcommands of the simulator binary; `main` dispatches on these.
+#[derive(Subcommand)]
+enum Commands {
+    // Run the built-in workload suite (handled by `run_suite`).
+    RunSuite,
+    // Run a single simulation with the given parameters (handled by `simulate`).
+    Simulate(SimulateCmd),
+}
+
+// Selects which key range each ingested record is drawn from in `simulate`.
+#[derive(Clone, clap::ValueEnum)]
+enum Distribution {
+    // Every record is ingested with `ingest_uniform` over the whole key range.
+    Uniform,
+    // With probability 0.9 a record is ingested into the first tenth of the key range
+    // ("hot"), otherwise into the remaining part ("cold").
+    HotCold,
+}
+
+/// Simulate ingesting a stream of records into a mock timeline, compacting as needed
+#[derive(Parser)]
+struct SimulateCmd {
+    /// Distribution of the ingested records over the key range
+    distribution: Distribution,
+
+    /// Number of records to ingest
+    num_records: u64,
+    /// Record length
+    record_len: u64,
+
+    /// Logical database size in MB
+    logical_size: u64,
+}
+
+/// Run one simulated workload and write its results under `results_path`.
+///
+/// `results_path` is treated as a directory. It is created if it does not exist yet, and
+/// `stats.txt` and `compaction-animation.html` are written into it.
+///
+/// Records are ingested one at a time, with `compact_if_needed` called after each one.
+/// After the last record the L0 is flushed and compaction is given one more chance to run
+/// before the statistics are collected.
+async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> {
+    const BYTES_PER_MIB: u64 = 1024 * 1024;
+
+    // Make sure the output directory exists up front. Otherwise a missing directory is only
+    // noticed when writing the results, i.e. after the whole simulation has already run.
+    std::fs::create_dir_all(results_path)?;
+
+    let mut executor = MockTimeline::new();
+
+    // Convert the logical size in MB into a key range.
+    let key_range = 0..((cmd.logical_size * BYTES_PER_MIB) / 8192);
+    //let key_range = u64::MIN..u64::MAX;
+    println!(
+        "starting simulation with key range {:016X}-{:016X}",
+        key_range.start, key_range.end
+    );
+
+    // helper function to print progress indicator
+    let print_progress = |i| -> anyhow::Result<()> {
+        // Only print on the first record, every 10000th record, and the last record.
+        if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 {
+            print!(
+                "\ringested {} / {} records, {} MiB / {} MiB...",
+                i + 1,
+                cmd.num_records,
+                (i + 1) * cmd.record_len / BYTES_PER_MIB,
+                cmd.num_records * cmd.record_len / BYTES_PER_MIB,
+            );
+            std::io::stdout().flush()?;
+        }
+        Ok(())
+    };
+
+    match cmd.distribution {
+        Distribution::Uniform => {
+            for i in 0..cmd.num_records {
+                executor.ingest_uniform(1, cmd.record_len, &key_range)?;
+                executor.compact_if_needed().await?;
+
+                print_progress(i)?;
+            }
+        }
+        Distribution::HotCold => {
+            // The first tenth of the key range is "hot" and receives 90% of the records.
+            let splitpoint = key_range.start + (key_range.end - key_range.start) / 10;
+            let hot_key_range = key_range.start..splitpoint;
+            let cold_key_range = splitpoint..key_range.end;
+
+            for i in 0..cmd.num_records {
+                let chosen_range = if rand::thread_rng().gen_bool(0.9) {
+                    &hot_key_range
+                } else {
+                    &cold_key_range
+                };
+                executor.ingest_uniform(1, cmd.record_len, chosen_range)?;
+                executor.compact_if_needed().await?;
+
+                print_progress(i)?;
+            }
+        }
+    }
+    println!("done!");
+    executor.flush_l0();
+    executor.compact_if_needed().await?;
+    let stats = executor.stats()?;
+
+    // Print the stats to stdout, and also to a file
+    print!("{stats}");
+    std::fs::write(results_path.join("stats.txt"), stats)?;
+
+    let animation_path = results_path.join("compaction-animation.html");
+    executor.draw_history(std::fs::File::create(&animation_path)?)?;
+    println!(
+        "animation: file://{}",
+        animation_path.canonicalize()?.display()
+    );
+
+    Ok(())
+}
+
+/// Run one workload of the suite in its own, freshly created results directory.
+///
+/// While the simulation runs, log output is redirected to a `log` file inside that
+/// directory; it is switched back to stdout afterwards, whether or not the simulation
+/// succeeded.
+async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> {
+    std::fs::create_dir(results_path)?;
+
+    let log_file = File::create(results_path.join("log"))?;
+    set_log_file(log_file);
+
+    let outcome = simulate(workload, results_path).await;
+
+    // Restore stdout logging before reporting the outcome to the caller.
+    set_log_stdout();
+    outcome
+}
+
+/// Run the built-in workload suite.
+///
+/// The results go into a new directory in the current working directory, named after the
+/// current Unix time in seconds. Each workload gets its own subdirectory.
+async fn run_suite() -> anyhow::Result<()> {
+    let unix_secs = std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs();
+    let top_results_path = PathBuf::from(format!("compaction-suite-results.{}", unix_secs));
+    std::fs::create_dir(&top_results_path)?;
+
+    let workload = SimulateCmd {
+        distribution: Distribution::Uniform,
+        // Generate 20 GB of WAL
+        record_len: 1_000,
+        num_records: 20_000_000,
+        // Logical size 5 GB
+        logical_size: 5_000,
+    };
+    let workload_results_path = top_results_path.join("uniform-20GB-5GB");
+
+    run_suite_cmd(&workload_results_path, &workload).await?;
+
+    println!(
+        "All tests finished. Results in {}",
+        top_results_path.display()
+    );
+    Ok(())
+}
+
+use std::fs::File;
+use std::io::Stdout;
+use std::sync::Mutex;
+use tracing_subscriber::fmt::writer::EitherWriter;
+use tracing_subscriber::fmt::MakeWriter;
+
+/// The current destination of log output: either a file (`A`) or stdout (`B`).
+type LogOutput = Mutex<EitherWriter<File, Stdout>>;
+
+/// Process-wide log destination. Initialized lazily, on first access, to stdout.
+static LOG_FILE: OnceLock<LogOutput> = OnceLock::new();
+
+/// Return the process-wide log destination, initializing it to stdout if needed.
+fn get_log_output() -> &'static LogOutput {
+    LOG_FILE.get_or_init(|| Mutex::new(EitherWriter::B(std::io::stdout())))
+}
+
+/// Redirect subsequent log output to the given file.
+fn set_log_file(f: File) {
+    let mut output = get_log_output().lock().unwrap();
+    *output = EitherWriter::A(f);
+}
+
+/// Redirect subsequent log output to stdout.
+fn set_log_stdout() {
+    let mut output = get_log_output().lock().unwrap();
+    *output = EitherWriter::B(std::io::stdout());
+}
+
+/// Install the global tracing subscriber.
+///
+/// Log lines are written to whatever `get_log_output()` currently points at, so the
+/// destination can be switched at runtime with `set_log_file` / `set_log_stdout`.
+fn init_logging() -> anyhow::Result<()> {
+    use tracing_subscriber::prelude::*;
+
+    // We fall back to printing all spans at info-level or above if
+    // the RUST_LOG environment variable is not set.
+    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
+        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"));
+
+    let log_layer = tracing_subscriber::fmt::layer()
+        .with_target(false)
+        .with_ansi(false)
+        .with_writer(|| get_log_output().make_writer())
+        .with_filter(env_filter);
+
+    // NB: the order of the with() calls does not matter.
+    // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
+    tracing_subscriber::registry().with(log_layer).init();
+
+    Ok(())
+}
+
+/// Entry point: parse the command line, set up logging, and run the chosen subcommand.
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let CliOpts { command } = CliOpts::parse();
+
+    init_logging()?;
+
+    match command {
+        Commands::RunSuite => run_suite().await,
+        Commands::Simulate(cmd) => {
+            let results_path = PathBuf::from("/tmp/compactions.html");
+            simulate(&cmd, &results_path).await
+        }
+    }
+}
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -0,0 +1,866 @@
+//! # Tiered compaction algorithm.
+//!
+//! Read all the input delta files, and write a new set of delta files that
+//! include all the input WAL records. See retile_deltas().
+//!
+//! In a "normal" LSM tree, you get to remove any values that are overwritten by
+//! later values, but in our system, we keep all the history. So the reshuffling
+//! doesn't remove any garbage, it just reshuffles the records to reduce read
+//! amplification, i.e. the number of files that you need to access to find the
+//! WAL records for a given key.
+//!
+//! If the new delta files would be very "narrow", i.e. each file would cover
+//! only a narrow key range, then we create a new set of image files
+//! instead. The current threshold is that if the estimated total size of the
+//! image layers is smaller than the size of the deltas, then we create image
+//! layers. That amounts to 2x storage amplification, and it means that the
+//! distance of image layers in LSN dimension is roughly equal to the logical
+//! database size. For example, if the logical database size is 10 GB, we would
+//! generate new image layers every 10 GB of WAL.
+use futures::StreamExt;
+use tracing::{debug, info};
+
+use std::collections::{HashSet, VecDeque};
+use std::ops::Range;
+
+use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
+use crate::interface::*;
+use utils::lsn::Lsn;
+
+use crate::identify_levels::identify_level;
+
+/// Main entry point to compaction.
+///
+/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on
+/// everything below that point, that needs compaction. The cutoff LSN must
+/// partition the layers so that there are no layers that span across that
+/// LSN. To start compaction at the top of the tree, pass the end LSN of the
+/// last written L0 layer.
+///
+/// Levels are processed one at a time, starting at L0. For each level, the
+/// target height (the LSN range a file in that level is expected to span) is
+/// `fanout` times that of the previous level. The loop stops when no further
+/// level can be identified, when a level has fewer than `fanout` tiers, or
+/// when the target height cannot grow any further.
+///
+/// Panics if `fanout` is less than 2.
+pub async fn compact_tiered<E: CompactionJobExecutor>(
+    executor: &mut E,
+    end_lsn: Lsn,
+    target_file_size: u64,
+    fanout: u64,
+    ctx: &E::RequestContext,
+) -> anyhow::Result<()> {
+    assert!(fanout >= 2);
+    // Start at L0
+    let mut current_level_no = 0;
+    let mut current_level_target_height = target_file_size;
+    loop {
+        // end LSN +1 to include possible image layers exactly at 'end_lsn'.
+        let all_layers = executor
+            .get_layers(
+                &(E::Key::MIN..E::Key::MAX),
+                &(Lsn(u64::MIN)..end_lsn + 1),
+                ctx,
+            )
+            .await?;
+        info!(
+            "Compacting L{}, total # of layers: {}",
+            current_level_no,
+            all_layers.len()
+        );
+
+        // Identify the range of LSNs that belong to this level. We assume that
+        // each file in this level span an LSN range up to 1.75x target file
+        // size. That should give us enough slop that if we created a slightly
+        // oversized L0 layer, e.g. because flushing the in-memory layer was
+        // delayed for some reason, we don't consider the oversized layer to
+        // belong to L1. But not too much slop, that we don't accidentally
+        // "skip" levels.
+        let max_height = (current_level_target_height as f64 * 1.75) as u64;
+        let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else {
+            break;
+        };
+
+        // Calculate the height of this level. If the # of tiers exceeds the
+        // fanout parameter, it's time to compact it.
+        let depth = level.depth();
+        info!(
+            "Level {} identified as LSN range {}-{}: depth {}",
+            current_level_no, level.lsn_range.start, level.lsn_range.end, depth
+        );
+        for l in &level.layers {
+            debug!("LEVEL {} layer: {}", current_level_no, l.short_id());
+        }
+        if depth < fanout {
+            debug!(
+                level = current_level_no,
+                depth = depth,
+                fanout,
+                "too few deltas to compact"
+            );
+            break;
+        }
+
+        // NB: the flag returned by compact_level is currently not consulted here.
+        compact_level(
+            &level.lsn_range,
+            &level.layers,
+            executor,
+            target_file_size,
+            ctx,
+        )
+        .await?;
+        // The target height is multiplied with saturation below. Once it has
+        // reached u64::MAX it cannot grow for a next level, so stop here
+        // instead of going around again with the same height.
+        if current_level_target_height == u64::MAX {
+            break;
+        }
+        current_level_no += 1;
+        current_level_target_height = current_level_target_height.saturating_mul(fanout);
+    }
+    Ok(())
+}
+
+/// Compact a single level, given its LSN range and the layers that belong to it.
+///
+/// Sets up the per-level state with one initial `Divide` job covering the whole key
+/// space and all of the level's layers, then runs the job loop until it is drained.
+///
+/// Returns the state's `next_level` flag once all jobs have been executed.
+async fn compact_level<E: CompactionJobExecutor>(
+    lsn_range: &Range<Lsn>,
+    layers: &[E::Layer],
+    executor: &mut E,
+    target_file_size: u64,
+    ctx: &E::RequestContext,
+) -> anyhow::Result<bool> {
+    let layer_fragments: Vec<LayerFragment<E>> = layers
+        .iter()
+        .map(|l| LayerFragment::new(l.clone()))
+        .collect();
+
+    // The initial job takes every layer of the level as input.
+    let root_job: CompactionJob<E> = CompactionJob {
+        key_range: E::Key::MIN..E::Key::MAX,
+        lsn_range: lsn_range.clone(),
+        strategy: CompactionStrategy::Divide,
+        input_layers: (0..layer_fragments.len()).map(LayerId).collect(),
+        completed: false,
+    };
+
+    let mut state = LevelCompactionState {
+        target_file_size,
+        _lsn_range: lsn_range.clone(),
+        layers: layer_fragments,
+        jobs: vec![root_job],
+        job_queue: vec![JobId(0)],
+        next_level: false,
+        executor,
+    };
+    state.execute(ctx).await?;
+
+    let next_level = state.next_level;
+    info!(
+        "compaction completed! Need to process next level: {}",
+        next_level
+    );
+
+    Ok(next_level)
+}
+
+/// Blackboard that keeps track of the state of all the jobs and work remaining
+///
+/// One instance is created per level by `compact_level` and lives until that
+/// level's job queue has been drained.
+struct LevelCompactionState<'a, E>
+where
+    E: CompactionJobExecutor,
+{
+    // parameters
+    /// Target size for the files written by this compaction.
+    target_file_size: u64,
+
+    /// LSN range of the level being compacted (currently unused, hence the underscore).
+    _lsn_range: Range<Lsn>,
+    /// All layers of the level; a `LayerId` is an index into this vector.
+    layers: Vec<LayerFragment<E>>,
+
+    // job queue
+    /// All jobs ever created for this level; a `JobId` is an index into this vector.
+    jobs: Vec<CompactionJob<E>>,
+    /// Ids of the jobs that still need to be executed.
+    job_queue: Vec<JobId>,
+
+    /// If false, no need to compact levels below this
+    next_level: bool,
+
+    /// Interface to the outside world
+    executor: &'a mut E,
+}
+
+/// Index of a layer in `LevelCompactionState::layers`.
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
+struct LayerId(usize);
+/// Index of a job in `LevelCompactionState::jobs`.
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
+struct JobId(usize);
+
+/// A set of jobs split into those still outstanding and those already finished.
+struct PendingJobSet {
+    /// Jobs that have not been reported as complete yet.
+    pending: HashSet<JobId>,
+    /// Jobs that have been reported as complete via `complete_job`.
+    completed: HashSet<JobId>,
+}
+
+impl PendingJobSet {
+    /// Create a set with no pending and no completed jobs.
+    fn new() -> Self {
+        Self {
+            pending: HashSet::default(),
+            completed: HashSet::default(),
+        }
+    }
+
+    /// Record `job_id` as completed, removing it from the pending jobs if it was there.
+    fn complete_job(&mut self, job_id: JobId) {
+        self.completed.insert(job_id);
+        self.pending.remove(&job_id);
+    }
+
+    /// Returns true when no pending jobs remain.
+    fn all_completed(&self) -> bool {
+        let outstanding = &self.pending;
+        outstanding.is_empty()
+    }
+}
+
+/// Bookkeeping for one input layer of the level being compacted.
+///
+/// When we decide to rewrite a set of layers, this keeps track of which jobs
+/// produce the new layers that supersede the old one. Once all of those
+/// stakeholder jobs have completed, the old layer can be deleted.
+struct LayerFragment<E>
+where
+    E: CompactionJobExecutor,
+{
+    /// The layer itself.
+    layer: E::Layer,
+
+    /// The jobs that need to complete before this layer can be deleted, if we
+    /// will write new layers to replace it. As the jobs complete, they are
+    /// moved from the 'pending' to the 'completed' set; once the 'pending' set
+    /// becomes empty, the layer can be deleted.
+    ///
+    /// If None, this layer is not rewritten and must not be deleted.
+    deletable_after: Option<PendingJobSet>,
+
+    /// Set once the layer has been deleted through the executor.
+    deleted: bool,
+}
+
+impl<E> LayerFragment<E>
+where
+    E: CompactionJobExecutor,
+{
+    /// Wrap `layer` as a fragment that is not scheduled for rewriting and not deleted.
+    fn new(layer: E::Layer) -> Self {
+        Self {
+            deleted: false,
+            deletable_after: None,
+            layer,
+        }
+    }
+}
+
+/// What a `CompactionJob` does when it is executed (see `execute_job`).
+#[derive(PartialEq)]
+enum CompactionStrategy {
+    /// Decide how to compact the job's range; handled by `divide_job`, which may push
+    /// further jobs.
+    Divide,
+    /// Write new delta layers from the job's input delta layers via the executor.
+    CreateDelta,
+    /// Write an image layer at the end of the job's LSN range via the executor.
+    CreateImage,
+}
+
+/// One unit of compaction work over a key range and LSN range.
+#[allow(dead_code)] // Todo
+struct CompactionJob<E: CompactionJobExecutor> {
+    /// Key range this job covers.
+    key_range: Range<E::Key>,
+    /// LSN range this job covers.
+    lsn_range: Range<Lsn>,
+
+    /// What to do when the job is executed.
+    strategy: CompactionStrategy,
+
+    /// Layers this job reads from, as indexes into `LevelCompactionState::layers`.
+    input_layers: Vec<LayerId>,
+
+    /// Set to true by `execute_job` once a CreateDelta or CreateImage job has finished.
+    completed: bool,
+}
+
+impl<'a, E> LevelCompactionState<'a, E>
+where
+    E: CompactionJobExecutor,
+{
+    /// Main loop of the executor.
+    ///
+    /// Repeatedly takes a job off the end of the job queue and executes it, until
+    /// the queue is empty. Executing a job may push new jobs onto the queue; those
+    /// are picked up by later iterations. The first error aborts the loop and is
+    /// returned to the caller.
+    ///
+    /// Initially, the job queue consists of one Divide job over the whole
+    /// level. On first call, it is divided into smaller jobs.
+    async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> {
+        // TODO: this would be pretty straightforward to parallelize with FuturesUnordered
+        loop {
+            let Some(job_id) = self.job_queue.pop() else {
+                // all done!
+                return Ok(());
+            };
+            info!("executing job {}", job_id.0);
+            self.execute_job(job_id, ctx).await?;
+        }
+    }
+
+    /// Execute a single job, according to its strategy.
+    ///
+    /// - `Divide` jobs are handed to `divide_job`.
+    /// - `CreateDelta` jobs write new delta layers from the job's input delta layers, then
+    ///   delete every input layer whose pending jobs have all completed, and set
+    ///   `next_level`.
+    /// - `CreateImage` jobs write an image layer at the end of the job's LSN range.
+    ///
+    /// CreateDelta and CreateImage jobs are marked `completed` once the executor call
+    /// has returned successfully.
+    async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
+        let job = &self.jobs[job_id.0];
+        match job.strategy {
+            CompactionStrategy::Divide => {
+                self.divide_job(job_id, ctx).await?;
+                Ok(())
+            }
+            CompactionStrategy::CreateDelta => {
+                // Collect the input layers that are delta layers; inputs for which the
+                // downcast yields None are skipped.
+                let mut deltas: Vec<E::DeltaLayer> = Vec::new();
+                let mut layer_ids: Vec<LayerId> = Vec::new();
+                for layer_id in &job.input_layers {
+                    let layer = &self.layers[layer_id.0].layer;
+                    if let Some(dl) = self.executor.downcast_delta_layer(layer).await? {
+                        deltas.push(dl.clone());
+                        layer_ids.push(*layer_id);
+                    }
+                }
+
+                self.executor
+                    .create_delta(&job.lsn_range, &job.key_range, &deltas, ctx)
+                    .await?;
+                self.jobs[job_id.0].completed = true;
+
+                // did we complete any fragments?
+                for layer_id in layer_ids {
+                    let l = &mut self.layers[layer_id.0];
+                    // Layers without a `deletable_after` set are not being rewritten and
+                    // are left alone.
+                    if let Some(deletable_after) = l.deletable_after.as_mut() {
+                        deletable_after.complete_job(job_id);
+                        if deletable_after.all_completed() {
+                            self.executor.delete_layer(&l.layer, ctx).await?;
+                            l.deleted = true;
+                        }
+                    }
+                }
+
+                self.next_level = true;
+
+                Ok(())
+            }
+            CompactionStrategy::CreateImage => {
+                self.executor
+                    .create_image(job.lsn_range.end, &job.key_range, ctx)
+                    .await?;
+                self.jobs[job_id.0].completed = true;
+
+                // TODO: we could check if any layers < PITR horizon became deletable
+                Ok(())
+            }
+        }
+    }
+
+    /// Register a new job and schedule it for execution.
+    ///
+    /// The returned id is the job's index in `self.jobs`.
+    fn push_job(&mut self, job: CompactionJob<E>) -> JobId {
+        let new_id = JobId(self.jobs.len());
+        self.job_queue.push(new_id);
+        self.jobs.push(job);
+        new_id
+    }
+
+    /// Take a partition of the key space, and decide how to compact it.
+    ///
+    /// The decision compares the size of the keyspace covered by the job (number
+    /// of keys times 8192 bytes) with the total file size of the job's input
+    /// delta layers. If the keyspace is smaller, the range is covered with new
+    /// image layers (`cover_with_images`); otherwise the deltas are rewritten
+    /// (`retile_deltas`). A job without input layers is a no-op.
+    ///
+    /// Panics if the job's strategy is not `Divide`.
+    ///
+    /// TODO: Currently, this is called exactly once for the level, and we
+    /// decide whether to create new image layers to cover the whole level, or
+    /// write a new set of delta. In the future, this should try to partition
+    /// the key space, and make the decision separately for each partition.
+    async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
+        let job = &self.jobs[job_id.0];
+        assert!(job.strategy == CompactionStrategy::Divide);
+
+        // Check for dummy cases
+        if job.input_layers.is_empty() {
+            return Ok(());
+        }
+
+        // Would it be better to create images for this partition?
+        // Decide based on the average density of the level
+        let keyspace_size = keyspace_total_size(
+            &self
+                .executor
+                .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
+                .await?,
+        ) * 8192;
+
+        // Only delta layers count towards the amount of WAL in this job.
+        let wal_size = job
+            .input_layers
+            .iter()
+            .filter(|layer_id| self.layers[layer_id.0].layer.is_delta())
+            .map(|layer_id| self.layers[layer_id.0].layer.file_size())
+            .sum::<u64>();
+        if keyspace_size < wal_size {
+            // seems worth it
+            info!(
+                "covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}",
+                keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size
+            );
+            self.cover_with_images(job_id, ctx).await
+        } else {
+            // do deltas
+            info!(
+                "coverage not worth it, keyspace_size {}, wal_size {}",
+                keyspace_size, wal_size
+            );
+            self.retile_deltas(job_id, ctx).await
+        }
+    }
+
+    // Replace a Divide job with a series of CreateImage jobs that together
+    // cover the keyspace at the end of the job's LSN range:
+    //
+    // LSN
+    //  ^
+    //  |
+    //  |                          ###|###|#####
+    //  | +--+-----+--+            +--+-----+--+
+    //  | |  |     |  |            |  |     |  |
+    //  | +--+--+--+--+            +--+--+--+--+
+    //  | |     |     |            |     |     |
+    //  | +---+-+-+---+     ==>    +---+-+-+---+
+    //  | |   |   |   |            |   |   |   |
+    //  | +---+-+-++--+            +---+-+-++--+
+    //  | |     |  |  |            |     |  |  |
+    //  | +-----+--+--+            +-----+--+--+
+    //  |
+    //  +--------------> key
+    //
+    // The key ranges of the image jobs are chosen by sliding a KeyspaceWindow
+    // over the keyspace. Panics if the job's strategy is not Divide.
+    async fn cover_with_images(
+        &mut self,
+        job_id: JobId,
+        ctx: &E::RequestContext,
+    ) -> anyhow::Result<()> {
+        let job = &self.jobs[job_id.0];
+        assert!(job.strategy == CompactionStrategy::Divide);
+
+        // XXX: do we still need the "holes" stuff?
+
+        // Slide a window through the keyspace
+        let keyspace = self
+            .executor
+            .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
+            .await?;
+        let target_keys_per_image = self.target_file_size / 8192;
+        let mut window =
+            KeyspaceWindow::new(E::Key::MIN..E::Key::MAX, keyspace, target_keys_per_image);
+
+        // Gather all the image jobs first, in the order the window yields them.
+        let mut image_jobs: Vec<CompactionJob<E>> =
+            std::iter::from_fn(|| window.choose_next_image())
+                .map(|key_range| CompactionJob::<E> {
+                    key_range,
+                    lsn_range: job.lsn_range.clone(),
+                    strategy: CompactionStrategy::CreateImage,
+                    input_layers: Vec::new(), // XXX: Is it OK for  this to be empty for image layer?
+                    completed: false,
+                })
+                .collect();
+
+        // Push the jobs in reverse order, i.e. the last one gathered goes first.
+        while let Some(image_job) = image_jobs.pop() {
+            let _job_id = self.push_job(image_job);
+
+            // TODO: image layers don't let us delete anything. unless < PITR horizon
+            //let j = &self.jobs[job_id.0];
+            // for layer_id in j.input_layers.iter() {
+            //    self.layers[layer_id.0].pending_stakeholders.insert(job_id);
+            //}
+        }
+
+        Ok(())
+    }
+
+    // Merge the contents of all the input delta layers into a new set
+    // of delta layers, based on the current partitioning.
+    //
+    // We split the new delta layers on the key dimension. We iterate through
+    // the key space, and for each key, check if including the next key to the
+    // current output layer we're building would cause the layer to become too
+    // large. If so, dump the current output layer and start new one.  It's
+    // possible that there is a single key with so many page versions that
+    // storing all of them in a single layer file would be too large. In that
+    // case, we also split on the LSN dimension.
+    //
+    // LSN
+    //  ^
+    //  |
+    //  | +-----------+            +--+--+--+--+
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            |  |  |  |  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+     ==>    |  |  |  |  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            |  |  |  |  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            +--+--+--+--+
+    //  |
+    //  +--------------> key
+    //
+    //
+    // If one key (X) has a lot of page versions:
+    //
+    // LSN
+    //  ^
+    //  |                                 (X)
+    //  | +-----------+            +--+--+--+--+
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            |  |  +--+  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+     ==>    |  |  |  |  |
+    //  | |           |            |  |  +--+  |
+    //  | +-----------+            |  |  |  |  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            +--+--+--+--+
+    //  |
+    //  +--------------> key
+    //
+    // TODO: this actually divides the layers into fixed-size chunks, not
+    // based on the partitioning.
+    //
+    // TODO: we should also opportunistically materialize and
+    // garbage collect what we can.
+    async fn retile_deltas(
+        &mut self,
+        job_id: JobId,
+        ctx: &E::RequestContext,
+    ) -> anyhow::Result<()> {
+        let job = &self.jobs[job_id.0];
+        assert!(job.strategy == CompactionStrategy::Divide);
+
+        // Sweep the key space left to right, running an estimate of how much
+        // disk size and keyspace we have accumulated
+        //
+        // Once the disk size reaches the target threshold, stop and think.
+        // If we have accumulated only a narrow band of keyspace, create an
+        // image layer. Otherwise write a delta layer.
+        //
+        // NOTE: as written, this function only ever schedules `CreateDelta`
+        // jobs; the image-layer alternative described above is not
+        // implemented here.
+
+        // FIXME: deal with the case of lots of values for same key
+
+        // FIXME: we are ignoring images here. Did we already divide the work
+        // so that we won't encounter them here?
+
+        // Collect the delta layers among the job's inputs. Inputs for which
+        // `downcast_delta_layer` returns `None` are silently skipped.
+        let mut deltas: Vec<E::DeltaLayer> = Vec::new();
+        for layer_id in &job.input_layers {
+            let l = &self.layers[layer_id.0];
+            if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? {
+                deltas.push(dl.clone());
+            }
+        }
+        // Open stream
+        let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
+        let mut new_jobs = Vec::new();
+
+        // Slide a window through the keyspace
+        //
+        // `key_accum` yields one (key, total size) item per distinct key.
+        // `all_in_window` becomes true once that stream is exhausted, i.e.
+        // everything that remains to be split off is already in `window`.
+        let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
+        let mut all_in_window: bool = false;
+        let mut window = Window::new();
+        loop {
+            if all_in_window && window.elems.is_empty() {
+                // All done!
+                break;
+            }
+            if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
+            {
+                // The window picked a key range for the next output layer.
+                // Its job gets every input layer whose key range overlaps it.
+                let batch_layers: Vec<LayerId> = job
+                    .input_layers
+                    .iter()
+                    .filter(|layer_id| {
+                        overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
+                    })
+                    .cloned()
+                    .collect();
+                assert!(!batch_layers.is_empty());
+                new_jobs.push(CompactionJob {
+                    key_range,
+                    lsn_range: job.lsn_range.clone(),
+                    strategy: CompactionStrategy::CreateDelta,
+                    input_layers: batch_layers,
+                    completed: false,
+                });
+            } else {
+                // The window needs more input before it can decide. That can
+                // only happen while the stream still has keys to offer.
+                assert!(!all_in_window);
+                if let Some(next_key) = key_accum.next().await.transpose()? {
+                    window.feed(next_key.key, next_key.size);
+                } else {
+                    all_in_window = true;
+                }
+            }
+        }
+
+        // All the input files are rewritten. Set up the tracking for when they can
+        // be deleted.
+        for layer_id in job.input_layers.iter() {
+            let l = &mut self.layers[layer_id.0];
+            assert!(l.deletable_after.is_none());
+            l.deletable_after = Some(PendingJobSet::new());
+        }
+        // Register the new jobs, and record on each input layer which jobs
+        // still read from it. The jobs are pushed in reverse order of
+        // creation (NOTE(review): presumably so that they are later picked up
+        // in key order -- confirm against `push_job` and the job scheduler).
+        for j in new_jobs.into_iter().rev() {
+            let job_id = self.push_job(j);
+            let j = &self.jobs[job_id.0];
+            for layer_id in j.input_layers.iter() {
+                self.layers[layer_id.0]
+                    .deletable_after
+                    .as_mut()
+                    .unwrap()
+                    .pending
+                    .insert(job_id);
+            }
+        }
+
+        Ok(())
+    }
+}
+
+// Sliding window through keyspace and values
+//
+// This is used to decide on good split points for image layers, see
+// `choose_next_image`. (The original comment named the user as
+// `over_with_images`; NOTE(review): presumably `cover_with_images` -- confirm.)
+struct KeyspaceWindow<K> {
+    // Immutable description of what is being walked: the overall key range,
+    // the keyspace within it, and the target size of each piece.
+    head: KeyspaceWindowHead<K>,
+
+    // Cursor marking where the next piece handed out by `choose_next_image`
+    // starts.
+    start_pos: KeyspaceWindowPos<K>,
+}
+// The fixed part of a `KeyspaceWindow`: it is only read, never modified,
+// while cursors (`KeyspaceWindowPos`) advance over it.
+struct KeyspaceWindowHead<K> {
+    // overall key range to cover
+    key_range: Range<K>,
+
+    // The key ranges that are actually in use, walked in order by index.
+    keyspace: Vec<Range<K>>,
+    // Target amount of keyspace (as measured by `K::key_range_size`) that
+    // each piece returned by `choose_next_image` should cover.
+    target_keysize: u64,
+}
+
+// A cursor into the keyspace of a `KeyspaceWindowHead`.
+#[derive(Clone)]
+struct KeyspaceWindowPos<K> {
+    // Key the cursor currently sits at. Used as the exclusive end of the
+    // piece before the cursor and as the start of the piece after it.
+    end_key: K,
+
+    // Index into `KeyspaceWindowHead::keyspace` of the range the cursor is in.
+    // Equal to `keyspace.len()` once the cursor has passed the last range.
+    keyspace_idx: usize,
+
+    // Total keyspace size the cursor has moved over since the very start.
+    accum_keysize: u64,
+}
+impl<K: CompactionKey> KeyspaceWindowPos<K> {
+    // True once the cursor has moved past the last range of `w.keyspace`.
+    fn reached_end(&self, w: &KeyspaceWindowHead<K>) -> bool {
+        w.keyspace.len() == self.keyspace_idx
+    }
+
+    // Move the cursor forward until the accumulated key size reaches
+    // `max_size`, or until the end of the keyspace, whichever comes first.
+    //
+    // Note that `max_size` is an absolute value for `accum_keysize`, not an
+    // increment relative to the current position.
+    fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
+        loop {
+            if self.accum_keysize >= max_size || self.reached_end(w) {
+                return;
+            }
+            let range = &w.keyspace[self.keyspace_idx];
+
+            // Jump over any unused gap in front of this range.
+            if self.end_key < range.start {
+                self.end_key = range.start;
+            }
+
+            // First choice: swallow the remainder of this range in one go.
+            let rest_of_range = K::key_range_size(&(self.end_key..range.end)) as u64;
+            if self.accum_keysize + rest_of_range < max_size {
+                self.accum_keysize += rest_of_range;
+                self.end_key = range.end;
+                self.keyspace_idx += 1;
+                continue;
+            }
+
+            // Otherwise move forward inside the range: try the stride offered
+            // by `skip_some()` first, and fall back to a single key.
+            let candidate = self.end_key.skip_some();
+            let stride = K::key_range_size(&(self.end_key..candidate)) as u64;
+            if self.accum_keysize + stride < max_size {
+                self.accum_keysize += stride;
+                self.end_key = candidate;
+            } else {
+                self.accum_keysize += 1;
+                self.end_key = self.end_key.next();
+            }
+        }
+    }
+}
+
+impl<K> KeyspaceWindow<K>
+where
+    K: CompactionKey,
+{
+    // Create a window over `keyspace`, which must lie within `key_range`.
+    // `target_keysize` is the amount of keyspace each piece returned by
+    // `choose_next_image` should cover.
+    //
+    // An empty `keyspace` is accepted: `choose_next_image` then returns
+    // `None` on the first call.
+    fn new(key_range: Range<K>, keyspace: CompactionKeySpace<K>, target_keysize: u64) -> Self {
+        // The keyspace must not begin before the range we are asked to cover.
+        // Only check when there is something to check; unwrapping here would
+        // panic on an empty keyspace, which the rest of the code handles fine.
+        if let Some(first) = keyspace.first() {
+            assert!(first.start >= key_range.start);
+        }
+
+        let start_key = key_range.start;
+        let start_pos = KeyspaceWindowPos::<K> {
+            end_key: start_key,
+            keyspace_idx: 0,
+            accum_keysize: 0,
+        };
+        Self {
+            head: KeyspaceWindowHead::<K> {
+                key_range,
+                keyspace,
+                target_keysize,
+            },
+            start_pos,
+        }
+    }
+
+    // Return the key range of the next piece, or `None` once the whole
+    // keyspace has been handed out.
+    //
+    // Each piece covers about `target_keysize` of keyspace. The last piece is
+    // allowed to grow to 1.25x that, and is extended to the end of the
+    // overall key range.
+    fn choose_next_image(&mut self) -> Option<Range<K>> {
+        if self.start_pos.reached_end(&self.head) {
+            // we've reached the end
+            return None;
+        }
+
+        let mut next_pos = self.start_pos.clone();
+        next_pos.advance_until_size(
+            &self.head,
+            self.start_pos.accum_keysize + self.head.target_keysize,
+        );
+
+        // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
+        // 1.25x target size
+        let mut end_pos = next_pos.clone();
+        end_pos.advance_until_size(
+            &self.head,
+            self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
+        );
+        if end_pos.reached_end(&self.head) {
+            // gobble up any unused keyspace between the last used key and end of the range
+            assert!(end_pos.end_key <= self.head.key_range.end);
+            end_pos.end_key = self.head.key_range.end;
+            next_pos = end_pos;
+        }
+
+        // The piece runs from the old cursor to the new one; the new cursor
+        // becomes the start of the following piece.
+        let start_key = self.start_pos.end_key;
+        self.start_pos = next_pos;
+        Some(start_key..self.start_pos.end_key)
+    }
+}
+
+// Sliding window through keyspace and values
+//
+// This is used to decide what layer to write next, from the beginning of the window.
+//
+// Candidates:
+//
+// 1. Create an image layer, snapping to previous images
+// 2. Create a delta layer, snapping to previous images
+// 3. Create an image layer, snapping to
+//
+//
+
+// Take previous partitioning, based on the image layers below.
+//
+// Candidate is at the front:
+//
+// Consider stretching an image layer to next divider? If it's close enough,
+// that's the image candidate
+//
+// If it's too far, consider splitting at a reasonable point
+//
+// Is the image candidate smaller than the equivalent delta? If so,
+// split off the image. Otherwise, split off one delta.
+// Try to snap off the delta at a reasonable point
+
+// One entry in a `Window`: a run of consecutive keys, together with the
+// running total of value sizes up to and including the end of the run.
+struct WindowElement<K> {
+    start_key: K, // inclusive
+    last_key: K,  // inclusive
+    // Cumulative size of everything fed into the window so far, up to and
+    // including `last_key`. This is a running total, not the size of this
+    // element alone; subtract `Window::splitoff_size` to get the size counted
+    // from the current front of the window.
+    accum_size: u64,
+}
+// Sliding window over (key, size) pairs, used by `choose_next_delta` to pick
+// the key range of the next delta layer.
+struct Window<K> {
+    // Elements in key order. Data is fed at the back and split off from the
+    // front.
+    elems: VecDeque<WindowElement<K>>,
+
+    // last key that was split off, inclusive
+    splitoff_key: Option<K>,
+    // `accum_size` of the last element that was split off, i.e. the running
+    // total that has already left the window.
+    splitoff_size: u64,
+}
+
+impl<K> Window<K>
+where
+    K: CompactionKey,
+{
+    // Create an empty window with nothing split off yet.
+    fn new() -> Self {
+        Self {
+            elems: VecDeque::new(),
+            splitoff_key: None,
+            splitoff_size: 0,
+        }
+    }
+
+    // Add `size` bytes worth of values for `key` at the back of the window.
+    // Keys must be fed in non-decreasing order. Feeding the same key again
+    // adds to the last element instead of creating a new one.
+    fn feed(&mut self, key: K, size: u64) {
+        let last_size;
+        if let Some(last) = self.elems.back_mut() {
+            assert!(last.last_key <= key);
+            if key == last.last_key {
+                last.accum_size += size;
+                return;
+            }
+            last_size = last.accum_size;
+        } else {
+            // The window is empty. Continue the running total from what has
+            // been split off so far (0 for a fresh window). Restarting from 0
+            // here would make `accum_size - splitoff_size` underflow in
+            // `peek_size`/`remain_size` if the window is refilled after
+            // having been drained.
+            last_size = self.splitoff_size;
+        }
+        // This is a new key.
+        let elem = WindowElement {
+            start_key: key,
+            last_key: key,
+            accum_size: last_size + size,
+        };
+        self.elems.push_back(elem);
+    }
+
+    // Total size of everything currently in the window.
+    // Panics if the window is empty.
+    fn remain_size(&self) -> u64 {
+        self.elems.back().unwrap().accum_size - self.splitoff_size
+    }
+
+    // Size of the front element, i.e. of the current candidate.
+    // Panics if the window is empty.
+    fn peek_size(&self) -> u64 {
+        self.elems.front().unwrap().accum_size - self.splitoff_size
+    }
+
+    // Merge the first `upto` elements into a single element at the front.
+    // `upto` of 0 or 1 is a no-op. Panics if `upto` exceeds the number of
+    // elements.
+    fn commit_upto(&mut self, mut upto: usize) {
+        while upto > 1 {
+            let popped = self.elems.pop_front().unwrap();
+            // The next element takes over the popped one's start key, so it
+            // now covers both. Its accum_size already includes the popped
+            // element's size, because accum_size is a running total.
+            self.elems.front_mut().unwrap().start_key = popped.start_key;
+            upto -= 1;
+        }
+    }
+
+    // Index of the first element at which the size counted from the front of
+    // the window reaches `target_size`; `elems.len()` if none does.
+    fn find_size_split(&self, target_size: u64) -> usize {
+        self.elems
+            .partition_point(|elem| elem.accum_size - self.splitoff_size < target_size)
+    }
+
+    // Remove the front element and remember it as split off.
+    fn pop(&mut self) {
+        let first = self.elems.pop_front().unwrap();
+        self.splitoff_size = first.accum_size;
+
+        self.splitoff_key = Some(first.last_key);
+    }
+
+    // the difference between delta and image is that an image covers
+    // any unused keyspace before and after, while a delta tries to
+    // minimize that. TODO: difference not implemented
+    //
+    // Splits off the front element and returns its key range, with an
+    // exclusive end.
+    fn pop_delta(&mut self) -> Range<K> {
+        let first = self.elems.front().unwrap();
+        let key_range = first.start_key..first.last_key.next();
+
+        self.pop();
+        key_range
+    }
+
+    // Choose the key range of the next delta layer and split it off the
+    // window. `has_more` tells whether the caller still has input that has
+    // not been fed yet.
+    //
+    // Prerequisite: we have enough input in the window. If `has_more` is
+    // false, the window must not be empty.
+    //
+    // On return None, the caller should feed more data and call again
+    fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option<Range<K>> {
+        if has_more && self.elems.is_empty() {
+            // Starting up
+            return None;
+        }
+
+        // If we still have an undersized candidate, just keep going
+        while self.peek_size() < target_size {
+            if self.elems.len() > 1 {
+                self.commit_upto(2);
+            } else if has_more {
+                return None;
+            } else {
+                break;
+            }
+        }
+
+        // Ensure we have enough input in the window to make a good decision
+        if has_more && self.remain_size() < target_size * 5 / 4 {
+            return None;
+        }
+
+        // The candidate on the front is now large enough, for a delta.
+        // And we have enough data in the window to decide.
+
+        // If all input has been seen and what remains is less than 5/3 of the
+        // target size, gobble up the rest of the work in one layer. This
+        // avoids creating very small "tail" layers at the end of the keyspace.
+        // (An earlier version of this comment said 1.25x; the code uses 5/3.)
+        if !has_more && self.remain_size() < target_size * 5 / 3 {
+            self.commit_upto(self.elems.len());
+        } else {
+            let delta_split_at = self.find_size_split(target_size);
+            self.commit_upto(delta_split_at);
+
+            // If it's still not large enough, request the caller to fill the window
+            if self.elems.len() == 1 && has_more {
+                return None;
+            }
+        }
+        Some(self.pop_delta())
+    }
+}
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -0,0 +1,242 @@
+//! This file contains generic utility functions over the interface types,
+//! which could be handy for any compaction implementation.
+use crate::interface::*;
+
+use futures::future::BoxFuture;
+use futures::{Stream, StreamExt};
+use itertools::Itertools;
+use pin_project_lite::pin_project;
+use std::collections::BinaryHeap;
+use std::collections::VecDeque;
+use std::future::Future;
+use std::ops::{DerefMut, Range};
+use std::pin::Pin;
+use std::task::{ready, Poll};
+
+/// Sum of the sizes of all ranges in `keyspace`, as measured by
+/// `K::key_range_size`.
+pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
+where
+    K: CompactionKey,
+{
+    let mut total: u64 = 0;
+    for range in keyspace.iter() {
+        total += K::key_range_size(range) as u64;
+    }
+    total
+}
+
+/// Do the half-open ranges `a` and `b` have at least one point in common?
+///
+/// Ranges that merely touch (`a.end == b.start`) do not overlap.
+pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
+    // Each range must end after the other one starts.
+    a.end > b.start && b.end > a.start
+}
+
+/// Replace `a` with the union of `a` and `b`.
+///
+/// Both inputs are expected to be sorted by start key. Ranges that overlap
+/// or touch each other are coalesced into one range in the result.
+pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
+    // Merge the two sorted inputs into one sequence ordered by start key.
+    let merged = [std::mem::take(a).into_iter(), b.into_iter()]
+        .into_iter()
+        .kmerge_by(|lhs, rhs| lhs.start < rhs.start);
+
+    let mut result: Vec<Range<K>> = Vec::new();
+    // The range currently being grown; it has not been pushed to `result` yet.
+    let mut pending: Option<Range<K>> = None;
+    for next in merged {
+        match pending.as_mut() {
+            None => pending = Some(next),
+            Some(cur) => {
+                assert!(next.start >= cur.start);
+                if next.start > cur.end {
+                    // There is a gap: emit the pending range, start a new one.
+                    result.push(std::mem::replace(cur, next));
+                } else if next.end > cur.end {
+                    // Overlapping or adjacent: extend the pending range.
+                    cur.end = next.end;
+                }
+            }
+        }
+    }
+    result.extend(pending);
+    *a = result
+}
+
+/// Return the part of keyspace `a` that lies within the range `r`.
+///
+/// `a` is expected to be sorted: scanning stops at the first range that
+/// starts at or after `r.end`.
+pub fn intersect_keyspace<K: Ord + Clone + Copy>(
+    a: &CompactionKeySpace<K>,
+    r: &Range<K>,
+) -> CompactionKeySpace<K> {
+    // Keep the ranges that reach into `r`, up to the first one beyond it.
+    let mut clipped: Vec<Range<K>> = a
+        .iter()
+        .filter(|x| x.end > r.start)
+        .take_while(|x| x.start < r.end)
+        .cloned()
+        .collect();
+
+    // Only the outermost ranges can stick out of `r`; trim them.
+    if let Some(head) = clipped.first_mut() {
+        head.start = std::cmp::max(head.start, r.start);
+    }
+    if let Some(tail) = clipped.last_mut() {
+        tail.end = std::cmp::min(tail.end, r.end);
+    }
+    clipped
+}
+
+/// Build a stream over the DeltaEntrys of all the given delta layers, merged
+/// into key-lsn order.
+///
+/// This is public because the create_delta() implementation likely wants to use this too
+/// TODO: move to a more shared place
+pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
+    layers: &'a [E::DeltaLayer],
+    ctx: &'a E::RequestContext,
+) -> MergeDeltaKeys<'a, E> {
+    // The merge is driven by a binary heap. Every input layer starts out as a
+    // LazyLoadLayer::Unloaded element, keyed by the start of the layer's key
+    // range. Only when a layer first reaches the top of the heap are its keys
+    // loaded into a sorted vector.
+    //
+    // That keeps memory usage in check: we only hold the DeltaEntrys of the
+    // layers that overlap with the "current" key.
+    let mut heap: BinaryHeap<LazyLoadLayer<'a, E>> = BinaryHeap::new();
+    layers
+        .iter()
+        .for_each(|layer| heap.push(LazyLoadLayer::Unloaded(layer)));
+
+    MergeDeltaKeys {
+        heap,
+        ctx,
+        load_future: None,
+    }
+}
+
+// One input layer of the merge performed by `MergeDeltaKeys`.
+enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
+    // The layer's entries have been loaded; the ones not yet returned are
+    // queued here. `key()` unwraps the front entry, so a `Loaded` element is
+    // expected to be non-empty while it sits in the heap.
+    Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
+    // The layer's entries have not been loaded yet.
+    Unloaded(&'a E::DeltaLayer),
+}
+impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
+    // The key this element is ordered by in the heap: the key of the next
+    // entry for a loaded layer, or the start of the key range for a layer
+    // that has not been loaded yet.
+    fn key(&self) -> E::Key {
+        match self {
+            Self::Loaded(entries) => entries.front().unwrap().key(),
+            Self::Unloaded(dl) => dl.key_range().start,
+        }
+    }
+}
+impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        // reverse order so that we get a min-heap
+        other.key().cmp(&self.key())
+    }
+}
+// Equality, like ordering, looks only at `key()`: two elements with the same
+// current key compare equal even if they are different layers.
+impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
+    fn eq(&self, other: &Self) -> bool {
+        self.key().eq(&other.key())
+    }
+}
+impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
+
+// Boxed future that resolves to a vector of loaded entries, or an error.
+type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
+
+// Stream returned by `merge_delta_keys`
+pin_project! {
+#[allow(clippy::type_complexity)]
+pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> {
+    // Min-heap of the input layers, ordered by each layer's current key.
+    heap: BinaryHeap<LazyLoadLayer<'a, E>>,
+
+    // Set while the keys of the layer at the top of the heap are being
+    // loaded; `None` otherwise.
+    #[pin]
+    load_future: Option<LoadFuture<'a, <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>,
+
+    // Passed through to `load_keys`.
+    ctx: &'a E::RequestContext,
+}
+}
+
+impl<'a, E> Stream for MergeDeltaKeys<'a, E>
+where
+    E: CompactionJobExecutor + 'a,
+{
+    type Item = anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>;
+
+    // Return the next entry in heap order, loading the keys of a layer the
+    // first time the layer reaches the top of the heap.
+    //
+    // If loading a layer fails, the error is returned as a stream item. The
+    // layer stays in the heap unloaded, so polling the stream again retries
+    // the load.
+    fn poll_next(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<std::option::Option<<Self as futures::Stream>::Item>> {
+        let mut this = self.project();
+        loop {
+            if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() {
+                // We are waiting for loading the keys to finish
+                match ready!(load_future.as_mut().poll(cx)) {
+                    Ok(entries) => {
+                        this.load_future.set(None);
+                        *this.heap.peek_mut().unwrap() =
+                            LazyLoadLayer::Loaded(VecDeque::from(entries));
+                    }
+                    Err(e) => {
+                        // The future has completed, so it must not be polled
+                        // again. Drop it before reporting the error; otherwise
+                        // the next poll_next() call would poll a finished
+                        // future.
+                        this.load_future.set(None);
+                        return Poll::Ready(Some(Err(e)));
+                    }
+                }
+            }
+
+            // If the topmost layer in the heap hasn't been loaded yet, start
+            // loading it. Otherwise return the next entry from it and update
+            // the layer's position in the heap (this decreaseKey operation is
+            // performed implicitly when `top` is dropped).
+            if let Some(mut top) = this.heap.peek_mut() {
+                match top.deref_mut() {
+                    LazyLoadLayer::Unloaded(ref mut l) => {
+                        let fut = l.load_keys(this.ctx);
+                        this.load_future.set(Some(fut));
+                        continue;
+                    }
+                    LazyLoadLayer::Loaded(ref mut entries) => {
+                        let result = entries.pop_front().unwrap();
+                        if entries.is_empty() {
+                            // This layer is exhausted, remove it from the heap.
+                            std::collections::binary_heap::PeekMut::pop(top);
+                        }
+                        return Poll::Ready(Some(Ok(result)));
+                    }
+                }
+            } else {
+                // No layers left: the stream is finished.
+                return Poll::Ready(None);
+            }
+        }
+    }
+}
+
+// Accumulate values at key boundaries
+//
+// One item produced by `accum_key_values`: the totals for a run of
+// consecutive entries that share the same key.
+pub struct KeySize<K> {
+    pub key: K,
+    // Number of entries in the run.
+    pub num_values: u64,
+    // Sum of `size()` over the entries in the run.
+    pub size: u64,
+}
+
+/// Collapse a stream of entries into a stream with one `KeySize` per run of
+/// consecutive entries that have the same key.
+///
+/// Only adjacent entries are grouped, so the input should be ordered by key
+/// if one item per distinct key is wanted. An error from the input ends the
+/// output stream with that error. An empty input gives an empty output.
+pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
+where
+    K: Eq,
+    I: Stream<Item = Result<D, E>>,
+    D: CompactionDeltaEntry<'a, K>,
+{
+    async_stream::try_stream! {
+        // Initialize the state from the first value
+        let mut input = std::pin::pin!(input);
+
+        if let Some(first) = input.next().await {
+            let first = first?;
+            let mut accum: KeySize<K> = KeySize {
+                key: first.key(),
+                num_values: 1,
+                size: first.size(),
+            };
+            while let Some(this) = input.next().await {
+                let this = this?;
+                if this.key() == accum.key {
+                    // Same key as before: add to the running totals.
+                    accum.size += this.size();
+                    accum.num_values += 1;
+                } else {
+                    // Key changed: emit the finished run and start a new one.
+                    yield accum;
+                    accum = KeySize {
+                        key: this.key(),
+                        num_values: 1,
+                        size: this.size(),
+                    };
+                }
+            }
+            // Emit the last run.
+            yield accum;
+        }
+    }
+}
--- a/pageserver/compaction/src/identify_levels.rs
+++ b/pageserver/compaction/src/identify_levels.rs
@@ -0,0 +1,376 @@
+//! An LSM tree consists of multiple levels, each exponentially larger than the
+//! previous level. And each level consists of multiple "tiers". With tiered
+//! compaction, a level is compacted when it has accumulated more than N tiers,
+//! forming one tier on the next level.
+//!
+//! In the pageserver, we don't explicitly track the levels and tiers. Instead,
+//! we identify them by looking at the shapes of the layers. It's an easy task
+//! for a human, but it's not straightforward to come up with the exact
+//! rules. Especially if there are cases like interrupted, half-finished
+//! compactions, or highly skewed data distributions that have let us "skip"
+//! some levels. It's not critical to classify all cases correctly; at worst we
+//! delay some compaction work, and suffer from more read amplification, or we
+//! perform some unnecessary compaction work.
+//!
+//! `identify_level` performs that shape-matching.
+//!
+//! It returns a Level struct, which has `depth()` function to count the number
+//! of "tiers" in the level. The tier count is the max depth of stacked layers
+//! within the level. That's a good measure, because the point of compacting is
+//! to reduce read amplification, and the depth is what determines that.
+//!
+//! One interesting effect of this is that if we generate very small delta
+//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
+//! because they reach the target size, the L0 compaction will combine them to
+//! one larger file. But if the combined file is still smaller than the target
+//! file size, the file will still be considered to be part of L0 at the next
+//! iteration.
+
+use anyhow::bail;
+use std::collections::BTreeSet;
+use std::ops::Range;
+use utils::lsn::Lsn;
+
+use crate::interface::*;
+
+use tracing::{info, trace};
+
+/// A level as identified by `identify_level`: the layers that belong to it
+/// and the LSN range it spans.
+pub struct Level<L> {
+    /// Start is the boundary LSN that `identify_level` found, end is the
+    /// `end_lsn` it was called with.
+    pub lsn_range: Range<Lsn>,
+    /// The layers that were accepted into the level.
+    pub layers: Vec<L>,
+}
+
+/// Identify an LSN < `end_lsn` that partitions the LSN space, so that there are
+/// no delta layers that cross the boundary LSN. The level is then the LSN range
+/// from that boundary up to `end_lsn`.
+///
+/// A further restriction is that all delta layers in the returned partition
+/// cover at most 'lsn_max_size' LSN bytes. Image layers are not subject to
+/// either restriction.
+///
+/// Returns `Ok(None)` if there are no layers below `end_lsn`, or if no
+/// boundary below `end_lsn` was found. Returns an error if a layer's LSN range
+/// straddles `end_lsn`.
+pub async fn identify_level<K, L>(
+    all_layers: Vec<L>,
+    end_lsn: Lsn,
+    lsn_max_size: u64,
+) -> anyhow::Result<Option<Level<L>>>
+where
+    K: CompactionKey,
+    L: CompactionLayer<K> + Clone,
+{
+    // filter out layers that are above the `end_lsn`, they are completely irrelevant.
+    let mut layers = Vec::new();
+    for l in all_layers {
+        if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
+            // shouldn't happen. Indicates that the caller passed a bogus
+            // end_lsn.
+            bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
+        }
+        // include image layers sitting exactly at `end_lsn`.
+        let is_image = !l.is_delta();
+        if (is_image && l.lsn_range().start > end_lsn)
+            || (!is_image && l.lsn_range().start >= end_lsn)
+        {
+            continue;
+        }
+        layers.push(l);
+    }
+    // All the remaining layers either belong to this level, or are below it.
+    info!(
+        "identify level at {}, size {}, num layers below: {}",
+        end_lsn,
+        lsn_max_size,
+        layers.len()
+    );
+    if layers.is_empty() {
+        return Ok(None);
+    }
+
+    // Walk the ranges in LSN order.
+    //
+    // ----- end_lsn
+    //  |
+    //  |
+    //  v
+    //
+    // The layers are sorted by end LSN and then consumed from the back, i.e.
+    // from the highest end LSN downwards.
+    //
+    // `candidate_*` is the boundary being tried and the layers collected for
+    // it so far. `current_best_*` is the last boundary that has been confirmed
+    // to have no delta layer crossing it, with the layers above it.
+    layers.sort_by_key(|l| l.lsn_range().end);
+    let mut candidate_start_lsn = end_lsn;
+    let mut candidate_layers: Vec<L> = Vec::new();
+    let mut current_best_start_lsn = end_lsn;
+    let mut current_best_layers: Vec<L> = Vec::new();
+    let mut iter = layers.into_iter();
+    loop {
+        let Some(l) = iter.next_back() else {
+            // Reached end. Accept the last candidate
+            current_best_start_lsn = candidate_start_lsn;
+            current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
+            break;
+        };
+        trace!(
+            "inspecting {} for candidate {}, current best {}",
+            l.short_id(),
+            candidate_start_lsn,
+            current_best_start_lsn
+        );
+
+        let r = l.lsn_range();
+
+        // Image layers don't restrict our choice of cutoff LSN
+        if l.is_delta() {
+            // Is this candidate workable? In other words, are there any
+            // delta layers that span across this LSN
+            //
+            // Valid:                 Not valid:
+            //  +                     +
+            //  |                     | +
+            //  +  <- candidate       + |   <- candidate
+            //     +                    +
+            //     |
+            //     +
+            if r.end <= candidate_start_lsn {
+                // Hooray, there are no crossing LSNs. And we have visited
+                // through all the layers within candidate..end_lsn. The
+                // current candidate can be accepted.
+                current_best_start_lsn = r.end;
+                current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
+                candidate_start_lsn = r.start;
+            }
+
+            // Is it small enough to be considered part of this level?
+            if r.end.0 - r.start.0 > lsn_max_size {
+                // Too large, this layer belongs to next level. Stop.
+                // Layers collected for the unconfirmed candidate are dropped.
+                trace!(
+                    "too large {}, size {} vs {}",
+                    l.short_id(),
+                    r.end.0 - r.start.0,
+                    lsn_max_size
+                );
+                break;
+            }
+
+            // If this crosses the candidate lsn, push it down.
+            if r.start < candidate_start_lsn {
+                trace!(
+                    "layer {} prevents from stopping at {}",
+                    l.short_id(),
+                    candidate_start_lsn
+                );
+                candidate_start_lsn = r.start;
+            }
+        }
+
+        // Include this layer in our candidate
+        candidate_layers.push(l);
+    }
+
+    Ok(if current_best_start_lsn == end_lsn {
+        // empty level
+        None
+    } else {
+        Some(Level {
+            lsn_range: current_best_start_lsn..end_lsn,
+            layers: current_best_layers,
+        })
+    })
+}
+
+// helper struct used in depth()
+//
+// Marks the point on the key axis where a layer's key range begins or ends.
+struct Event<K> {
+    // Key at which the event happens.
+    key: K,
+    // Index of the layer in `Level::layers`.
+    layer_idx: usize,
+    // true for the start of the layer's key range, false for its end.
+    start: bool,
+}
+
+impl<L> Level<L> {
+    /// Count the number of deltas stacked on each other.
+    ///
+    /// The result is the maximum, over all keys, of the number of delta layers
+    /// covering the key that sort above the topmost image layer covering it
+    /// (ordered by end LSN). Returns 0 if the level has no layers.
+    pub fn depth<K>(&self) -> u64
+    where
+        K: CompactionKey,
+        L: CompactionLayer<K>,
+    {
+        // Turn each layer into two events: the start and the end of its key
+        // range.
+        let mut events: Vec<Event<K>> = Vec::new();
+        for (idx, l) in self.layers.iter().enumerate() {
+            events.push(Event {
+                key: l.key_range().start,
+                layer_idx: idx,
+                start: true,
+            });
+            events.push(Event {
+                key: l.key_range().end,
+                layer_idx: idx,
+                start: false,
+            });
+        }
+        // Sort by key. At equal keys, `false` sorts before `true`, so end
+        // events are processed before start events.
+        events.sort_by_key(|e| (e.key, e.start));
+
+        // Sweep the key space left to right. Stop at each distinct key, and
+        // count the number of deltas on top of the highest image at that key.
+        //
+        // This is a little inefficient, as we walk through the active_set on
+        // every key. We could increment/decrement a counter on each step
+        // instead, but that'd require a bit more complex bookkeeping.
+        //
+        // The active set holds (end LSN, is_image, layer index) for every
+        // layer whose key range covers the current position.
+        let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
+        let mut max_depth = 0;
+        let mut events_iter = events.iter().peekable();
+        while let Some(e) = events_iter.next() {
+            let l = &self.layers[e.layer_idx];
+            let is_image = !l.is_delta();
+
+            // update the active set
+            if e.start {
+                active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
+            } else {
+                active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
+            }
+
+            // recalculate depth if this was the last event at this point
+            let more_events_at_this_key = events_iter
+                .peek()
+                .map_or(false, |next_e| next_e.key == e.key);
+            if !more_events_at_this_key {
+                // Walk the active layers from the highest end LSN downwards,
+                // counting deltas until the first image is met.
+                let mut active_depth = 0;
+                for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
+                    if *is_image {
+                        break;
+                    }
+                    active_depth += 1;
+                }
+                if active_depth > max_depth {
+                    max_depth = active_depth;
+                }
+            }
+        }
+        max_depth
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
+    use std::sync::{Arc, Mutex};
+
+    // Build a mock delta layer covering the given key and LSN ranges.
+    fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
+        MockLayer::Delta(Arc::new(MockDeltaLayer {
+            key_range,
+            lsn_range,
+            // identify_level() doesn't pay attention to the rest of the fields
+            file_size: 0,
+            deleted: Mutex::new(false),
+            records: vec![],
+        }))
+    }
+
+    // Build a mock image layer at `lsn`. Its LSN range is `lsn..lsn + 1`.
+    fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
+        MockLayer::Image(Arc::new(MockImageLayer {
+            key_range,
+            lsn_range: lsn..(lsn + 1),
+            // identify_level() doesn't pay attention to the rest of the fields
+            file_size: 0,
+            deleted: Mutex::new(false),
+        }))
+    }
+
+    // Delta layers stacked on each other, all covering the whole key space.
+    #[tokio::test]
+    async fn test_identify_level() -> anyhow::Result<()> {
+        let layers = vec![
+            delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
+        ];
+
+        // All layers fit in the max file size
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
+            .await?
+            .unwrap();
+        assert_eq!(level.depth(), 6);
+
+        // Same LSN with smaller max file size. The second layer from the top is larger
+        // and belongs to next level.
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
+            .await?
+            .unwrap();
+        assert_eq!(level.depth(), 1);
+
+        // Call with a smaller LSN
+        let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
+            .await?
+            .unwrap();
+        assert_eq!(level.depth(), 2);
+
+        // Call with an LSN that doesn't partition the space
+        // (0x6000 falls inside the 0x5000..0x7000 layer)
+        let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
+        assert!(result.is_err());
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
+        // The files LSN ranges overlap, so even though there are more files that
+        // fit under the file size, they are not included in the level because they
+        // overlap so that we'd need to include the oldest file, too, which is
+        // larger
+        let layers = vec![
+            delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
+            delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
+            delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
+            delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
+        ];
+
+        // Only the topmost layer ends up in the level.
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
+            .await?
+            .unwrap();
+        assert_eq!(level.depth(), 1);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
+        // The key ranges don't overlap, so depth is only 1.
+        let layers = vec![
+            delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
+            delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
+            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
+        ];
+
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
+            .await?
+            .unwrap();
+        assert_eq!(level.layers.len(), 3);
+        assert_eq!(level.depth(), 1);
+
+        // Staggered. The 1st and 3rd layer don't overlap with each other.
+        // Each of them overlaps with the 2nd one though, so the depth is 2.
+        let layers = vec![
+            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
+            delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
+            delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
+        ];
+
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
+            .await?
+            .unwrap();
+        assert_eq!(level.layers.len(), 3);
+        assert_eq!(level.depth(), 2);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_depth_images() -> anyhow::Result<()> {
+        let layers: Vec<MockLayer> = vec![
+            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
+            delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
+            delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
+            // This covers the same key range as the 2nd delta layer. The depth
+            // in that key range is therefore 0.
+            image(1500..2500, Lsn(0x9000)),
+        ];
+
+        // Outside the image's key range a single delta remains on each side,
+        // so the overall depth is 1.
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
+            .await?
+            .unwrap();
+        assert_eq!(level.layers.len(), 4);
+        assert_eq!(level.depth(), 1);
+        Ok(())
+    }
+}
--- a/pageserver/compaction/src/interface.rs
+++ b/pageserver/compaction/src/interface.rs
@@ -0,0 +1,167 @@
+//! This is what the compaction implementation needs to know about
+//! layers, keyspace etc.
+//!
+//! All the heavy lifting is done by the create_image and create_delta
+//! functions that the implementor provides.
+use async_trait::async_trait;
+use pageserver_api::{key::Key, keyspace::key_range_size};
+use std::ops::Range;
+use utils::lsn::Lsn;
+
+/// Public interface. This is the main thing that the implementor needs to provide
+///
+/// The trait has three parts: associated types describing keys and layers,
+/// query functions the planner uses to inspect existing layers, and
+/// functions that carry out the plan (create and delete layers).
+#[async_trait]
+pub trait CompactionJobExecutor {
+    // Type system.
+    //
+    // We assume that there are two kinds of layers, deltas and images. The
+    // compaction doesn't distinguish whether they are stored locally or
+    // remotely.
+    //
+    // The keyspace is defined by CompactionKey trait.
+    //
+    type Key: CompactionKey;
+
+    // `Layer` is the generic handle returned by get_layers(); `DeltaLayer`
+    // is obtained from it with downcast_delta_layer(). All three must be Clone.
+    type Layer: CompactionLayer<Self::Key> + Clone;
+    type DeltaLayer: CompactionDeltaLayer<Self> + Clone;
+    type ImageLayer: CompactionImageLayer<Self> + Clone;
+
+    // This is passed through to all the interface functions. The compaction
+    // implementation doesn't do anything with it, but it might be useful for
+    // the interface implementation.
+    type RequestContext: CompactionRequestContext;
+
+    // ----
+    // Functions that the planner uses to support its decisions
+    // ----
+
+    /// Return all layers that overlap the given bounding box.
+    async fn get_layers(
+        &mut self,
+        key_range: &Range<Self::Key>,
+        lsn_range: &Range<Lsn>,
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<Vec<Self::Layer>>;
+
+    /// Return the key space within 'key_range', as a list of key ranges
+    /// (see [`CompactionKeySpace`]).
+    ///
+    /// NOTE(review): presumably this is the set of keys that exist at 'lsn';
+    /// confirm against the implementations.
+    async fn get_keyspace(
+        &mut self,
+        key_range: &Range<Self::Key>,
+        lsn: Lsn,
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<CompactionKeySpace<Self::Key>>;
+
+    /// Convert a generic layer handle into a delta layer handle. The `Option`
+    /// in the return type leaves room for layers that are not delta layers.
+    ///
+    /// NB: This is a pretty expensive operation. In the real pageserver
+    /// implementation, it downloads the layer, and keeps it resident
+    /// until the DeltaLayer is dropped.
+    async fn downcast_delta_layer(
+        &self,
+        layer: &Self::Layer,
+    ) -> anyhow::Result<Option<Self::DeltaLayer>>;
+
+    // ----
+    // Functions to execute the plan
+    // ----
+
+    /// Create a new image layer, materializing all the values in the key range,
+    /// at given 'lsn'.
+    async fn create_image(
+        &mut self,
+        lsn: Lsn,
+        key_range: &Range<Self::Key>,
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<()>;
+
+    /// Create a new delta layer, containing all the values from 'input_layers'
+    /// in the given key and LSN range.
+    async fn create_delta(
+        &mut self,
+        lsn_range: &Range<Lsn>,
+        key_range: &Range<Self::Key>,
+        input_layers: &[Self::DeltaLayer],
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<()>;
+
+    /// Delete a layer. The compaction implementation will call this only after
+    /// all the create_image() or create_delta() calls that deletion of this
+    /// layer depends on have finished. But if the implementor has extra lazy
+    /// background tasks, like uploading the index json file to remote storage,
+    /// it is the implementation's responsibility to track those.
+    async fn delete_layer(
+        &mut self,
+        layer: &Self::Layer,
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<()>;
+}
+
+/// Operations the compaction code needs from a key type: a total order,
+/// cheap copies, printing, plus the constants and helpers below.
+pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
+    /// Lower bound of the key type.
+    const MIN: Self;
+    /// Upper bound of the key type.
+    const MAX: Self;
+
+    /// Calculate distance between key_range.start and key_range.end.
+    ///
+    /// This returns u32, for compatibility with Repository::key. If the
+    /// distance is larger, return u32::MAX.
+    fn key_range_size(key_range: &Range<Self>) -> u32;
+
+    /// Return "self + 1", i.e. the key that immediately follows this one.
+    fn next(&self) -> Self;
+
+    /// Return "self + <some decent amount to skip>". The amount to skip
+    /// is left to the implementation.
+    // FIXME: why not just "add(u32)" ?  This is hard to use
+    fn skip_some(&self) -> Self;
+}
+
+// CompactionKey implementation for the pageserver's `Key` type.
+impl CompactionKey for Key {
+    const MIN: Self = Self::MIN;
+    const MAX: Self = Self::MAX;
+
+    // Delegates to the free function `key_range_size` imported from
+    // pageserver_api::keyspace (not a recursive call to this method).
+    fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
+        key_range_size(r)
+    }
+    // NOTE(review): presumably this resolves to an inherent `Key::next`
+    // rather than recursing into this trait method - confirm.
+    fn next(&self) -> Key {
+        (self as &Key).next()
+    }
+    // The "decent amount to skip" is fixed at 128 for this key type.
+    fn skip_some(&self) -> Key {
+        self.add(128)
+    }
+}
+
+/// Contiguous ranges of keys that belong to the key space. In key order, and
+/// with no overlap.
+///
+/// Each element is a `std::ops::Range`, i.e. `start` is inclusive and `end`
+/// is exclusive.
+pub type CompactionKeySpace<K> = Vec<Range<K>>;
+
+/// Functions needed from all layers.
+pub trait CompactionLayer<K: CompactionKey + ?Sized> {
+    /// Key range of the layer.
+    fn key_range(&self) -> &Range<K>;
+    /// LSN range of the layer.
+    fn lsn_range(&self) -> &Range<Lsn>;
+
+    /// Size of the layer file. NOTE(review): presumably in bytes - confirm.
+    fn file_size(&self) -> u64;
+
+    /// For debugging, short human-readable representation of the layer. E.g. filename.
+    fn short_id(&self) -> String;
+
+    /// Whether this is a delta layer (as opposed to an image layer).
+    fn is_delta(&self) -> bool;
+}
+
+/// Extra functions needed from delta layers, on top of [`CompactionLayer`].
+#[async_trait]
+pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
+    /// Entry type returned by `load_keys`. The lifetime parameter allows an
+    /// implementation to hand out entries that borrow from the layer.
+    type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
+    where
+        Self: 'a;
+
+    /// Return all keys in this delta layer.
+    ///
+    /// One `DeltaEntry` is returned per entry, giving access to its key,
+    /// LSN and size.
+    async fn load_keys<'a>(
+        &self,
+        ctx: &E::RequestContext,
+    ) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
+}
+
+/// Marker trait for image layers. It adds no functions of its own beyond
+/// what [`CompactionLayer`] already requires.
+pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
+
+/// One entry of a delta layer, as returned by `CompactionDeltaLayer::load_keys`.
+pub trait CompactionDeltaEntry<'a, K> {
+    /// Key of the entry.
+    fn key(&self) -> K;
+    /// LSN of the entry.
+    fn lsn(&self) -> Lsn;
+    /// Size of the entry. NOTE(review): presumably in bytes - confirm.
+    fn size(&self) -> u64;
+}
+
+/// Marker trait for the context object that is handed to every
+/// `CompactionJobExecutor` function. It requires no functions.
+pub trait CompactionRequestContext {}
--- a/pageserver/compaction/src/lib.rs
+++ b/pageserver/compaction/src/lib.rs
@@ -0,0 +1,12 @@
+// The main module implementing the compaction algorithm
+pub mod compact_tiered;
+pub(crate) mod identify_levels;
+
+// Traits that the caller of the compaction needs to implement
+pub mod interface;
+
+// Utility functions, useful for the implementation
+pub mod helpers;
+
+// A simulator with mock implementations of 'interface'
+pub mod simulator;
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -0,0 +1,613 @@
+mod draw;
+
+use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
+
+use async_trait::async_trait;
+use futures::StreamExt;
+use rand::Rng;
+use tracing::info;
+
+use utils::lsn::Lsn;
+
+use std::fmt::Write;
+use std::ops::Range;
+use std::sync::Arc;
+use std::sync::Mutex;
+
+use crate::helpers::{merge_delta_keys, overlaps_with};
+
+use crate::interface;
+use crate::interface::CompactionLayer;
+
+//
+// Implementation for the CompactionJobExecutor interface
+//
+// Simulated timeline. Records are ingested into an in-memory layer, flushed
+// to "on-disk" L0 delta layers, and compacted with the tiered algorithm.
+pub struct MockTimeline {
+    // Parameters for the compaction algorithm
+    //
+    // target_file_size is also the threshold at which ingest_record()
+    // flushes the in-memory layer.
+    pub target_file_size: u64,
+    tiers_per_level: u64,
+
+    // Number of L0 flushes so far, and its value at the last compaction.
+    // compact_if_needed() compares the two.
+    num_l0_flushes: u64,
+    last_compact_at_flush: u64,
+    // end_lsn at the time of the last flush; passed to compact_tiered()
+    last_flush_lsn: Lsn,
+
+    // In-memory layer
+    //
+    // total_len is the sum of the lengths of `records`. start_lsn..end_lsn
+    // is the LSN range they cover; end_lsn advances by the record length
+    // on every ingested record.
+    records: Vec<MockRecord>,
+    total_len: u64,
+    start_lsn: Lsn,
+    end_lsn: Lsn,
+
+    // Current keyspace. ingest_uniform() extends it with the key range it
+    // is asked to fill; ingest_record() does not touch it.
+    keyspace: KeySpace,
+
+    // historic keyspaces
+    //
+    // Nothing in this file adds entries here; get_keyspace() panics if
+    // this is non-empty.
+    old_keyspaces: Vec<(Lsn, KeySpace)>,
+
+    // "on-disk" layers
+    //
+    // Layers that have been marked deleted are pruned lazily, in get_layers().
+    pub live_layers: Vec<MockLayer>,
+
+    // incremented by delete_layer()
+    num_deleted_layers: u64,
+
+    // Statistics, printed by stats()
+    wal_ingested: u64,
+    bytes_written: u64,
+    bytes_deleted: u64,
+    layers_created: u64,
+    layers_deleted: u64,
+
+    // All the events - creation and deletion of files - are collected
+    // in 'history'. It is used to draw the SVG animation at the end.
+    //
+    // 'time' is a logical clock: it is incremented by one for each event.
+    time: u64,
+    history: Vec<draw::LayerTraceEvent>,
+}
+
+// Key space over the simulator's own `Key` type.
+type KeySpace = interface::CompactionKeySpace<Key>;
+
+// Request context for the simulator. It carries no data; it exists only to
+// satisfy the `RequestContext` associated type of the executor trait.
+pub struct MockRequestContext {}
+impl interface::CompactionRequestContext for MockRequestContext {}
+
+// The simulator uses plain u64 values as keys.
+pub type Key = u64;
+
+// CompactionKey for the simulator's u64 keys: plain integer arithmetic.
+impl interface::CompactionKey for Key {
+    const MIN: Self = u64::MIN;
+    const MAX: Self = u64::MAX;
+
+    // Width of the range, clamped to u32::MAX if it doesn't fit in a u32.
+    fn key_range_size(key_range: &Range<Self>) -> u32 {
+        let width = key_range.end - key_range.start;
+        width.min(u32::MAX as u64) as u32
+    }
+
+    // The key immediately after this one.
+    fn next(&self) -> Self {
+        *self + 1
+    }
+
+    // Jump ahead by a fixed stride of 100 keys.
+    fn skip_some(&self) -> Self {
+        *self + 100
+    }
+}
+
+// One simulated WAL record. Only the metadata is kept, there is no payload.
+#[derive(Clone)]
+pub struct MockRecord {
+    // LSN at which the record was ingested
+    lsn: Lsn,
+    // key that the record modifies
+    key: Key,
+    // length of the record; ingesting it advances the timeline's LSN by this much
+    len: u64,
+}
+
+// A MockRecord serves directly as its own delta entry: each accessor
+// hands back a copy of the corresponding field.
+impl interface::CompactionDeltaEntry<'_, Key> for MockRecord {
+    fn key(&self) -> Key {
+        let MockRecord { key, .. } = *self;
+        key
+    }
+
+    fn lsn(&self) -> Lsn {
+        let MockRecord { lsn, .. } = *self;
+        lsn
+    }
+
+    // The record length doubles as the entry size.
+    fn size(&self) -> u64 {
+        let MockRecord { len, .. } = *self;
+        len
+    }
+}
+
+// Simulated delta layer. The records are kept in memory, inside the struct.
+pub struct MockDeltaLayer {
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,
+
+    // Sum of the record lengths (create_delta() starts the sum at 2, not 0)
+    pub file_size: u64,
+
+    // Set by MockLayer::mark_deleted(). In a Mutex so that it can be
+    // flipped through a shared Arc.
+    pub deleted: Mutex<bool>,
+
+    pub records: Vec<MockRecord>,
+}
+
+// Layer accessors for a shared delta layer. Everything is read straight
+// from the fields of the underlying MockDeltaLayer.
+impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
+    fn key_range(&self) -> &Range<Key> {
+        &self.key_range
+    }
+
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.lsn_range
+    }
+
+    fn file_size(&self) -> u64 {
+        self.file_size
+    }
+
+    // "<key start>-<key end>__<lsn start>-<lsn end>", all in upper-case hex
+    fn short_id(&self) -> String {
+        let keys = &self.key_range;
+        let lsns = &self.lsn_range;
+        format!(
+            "{:016X}-{:016X}__{:08X}-{:08X}",
+            keys.start, keys.end, lsns.start.0, lsns.end.0
+        )
+    }
+
+    fn is_delta(&self) -> bool {
+        true
+    }
+}
+
+// Delta-layer interface for the mock. The entries are plain MockRecords.
+#[async_trait]
+impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
+    // Entries are owned copies, so the lifetime parameter is unused here.
+    type DeltaEntry<'a> = MockRecord;
+
+    // Returns a clone of all the records stored in the layer. No I/O is
+    // involved, and the context is ignored.
+    async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result<Vec<MockRecord>> {
+        Ok(self.records.clone())
+    }
+}
+
+// Simulated image layer. Only metadata is tracked, there are no page contents.
+pub struct MockImageLayer {
+    pub key_range: Range<Key>,
+    // create_image() sets this to the empty range lsn..lsn
+    pub lsn_range: Range<Lsn>,
+
+    pub file_size: u64,
+
+    // Set by MockLayer::mark_deleted(). In a Mutex so that it can be
+    // flipped through a shared Arc.
+    pub deleted: Mutex<bool>,
+}
+
+// Marker impl: the image layer trait has no functions of its own.
+impl interface::CompactionImageLayer<MockTimeline> for Arc<MockImageLayer> {}
+
+// Layer accessors for a shared image layer. Everything is read straight
+// from the fields of the underlying MockImageLayer.
+impl interface::CompactionLayer<Key> for Arc<MockImageLayer> {
+    fn key_range(&self) -> &Range<Key> {
+        &self.key_range
+    }
+
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.lsn_range
+    }
+
+    fn file_size(&self) -> u64 {
+        self.file_size
+    }
+
+    // "<key start>-<key end>__<lsn>", all in upper-case hex. Only the start
+    // of the LSN range is printed.
+    fn short_id(&self) -> String {
+        let keys = &self.key_range;
+        let lsn = self.lsn_range.start;
+        format!("{:016X}-{:016X}__{:08X}", keys.start, keys.end, lsn.0,)
+    }
+
+    fn is_delta(&self) -> bool {
+        false
+    }
+}
+
+impl MockTimeline {
+    // Create an empty timeline: no records, no layers, all counters at zero.
+    //
+    // Defaults: 256 MiB target file size, 4 tiers per level, and an
+    // in-memory layer that starts at LSN 1000.
+    pub fn new() -> Self {
+        Self {
+            // compaction parameters
+            target_file_size: 256 * 1024 * 1024,
+            tiers_per_level: 4,
+
+            // flush / compaction bookkeeping
+            num_l0_flushes: 0,
+            last_compact_at_flush: 0,
+            last_flush_lsn: Lsn(0),
+
+            // in-memory layer
+            records: vec![],
+            total_len: 0,
+            start_lsn: Lsn(1000),
+            end_lsn: Lsn(1000),
+
+            // keyspaces
+            keyspace: Vec::new(),
+            old_keyspaces: Vec::new(),
+
+            // "on-disk" layers
+            live_layers: Vec::new(),
+            num_deleted_layers: 0,
+
+            // statistics
+            wal_ingested: 0,
+            bytes_written: 0,
+            bytes_deleted: 0,
+            layers_created: 0,
+            layers_deleted: 0,
+
+            // event trace
+            time: 0,
+            history: vec![],
+        }
+    }
+
+    // Run one round of tiered compaction, with this timeline as the executor.
+    // Everything up to the last flushed LSN is considered.
+    pub async fn compact(&mut self) -> anyhow::Result<()> {
+        let ctx = MockRequestContext {};
+
+        // Copy the parameters out first, then hand `self` over as the executor
+        let end_lsn = self.last_flush_lsn;
+        let target_file_size = self.target_file_size;
+        let fanout = self.tiers_per_level;
+
+        crate::compact_tiered::compact_tiered(self, end_lsn, target_file_size, fanout, &ctx)
+            .await?;
+
+        Ok(())
+    }
+
+    // Append one record of `len` bytes for `key` to the in-memory layer.
+    //
+    // The record is stamped with the current end LSN, which then advances
+    // by `len`. Once the in-memory layer has grown past `target_file_size`,
+    // it is flushed as an L0 layer.
+    pub fn ingest_record(&mut self, key: Key, len: u64) {
+        let record = MockRecord {
+            lsn: self.end_lsn,
+            key,
+            len,
+        };
+        self.records.push(record);
+
+        self.end_lsn += len;
+        self.total_len += len;
+
+        let over_target = self.total_len > self.target_file_size;
+        if over_target {
+            self.flush_l0();
+        }
+    }
+
+    // Compact, but only if at least `tiers_per_level` L0 layers have been
+    // flushed since the previous compaction.
+    pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> {
+        let flushes_since_compaction = self.num_l0_flushes - self.last_compact_at_flush;
+        if flushes_since_compaction < self.tiers_per_level {
+            return Ok(());
+        }
+
+        self.compact().await?;
+        self.last_compact_at_flush = self.num_l0_flushes;
+        Ok(())
+    }
+
+    // Turn the in-memory layer into a new L0 delta layer. Does nothing if
+    // there are no records.
+    //
+    // The new layer covers the whole key space and the LSN range of the
+    // in-memory layer, and holds the records sorted by key.
+    pub fn flush_l0(&mut self) {
+        if self.records.is_empty() {
+            return;
+        }
+
+        // `take` leaves an empty Vec behind, so the record buffer is
+        // already reset for the next batch.
+        let mut sorted = std::mem::take(&mut self.records);
+        sorted.sort_by_key(|r| r.key);
+
+        let flushed = Arc::new(MockDeltaLayer {
+            key_range: Key::MIN..Key::MAX,
+            lsn_range: self.start_lsn..self.end_lsn,
+            file_size: self.total_len,
+            records: sorted,
+            deleted: Mutex::new(false),
+        });
+        let layer_id = flushed.short_id();
+        info!("flushed L0 layer {}", layer_id);
+
+        // the next in-memory layer starts where this one ended
+        self.start_lsn = self.end_lsn;
+        self.last_flush_lsn = self.end_lsn;
+        self.total_len = 0;
+
+        // statistics
+        self.num_l0_flushes += 1;
+        self.layers_created += 1;
+        self.bytes_written += flushed.file_size;
+
+        // record the event for the animation
+        self.time += 1;
+        self.history.push(LayerTraceEvent {
+            time_rel: self.time,
+            op: LayerTraceOp::Flush,
+            file: LayerTraceFile {
+                filename: layer_id,
+                key_range: flushed.key_range.clone(),
+                lsn_range: flushed.lsn_range.clone(),
+            },
+        });
+
+        self.live_layers.push(MockLayer::Delta(flushed));
+    }
+
+    // Ingest `num_records` records of `len` bytes each. The keys are random,
+    // uniformly distributed in `key_range`.
+    //
+    // `key_range` is added to the timeline's keyspace up front, and
+    // `wal_ingested` grows by `len` for every record.
+    pub fn ingest_uniform(
+        &mut self,
+        num_records: u64,
+        len: u64,
+        key_range: &Range<Key>,
+    ) -> anyhow::Result<()> {
+        let added = vec![key_range.clone()];
+        crate::helpers::union_to_keyspace(&mut self.keyspace, added);
+
+        let mut rng = rand::thread_rng();
+        let mut remaining = num_records;
+        while remaining > 0 {
+            let key = rng.gen_range(key_range.clone());
+            self.ingest_record(key, len);
+            self.wal_ingested += len;
+            remaining -= 1;
+        }
+        Ok(())
+    }
+
+    // Render the statistics counters as a multi-line, human-readable report.
+    //
+    // Byte counters are printed in whole MB (integer division by 1024 * 1024).
+    // The two amplification figures are ratios relative to `wal_ingested`.
+    pub fn stats(&self) -> anyhow::Result<String> {
+        const MB: u64 = 1024 * 1024;
+
+        let ingested = self.wal_ingested as f64;
+        let write_amp = self.bytes_written as f64 / ingested;
+        let live_bytes = self.bytes_written - self.bytes_deleted;
+        let storage_amp = live_bytes as f64 / ingested;
+
+        let mut report = String::new();
+        writeln!(report, "STATISTICS:")?;
+        writeln!(report, "WAL ingested:   {:>10} MB", self.wal_ingested / MB)?;
+        writeln!(report, "size created:   {:>10} MB", self.bytes_written / MB)?;
+        writeln!(report, "size deleted:   {:>10} MB", self.bytes_deleted / MB)?;
+        writeln!(report, "files created:     {:>10}", self.layers_created)?;
+        writeln!(report, "files deleted:     {:>10}", self.layers_deleted)?;
+        writeln!(report, "write amp:         {:>10.2}", write_amp)?;
+        writeln!(report, "storage amp:       {:>10.2}", storage_amp)?;
+
+        Ok(report)
+    }
+
+    // Write a visualization of the recorded layer history to `output`.
+    // All the work is done by draw::draw_history().
+    pub fn draw_history<W: std::io::Write>(&self, output: W) -> anyhow::Result<()> {
+        draw::draw_history(&self.history, output)
+    }
+}
+
+// The default timeline is the same as the one built by MockTimeline::new().
+impl Default for MockTimeline {
+    fn default() -> Self {
+        MockTimeline::new()
+    }
+}
+
+// Handle to either kind of mock layer. Cloning is cheap: it only clones
+// the Arc, so all clones share the same `deleted` flag.
+#[derive(Clone)]
+pub enum MockLayer {
+    Delta(Arc<MockDeltaLayer>),
+    Image(Arc<MockImageLayer>),
+}
+
+// Layer accessors for the enum. Each one forwards to the wrapped delta or
+// image layer, except is_delta(), which is answered from the variant.
+impl interface::CompactionLayer<Key> for MockLayer {
+    fn key_range(&self) -> &Range<Key> {
+        match self {
+            MockLayer::Delta(delta) => delta.key_range(),
+            MockLayer::Image(image) => image.key_range(),
+        }
+    }
+
+    fn lsn_range(&self) -> &Range<Lsn> {
+        match self {
+            MockLayer::Delta(delta) => delta.lsn_range(),
+            MockLayer::Image(image) => image.lsn_range(),
+        }
+    }
+
+    fn file_size(&self) -> u64 {
+        match self {
+            MockLayer::Delta(delta) => delta.file_size(),
+            MockLayer::Image(image) => image.file_size(),
+        }
+    }
+
+    fn short_id(&self) -> String {
+        match self {
+            MockLayer::Delta(delta) => delta.short_id(),
+            MockLayer::Image(image) => image.short_id(),
+        }
+    }
+
+    fn is_delta(&self) -> bool {
+        matches!(self, MockLayer::Delta(_))
+    }
+}
+
+impl MockLayer {
+    // The `deleted` flag of the wrapped layer, whichever kind it is.
+    fn deleted_flag(&self) -> &Mutex<bool> {
+        match self {
+            MockLayer::Delta(delta) => &delta.deleted,
+            MockLayer::Image(image) => &image.deleted,
+        }
+    }
+
+    // Has mark_deleted() been called on this layer?
+    fn is_deleted(&self) -> bool {
+        *self.deleted_flag().lock().unwrap()
+    }
+
+    // Flag the layer as deleted. Panics if it was flagged already.
+    fn mark_deleted(&self) {
+        let mut flag = self.deleted_flag().lock().unwrap();
+        assert!(!*flag, "layer already deleted");
+        *flag = true;
+    }
+}
+
+// Wrap a shared delta layer. Only the Arc is cloned, not the layer.
+impl From<&Arc<MockDeltaLayer>> for MockLayer {
+    fn from(l: &Arc<MockDeltaLayer>) -> Self {
+        Self::Delta(Arc::clone(l))
+    }
+}
+
+// Wrap a shared image layer. Only the Arc is cloned, not the layer.
+impl From<&Arc<MockImageLayer>> for MockLayer {
+    fn from(l: &Arc<MockImageLayer>) -> Self {
+        Self::Image(Arc::clone(l))
+    }
+}
+
+#[async_trait]
+impl interface::CompactionJobExecutor for MockTimeline {
+    type Key = Key;
+    type Layer = MockLayer;
+    type DeltaLayer = Arc<MockDeltaLayer>;
+    type ImageLayer = Arc<MockImageLayer>;
+    type RequestContext = MockRequestContext;
+
+    // Return handles to all live layers that overlap the given bounding box,
+    // i.e. overlap in both the LSN and the key dimension.
+    //
+    // As a side effect, layers that have been marked deleted are dropped
+    // from `live_layers`.
+    async fn get_layers(
+        &mut self,
+        key_range: &Range<Self::Key>,
+        lsn_range: &Range<Lsn>,
+        _ctx: &Self::RequestContext,
+    ) -> anyhow::Result<Vec<Self::Layer>> {
+        // Forget about deleted layers first
+        self.live_layers.retain(|layer| !layer.is_deleted());
+
+        let mut overlapping: Vec<MockLayer> = Vec::new();
+        for layer in &self.live_layers {
+            let in_box = overlaps_with(layer.lsn_range(), lsn_range)
+                && overlaps_with(layer.key_range(), key_range);
+            if in_box {
+                overlapping.push(layer.clone());
+            }
+        }
+
+        Ok(overlapping)
+    }
+
+    // Return the part of the current keyspace that falls within `key_range`.
+    //
+    // The requested LSN is ignored. The mock implementation only allows
+    // requesting the keyspace at the level's end LSN, which is all that the
+    // current implementation needs. Historic keyspaces are not implemented:
+    // if any exist, this panics.
+    async fn get_keyspace(
+        &mut self,
+        key_range: &Range<Self::Key>,
+        _lsn: Lsn,
+        _ctx: &Self::RequestContext,
+    ) -> anyhow::Result<interface::CompactionKeySpace<Key>> {
+        let have_historic_keyspaces = !self.old_keyspaces.is_empty();
+        if have_historic_keyspaces {
+            // not implemented
+            panic!("keyspace not available for requested lsn");
+        }
+
+        let within_range = crate::helpers::intersect_keyspace(&self.keyspace, key_range);
+        Ok(within_range)
+    }
+
+    // Return the delta layer behind `layer`, or None if it is an image
+    // layer. In the mock this only clones an Arc.
+    async fn downcast_delta_layer(
+        &self,
+        layer: &MockLayer,
+    ) -> anyhow::Result<Option<Arc<MockDeltaLayer>>> {
+        let delta = if let MockLayer::Delta(inner) = layer {
+            Some(Arc::clone(inner))
+        } else {
+            None
+        };
+        Ok(delta)
+    }
+
+    // Create a mock image layer for `key_range` at `lsn`.
+    //
+    // The file size is modelled as 8192 bytes for every key of the keyspace
+    // within `key_range`. The layer is added to the live layers, and the
+    // statistics and the event history are updated.
+    async fn create_image(
+        &mut self,
+        lsn: Lsn,
+        key_range: &Range<Key>,
+        ctx: &MockRequestContext,
+    ) -> anyhow::Result<()> {
+        let num_keys: u64 = self
+            .get_keyspace(key_range, lsn, ctx)
+            .await?
+            .iter()
+            .map(|r| r.end - r.start)
+            .sum();
+
+        let image = Arc::new(MockImageLayer {
+            key_range: key_range.clone(),
+            // an image layer has an empty LSN range
+            lsn_range: lsn..lsn,
+            file_size: num_keys * 8192,
+            deleted: Mutex::new(false),
+        });
+        let image_id = image.short_id();
+        info!(
+            "created image layer, size {}: {}",
+            image.file_size, image_id
+        );
+
+        // update stats
+        self.layers_created += 1;
+        self.bytes_written += image.file_size;
+
+        // record the event for the animation
+        self.time += 1;
+        self.history.push(LayerTraceEvent {
+            time_rel: self.time,
+            op: LayerTraceOp::CreateImage,
+            file: LayerTraceFile {
+                filename: image_id,
+                key_range: image.key_range.clone(),
+                lsn_range: image.lsn_range.clone(),
+            },
+        });
+
+        self.live_layers.push(MockLayer::Image(image));
+
+        Ok(())
+    }
+
+    // Create a mock delta layer that contains all the records from
+    // `input_layers` that fall within `key_range` and `lsn_range`.
+    //
+    // The layer is added to the live layers, and the statistics and the
+    // event history are updated.
+    async fn create_delta(
+        &mut self,
+        lsn_range: &Range<Lsn>,
+        key_range: &Range<Key>,
+        input_layers: &[Arc<MockDeltaLayer>],
+        ctx: &MockRequestContext,
+    ) -> anyhow::Result<()> {
+        // Stream over the entries of all the input layers. It is pinned on
+        // the stack so that next() can be called on it.
+        let mut key_value_stream =
+            std::pin::pin!(merge_delta_keys::<MockTimeline>(input_layers, ctx));
+        let mut records: Vec<MockRecord> = Vec::new();
+        // NOTE(review): the size starts at 2 rather than 0; the reason is
+        // not visible here - confirm whether this is intentional.
+        let mut total_len = 2;
+        while let Some(delta_entry) = key_value_stream.next().await {
+            let delta_entry: MockRecord = delta_entry?;
+            // Keep only the entries inside the bounding box of the new layer
+            if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) {
+                total_len += delta_entry.len;
+                records.push(delta_entry);
+            }
+        }
+        // Remember the count now; `records` is moved into the layer below
+        let total_records = records.len();
+        let new_layer = Arc::new(MockDeltaLayer {
+            key_range: key_range.clone(),
+            lsn_range: lsn_range.clone(),
+            file_size: total_len,
+            records,
+            deleted: Mutex::new(false),
+        });
+        info!(
+            "created delta layer, recs {}, size {}: {}",
+            total_records,
+            total_len,
+            new_layer.short_id()
+        );
+        self.live_layers.push(MockLayer::Delta(new_layer.clone()));
+
+        // update stats
+        self.bytes_written += total_len;
+        self.layers_created += 1;
+
+        // record the event for the animation
+        self.time += 1;
+        self.history.push(LayerTraceEvent {
+            time_rel: self.time,
+            op: LayerTraceOp::CreateDelta,
+            file: LayerTraceFile {
+                filename: new_layer.short_id(),
+                key_range: new_layer.key_range.clone(),
+                lsn_range: new_layer.lsn_range.clone(),
+            },
+        });
+
+        Ok(())
+    }
+
+    // Mark `layer` as deleted, and account for it in the statistics and the
+    // event history.
+    //
+    // The layer is only flagged here. get_layers() drops flagged layers from
+    // `live_layers` the next time it runs. Panics (in mark_deleted()) if the
+    // layer has already been deleted.
+    async fn delete_layer(
+        &mut self,
+        layer: &Self::Layer,
+        _ctx: &MockRequestContext,
+    ) -> anyhow::Result<()> {
+        info!("deleting layer: {}", layer.short_id());
+        self.num_deleted_layers += 1;
+        // stats() reports `layers_deleted` as "files deleted", so that
+        // counter has to be kept up to date here as well.
+        self.layers_deleted += 1;
+        self.bytes_deleted += layer.file_size();
+        layer.mark_deleted();
+
+        // record the event for the animation
+        self.time += 1;
+        self.history.push(LayerTraceEvent {
+            time_rel: self.time,
+            op: LayerTraceOp::Delete,
+            file: LayerTraceFile {
+                filename: layer.short_id(),
+                key_range: layer.key_range().clone(),
+                lsn_range: layer.lsn_range().clone(),
+            },
+        });
+
+        Ok(())
+    }
+}
--- a/pageserver/compaction/src/simulator/draw.rs
+++ b/pageserver/compaction/src/simulator/draw.rs
@@ -0,0 +1,411 @@
+use super::Key;
+use anyhow::Result;
+use std::cmp::Ordering;
+use std::{
+    collections::{BTreeMap, BTreeSet, HashSet},
+    fmt::Write,
+    ops::Range,
+};
+use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
+use utils::lsn::Lsn;
+
+// Map values to their compressed coordinate - the index the value
+// would have in a sorted and deduplicated list of all values.
+//
+// `stretch` is a scale factor that is applied to every mapped coordinate.
+struct CoordinateMap<T: Ord + Copy> {
+    map: BTreeMap<T, usize>,
+    stretch: f32,
+}
+
+impl<T: Ord + Copy> CoordinateMap<T> {
+    // Build the map from all the values that will be looked up later.
+    // Duplicates are fine, they share one coordinate.
+    fn new(coords: Vec<T>, stretch: f32) -> Self {
+        // The BTreeSet sorts and deduplicates; the position in it is the
+        // compressed coordinate.
+        let set: BTreeSet<T> = coords.into_iter().collect();
+        let map: BTreeMap<T, usize> = set.into_iter().enumerate().map(|(i, e)| (e, i)).collect();
+
+        Self { map, stretch }
+    }
+
+    // This assumes that the map contains an exact point for this.
+    // Use _map_inexact for values inbetween. Panics if `val` is not
+    // in the map.
+    fn map(&self, val: T) -> f32 {
+        *self.map.get(&val).unwrap() as f32 * self.stretch
+    }
+
+    // Map a value that lies between two known points.
+    //
+    // the value is still assumed to be within the min/max bounds,
+    // otherwise this panics.
+    // (this is currently unused)
+    fn _map_inexact(&self, val: T) -> f32 {
+        // Nearest known points at or below, and at or above, `val`. The
+        // lower one is the *last* entry of `..=val`, hence next_back().
+        let prev = *self.map.range(..=val).next_back().unwrap().1;
+        let next = *self.map.range(val..).next().unwrap().1;
+
+        // T has no notion of distance, so the best we can do is to place the
+        // value halfway between its neighbours. For an exact point, prev and
+        // next are the same and the result agrees with map().
+        (prev as f32 + (next - prev) as f32 / 2.0) * self.stretch
+    }
+
+    // Upper bound of the coordinate space: number of distinct values,
+    // times the stretch factor.
+    fn max(&self) -> f32 {
+        self.map.len() as f32 * self.stretch
+    }
+}
+
+// The kinds of events that can happen to a layer file.
+#[derive(PartialEq, Hash, Eq)]
+pub enum LayerTraceOp {
+    Flush,
+    CreateDelta,
+    CreateImage,
+    Delete,
+}
+
+// Prints the lower-case, snake_case name of the operation.
+impl std::fmt::Display for LayerTraceOp {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            LayerTraceOp::Flush => "flush",
+            LayerTraceOp::CreateDelta => "create_delta",
+            LayerTraceOp::CreateImage => "create_image",
+            LayerTraceOp::Delete => "delete",
+        })
+    }
+}
+
+// Identity and extent of one layer file in the trace.
+#[derive(PartialEq, Hash, Eq, Clone)]
+pub struct LayerTraceFile {
+    pub filename: String,
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,
+}
+
+impl LayerTraceFile {
+    // A file counts as an image layer if its LSN range starts and ends
+    // at the same LSN.
+    fn is_image(&self) -> bool {
+        let Range { start, end } = &self.lsn_range;
+        end == start
+    }
+}
+
+// One entry in the layer history: what happened to which file, and when.
+pub struct LayerTraceEvent {
+    // Logical timestamp of the event. The simulator increments its clock
+    // by one for each event.
+    pub time_rel: u64,
+    pub op: LayerTraceOp,
+    pub file: LayerTraceFile,
+}
+
+pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output: W) -> Result<()> {
+    let mut files: Vec<LayerTraceFile> = Vec::new();
+
+    for event in history {
+        files.push(event.file.clone());
+    }
+    let last_time_rel = history.last().unwrap().time_rel;
+
+    // Collect all coordinates
+    let mut keys: Vec<Key> = vec![];
+    let mut lsns: Vec<Lsn> = vec![];
+    for f in files.iter() {
+        keys.push(f.key_range.start);
+        keys.push(f.key_range.end);
+        lsns.push(f.lsn_range.start);
+        lsns.push(f.lsn_range.end);
+    }
+
+    // Analyze
+    let key_map = CoordinateMap::new(keys, 2.0);
+    // Stretch out vertically for better visibility
+    let lsn_map = CoordinateMap::new(lsns, 3.0);
+
+    let mut svg = String::new();
+
+    // Draw
+    writeln!(
+        svg,
+        "{}",
+        BeginSvg {
+            w: key_map.max(),
+            h: lsn_map.max(),
+        }
+    )?;
+    let lsn_max = lsn_map.max();
+
+    // Sort the files by LSN, but so that image layers go after all delta layers
+    // The SVG is painted in the order the elements appear, and we want to draw
+    // image layers on top of the delta layers if they overlap
+    //
+    // (This could also be implemented via z coordinates: image layers get one z
+    // coord, delta layers get another z coord.)
+    let mut files_sorted: Vec<LayerTraceFile> = files.into_iter().collect();
+    files_sorted.sort_by(|a, b| {
+        if a.is_image() && !b.is_image() {
+            Ordering::Greater
+        } else if !a.is_image() && b.is_image() {
+            Ordering::Less
+        } else {
+            a.lsn_range.end.cmp(&b.lsn_range.end)
+        }
+    });
+
+    writeln!(svg, "<!-- layers -->")?;
+    let mut files_seen = HashSet::new();
+    for f in files_sorted {
+        if files_seen.contains(&f) {
+            continue;
+        }
+        let key_start = key_map.map(f.key_range.start);
+        let key_end = key_map.map(f.key_range.end);
+        let key_diff = key_end - key_start;
+
+        if key_start >= key_end {
+            panic!("Invalid key range {}-{}", key_start, key_end);
+        }
+
+        let lsn_start = lsn_map.map(f.lsn_range.start);
+        let lsn_end = lsn_map.map(f.lsn_range.end);
+
+        // Fill in and thicken rectangle if it's an
+        // image layer so that we can see it.
+        let mut style = Style::default();
+        style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
+        style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);
+
+        let y_start = lsn_max - lsn_start;
+        let y_end = lsn_max - lsn_end;
+
+        let x_margin = 0.25;
+        let y_margin = 0.5;
+
+        match f.lsn_range.start.cmp(&f.lsn_range.end) {
+            Ordering::Less => {
+                write!(
+                    svg,
+                    r#"    <rect id="layer_{}" x="{}" y="{}" width="{}" height="{}" ry="{}" style="{}">"#,
+                    f.filename,
+                    key_start + x_margin,
+                    y_end + y_margin,
+                    key_diff - x_margin * 2.0,
+                    y_start - y_end - y_margin * 2.0,
+                    1.0, // border_radius,
+                    style,
+                )?;
+                write!(svg, "<title>{}</title>", f.filename)?;
+                writeln!(svg, "</rect>")?;
+            }
+            Ordering::Equal => {
+                //lsn_diff = 0.3;
+                //lsn_offset = -lsn_diff / 2.0;
+                //margin = 0.05;
+                style.fill = Fill::Color(rgb(0x80, 0, 0x80));
+                style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0);
+                write!(
+                    svg,
+                    r#"    <line id="layer_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}">"#,
+                    f.filename,
+                    key_start + x_margin,
+                    y_end,
+                    key_end - x_margin,
+                    y_end,
+                    style,
+                )?;
+                write!(
+                    svg,
+                    "<title>{}<br>{} - {}</title>",
+                    f.filename, lsn_end, y_end
+                )?;
+                writeln!(svg, "</line>")?;
+            }
+            Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
+        }
+        files_seen.insert(f);
+    }
+
+    let mut record_style = Style::default();
+    record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
+    record_style.stroke = Stroke::None;
+
+    writeln!(svg, "{}", EndSvg)?;
+
+    let mut layer_events_str = String::new();
+    let mut first = true;
+    for e in history {
+        if !first {
+            writeln!(layer_events_str, ",")?;
+        }
+        write!(
+            layer_events_str,
+            r#"  {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#,
+            e.time_rel, e.file.filename, e.op
+        )?;
+        first = false;
+    }
+    writeln!(layer_events_str)?;
+
+    writeln!(
+        output,
+        r#"<!DOCTYPE html>
+<html>
+<head>
+<style>
+/* Keep the slider pinned at top */
+.topbar {{
+  display: block;
+  overflow: hidden;
+  background-color: lightgrey;
+  position: fixed;
+  top: 0;
+  width: 100%;
+/*  width: 500px; */
+}}
+.slidercontainer {{
+  float: left;
+  width: 50%;
+  margin-right: 200px;
+}}
+.slider {{
+  float: left;
+  width: 100%;
+}}
+.legend {{
+  width: 200px;
+  float: right;
+}}
+
+/* Main content */
+.main {{
+  margin-top: 50px; /* Add a top margin to avoid content overlay */
+}}
+</style>
+</head>
+
+  <body onload="init()">
+    <script type="text/javascript">
+
+      var layer_events = [{layer_events_str}]
+
+      let ticker;
+
+      function init() {{
+          for (let i = 0; i < layer_events.length; i++) {{
+              var layer = document.getElementById("layer_" + layer_events[i].filename);
+              layer.style.visibility = "hidden";
+          }}
+          last_layer_event = -1;
+          moveSlider(last_slider_pos)
+      }}
+
+      function startAnimation() {{
+          ticker = setInterval(animateStep, 100);
+      }}
+      function stopAnimation() {{
+          clearInterval(ticker);
+      }}
+
+      function animateStep() {{
+          if (last_layer_event < layer_events.length - 1) {{
+              var slider = document.getElementById("time-slider");
+              let prevPos = slider.value
+              let nextEvent = last_layer_event + 1
+              while (nextEvent <= layer_events.length - 1) {{
+                  if (layer_events[nextEvent].time_rel > prevPos) {{
+                      break;
+                  }}
+                  nextEvent += 1;
+              }}
+              let nextPos = layer_events[nextEvent].time_rel
+              slider.value = nextPos
+              moveSlider(nextPos)
+          }}
+      }}
+
+      function redoLayerEvent(n, dir) {{
+          var layer = document.getElementById("layer_" + layer_events[n].filename);
+          switch (layer_events[n].op) {{
+              case "flush":
+                  layer.style.visibility = "visible";
+                  break;
+              case "create_delta":
+                  layer.style.visibility = "visible";
+                  break;
+              case "create_image":
+                  layer.style.visibility = "visible";
+                  break;
+              case "delete":
+                  layer.style.visibility = "hidden";
+                  break;
+          }}
+      }}
+      function undoLayerEvent(n) {{
+          var layer = document.getElementById("layer_" + layer_events[n].filename);
+          switch (layer_events[n].op) {{
+              case "flush":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "create_delta":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "create_image":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "delete":
+                  layer.style.visibility = "visible";
+                  break;
+          }}
+      }}
+
+      var last_slider_pos = 0
+      var last_layer_event = 0
+
+      var moveSlider = function(new_pos) {{
+          if (new_pos > last_slider_pos) {{
+              while (last_layer_event < layer_events.length - 1) {{
+                  if (layer_events[last_layer_event + 1].time_rel > new_pos) {{
+                      break;
+                  }}
+                  last_layer_event += 1;
+                  redoLayerEvent(last_layer_event)
+              }}
+          }}
+          if (new_pos < last_slider_pos) {{
+              while (last_layer_event >= 0) {{
+                  if (layer_events[last_layer_event].time_rel <= new_pos) {{
+                      break;
+                  }}
+                  undoLayerEvent(last_layer_event)
+                  last_layer_event -= 1;
+              }}
+          }}
+          last_slider_pos = new_pos;
+          document.getElementById("debug_pos").textContent=new_pos;
+          if (last_layer_event >= 0) {{
+              document.getElementById("debug_layer_event").textContent=last_layer_event + " " + layer_events[last_layer_event].time_rel + " " + layer_events[last_layer_event].op;
+          }} else {{
+              document.getElementById("debug_layer_event").textContent="begin";
+          }}
+      }}
+    </script>
+
+    <div class="topbar">
+      <div class="slidercontainer">
+        <label for="time-slider">TIME</label>:
+        <input id="time-slider" class="slider" type="range" min="0" max="{last_time_rel}" value="0" oninput="moveSlider(this.value)"><br>
+
+        pos: <span id="debug_pos"></span><br>
+        event: <span id="debug_layer_event"></span><br>
+        gc: <span id="debug_gc_event"></span><br>
+      </div>
+
+      <button onclick="startAnimation()">Play</button>
+      <button onclick="stopAnimation()">Stop</button>
+
+      <svg class="legend">
+        <rect x=5 y=0 width=20 height=20 style="fill:rgb(128,128,128);stroke:rgb(0,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
+        <line x1=5 y1=30 x2=25 y2=30 style="fill:rgb(128,0,128);stroke:rgb(128,0,128);stroke-width:3;fill-opacity:1;stroke-opacity:1;"/>
+        <line x1=0 y1=40 x2=30 y2=40 style="fill:none;stroke:rgb(255,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
+      </svg>
+    </div>
+
+    <div class="main">
+{svg}
+    </div>
+  </body>
+</html>
+"#
+    )?;
+
+    Ok(())
+}
--- a/pageserver/compaction/tests/tests.rs
+++ b/pageserver/compaction/tests/tests.rs
@@ -0,0 +1,35 @@
+use pageserver_compaction::interface::CompactionLayer;
+use pageserver_compaction::simulator::MockTimeline;
+
+/// Test the extreme case that there are so many updates for a single key that
+/// even if we produce an extremely narrow delta layer, spanning just that one
+/// key, we still have too many records to fit in the target file size. We need
+/// to split in the LSN dimension too in that case.
+///
+/// TODO: The code to avoid this problem has not been implemented yet! So the
+/// assertion currently fails, but we need to make it not fail.
+#[ignore]
+#[tokio::test]
+async fn test_many_updates_for_single_key() {
+    let mut executor = MockTimeline::new();
+    executor.target_file_size = 10_000_000; // 10 MB
+
+    // Ingest 100 MB of updates to a single key.
+    //
+    // Each round ingests into the wide key range `0..100_000`, then into the
+    // single-key range `0..1`, and then runs compaction.
+    // NOTE(review): `1..1000` yields 999 rounds, not 1000 — confirm that this
+    // still matches the "100 MB" figure above.
+    for _ in 1..1000 {
+        executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
+        executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
+        executor.compact().await.unwrap();
+    }
+
+    // Check that all the layers are smaller than the target size (with some slop)
+    //
+    // Print every layer first, so that the complete list is in the test
+    // output before any assertion below can fail.
+    for l in executor.live_layers.iter() {
+        println!("layer {}: {}", l.short_id(), l.file_size());
+    }
+    for l in executor.live_layers.iter() {
+        // Upper bound: the "slop" allowed here is twice the target size.
+        assert!(l.file_size() < executor.target_file_size * 2);
+        // sanity check that none of the delta layers are stupidly small either
+        if l.is_delta() {
+            assert!(l.file_size() > executor.target_file_size / 2);
+        }
+    }
+}
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -12,7 +12,7 @@ use std::collections::BinaryHeap;
 use std::ops::Range;
 use std::{fs, str};

-use pageserver::page_cache::PAGE_SZ;
+use pageserver::page_cache::{self, PAGE_SZ};
 use pageserver::repository::{Key, KEY_SIZE};
 use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
@@ -100,13 +100,15 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
 async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
-    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0, ctx).await?;
+    let file = VirtualFile::open(path).await?;
+    let file_id = page_cache::next_file_id();
+    let block_reader = FileBlockReader::new(&file, file_id);
+    let summary_blk = block_reader.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
        actual_summary.index_root_blk,
-        file,
+        block_reader,
    );
    // min-heap (reserve space for one more element added before eviction)
    let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -61,13 +61,15 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    page_cache::init(100);
-    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0, ctx).await?;
+    let file = VirtualFile::open(path).await?;
+    let file_id = page_cache::next_file_id();
+    let block_reader = FileBlockReader::new(&file, file_id);
+    let summary_blk = block_reader.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
        actual_summary.index_root_blk,
-        &file,
+        &block_reader,
    );
    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
    let mut all = vec![];
@@ -83,7 +85,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
            ctx,
        )
        .await?;
-    let cursor = BlockCursor::new_fileblockreader(&file);
+    let cursor = BlockCursor::new_fileblockreader(&block_reader);
    for (k, v) in all {
        let value = cursor.read_blob(v.pos(), ctx).await?;
        println!("key:{} value_len:{}", k, value.len());
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -8,7 +8,7 @@ use utils::lsn::Lsn;
 use rand::prelude::*;
 use tokio::sync::Barrier;
 use tokio::task::JoinSet;
-use tracing::{debug, info, instrument};
+use tracing::{info, instrument};

 use std::collections::HashMap;
 use std::num::NonZeroUsize;
@@ -25,8 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats};
 pub(crate) struct Args {
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
-    #[clap(long, default_value = "localhost:64000")]
-    page_service_host_port: String,
+    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
+    page_service_connstring: String,
    #[clap(long)]
    pageserver_jwt: Option<String>,
    #[clap(long, default_value = "1")]
@@ -230,12 +230,9 @@ async fn client(
 ) {
    start_work_barrier.wait().await;

-    let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
-        &args.page_service_host_port,
-        args.pageserver_jwt.as_deref(),
-    ))
-    .await
-    .unwrap();
+    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
+        .await
+        .unwrap();

    while let Some(Work { lsn, gzip }) = work.recv().await {
        let start = Instant::now();
@@ -263,7 +260,7 @@ async fn client(
                }
            })
            .await;
-        debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
+        info!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
        let elapsed = start.elapsed();
        live_stats.inc();
        STATS.with(|stats| {
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -3,7 +3,6 @@ use utils::logging;

 /// Re-usable pieces of code that aren't CLI-specific.
 mod util {
-    pub(crate) mod connstring;
    pub(crate) mod request_stats;
    #[macro_use]
    pub(crate) mod tokio_thread_local_stats;
--- a/pageserver/pagebench/src/util/connstring.rs
+++ b/pageserver/pagebench/src/util/connstring.rs
@@ -1,8 +0,0 @@
-pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
-    let colon_and_jwt = if let Some(jwt) = jwt {
-        format!(":{jwt}") // TODO: urlescape
-    } else {
-        String::new()
-    };
-    format!("postgres://postgres{colon_and_jwt}@{host_port}")
-}
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -14,7 +14,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
+        (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
            format!(
                "JWT scope '{:?}' is ineligible for Pageserver auth",
                claims.scope
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -143,6 +143,7 @@ where
    ar: &'a mut Builder<&'b mut W>,
    buf: Vec<u8>,
    current_segment: Option<(SlruKind, u32)>,
+    total_blocks: usize,
 }

 impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
@@ -154,6 +155,7 @@ where
            ar,
            buf: Vec::new(),
            current_segment: None,
+            total_blocks: 0,
        }
    }

@@ -199,7 +201,8 @@ where
        let header = new_tar_header(&segname, self.buf.len() as u64)?;
        self.ar.append(&header, self.buf.as_slice()).await?;

-        trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
+        self.total_blocks += nblocks;
+        debug!("Added to basebackup slru {} relsize {}", segname, nblocks);

        self.buf.clear();

@@ -207,11 +210,15 @@ where
    }

    async fn finish(mut self) -> anyhow::Result<()> {
-        if self.current_segment.is_none() || self.buf.is_empty() {
-            return Ok(());
-        }
+        let res = if self.current_segment.is_none() || self.buf.is_empty() {
+            Ok(())
+        } else {
+            self.flush().await
+        };

-        self.flush().await
+        info!("Collected {} SLRU blocks", self.total_blocks);
+
+        res
    }
 }

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -20,7 +20,6 @@ use std::num::NonZeroUsize;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
-use toml_edit;
 use toml_edit::{Document, Item};

 use camino::{Utf8Path, Utf8PathBuf};
@@ -34,12 +33,13 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::timeline::GetVectoredImpl;
+use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
 use crate::virtual_file;
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
+    IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };

@@ -87,6 +87,10 @@ pub mod defaults {

    pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";

+    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
+
+    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
+
    ///
    /// Default built-in configuration file.
    ///
@@ -126,6 +130,10 @@ pub mod defaults {

 #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'

+#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
+
+#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -140,7 +148,6 @@ pub mod defaults {

 #min_resident_size_override = .. # in bytes
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
-#gc_feedback = false

 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
 #secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
@@ -204,9 +211,9 @@ pub struct PageServerConf {

    pub log_format: LogFormat,

-    /// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
-    /// does not limit tenants loaded in response to client I/O.  A lower value implicitly deprioritizes
-    /// loading such tenants, vs. other work in the system.
+    /// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach.
+    ///
+    /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
    pub concurrent_tenant_warmup: ConfigurableSemaphore,

    /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
@@ -263,6 +270,10 @@ pub struct PageServerConf {
    pub virtual_file_io_engine: virtual_file::IoEngineKind,

    pub get_vectored_impl: GetVectoredImpl,
+
+    pub max_vectored_read_bytes: MaxVectoredReadBytes,
+
+    pub validate_vectored_get: bool,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -351,6 +362,10 @@ struct PageServerConfigBuilder {
    virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,

    get_vectored_impl: BuilderValue<GetVectoredImpl>,
+
+    max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
+
+    validate_vectored_get: BuilderValue<bool>,
 }

 impl Default for PageServerConfigBuilder {
@@ -430,6 +445,10 @@ impl Default for PageServerConfigBuilder {
            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),

            get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
+            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
+                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
+            )),
+            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
        }
    }
 }
@@ -594,6 +613,14 @@ impl PageServerConfigBuilder {
        self.get_vectored_impl = BuilderValue::Set(value);
    }

+    /// Sets the `max_vectored_read_bytes` builder field to `value`.
+    ///
+    /// NOTE(review): despite the `get_` prefix this is a setter. The prefix
+    /// looks copied from the neighbouring `get_vectored_impl` setter, where
+    /// `get_vectored_impl` is the option's own name; consider renaming to
+    /// `max_vectored_read_bytes` together with its call site.
+    pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
+        self.max_vectored_read_bytes = BuilderValue::Set(value);
+    }
+
+    /// Sets the `validate_vectored_get` builder field to `value`.
+    ///
+    /// NOTE(review): despite the `get_` prefix this is a setter, not a getter;
+    /// consider renaming to `validate_vectored_get` together with its call
+    /// site.
+    pub fn get_validate_vectored_get(&mut self, value: bool) {
+        self.validate_vectored_get = BuilderValue::Set(value);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_warmup = self
            .concurrent_tenant_warmup
@@ -707,6 +734,12 @@ impl PageServerConfigBuilder {
            get_vectored_impl: self
                .get_vectored_impl
                .ok_or(anyhow!("missing get_vectored_impl"))?,
+            max_vectored_read_bytes: self
+                .max_vectored_read_bytes
+                .ok_or(anyhow!("missing max_vectored_read_bytes"))?,
+            validate_vectored_get: self
+                .validate_vectored_get
+                .ok_or(anyhow!("missing validate_vectored_get"))?,
        })
    }
 }
@@ -826,17 +859,6 @@ impl PageServerConf {
            .join(connection_id.to_string())
    }

-    /// Points to a place in pageserver's local directory,
-    /// where certain timeline's metadata file should be located.
-    pub fn metadata_path(
-        &self,
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-    ) -> Utf8PathBuf {
-        self.timeline_path(tenant_shard_id, timeline_id)
-            .join(METADATA_FILE_NAME)
-    }
-
    /// Turns storage remote path of a file into its local path.
    pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
        remote_path.with_base(&self.workdir)
@@ -964,6 +986,15 @@ impl PageServerConf {
                "get_vectored_impl" => {
                    builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
                }
+                "max_vectored_read_bytes" => {
+                    let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
+                    builder.get_max_vectored_read_bytes(
+                        MaxVectoredReadBytes(
+                            NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
+                }
+                "validate_vectored_get" => {
+                    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
+                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1039,6 +1070,11 @@ impl PageServerConf {
            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
            get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+            max_vectored_read_bytes: MaxVectoredReadBytes(
+                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
+                    .expect("Invalid default constant"),
+            ),
+            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
        }
    }
 }
@@ -1166,10 +1202,7 @@ impl ConfigurableSemaphore {

 #[cfg(test)]
 mod tests {
-    use std::{
-        fs,
-        num::{NonZeroU32, NonZeroUsize},
-    };
+    use std::{fs, num::NonZeroU32};

    use camino_tempfile::{tempdir, Utf8TempDir};
    use pageserver_api::models::EvictionPolicy;
@@ -1273,6 +1306,11 @@ background_task_maximum_delay = '334 s'
                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+                max_vectored_read_bytes: MaxVectoredReadBytes(
+                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
+                        .expect("Invalid default constant")
+                ),
+                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1338,6 +1376,11 @@ background_task_maximum_delay = '334 s'
                ingest_batch_size: 100,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+                max_vectored_read_bytes: MaxVectoredReadBytes(
+                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
+                        .expect("Invalid default constant")
+                ),
+                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -17,7 +17,7 @@ use tracing::*;
 use utils::id::NodeId;

 mod metrics;
-use metrics::MetricsKey;
+use crate::consumption_metrics::metrics::MetricsKey;
 mod disk_cache;
 mod upload;

--- a/pageserver/src/consumption_metrics/metrics/tests.rs
+++ b/pageserver/src/consumption_metrics/metrics/tests.rs
@@ -1,7 +1,5 @@
 use super::*;
 use std::collections::HashMap;
-use std::time::SystemTime;
-use utils::lsn::Lsn;

 #[test]
 fn startup_collected_timeline_metrics_before_advancing() {
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -2,10 +2,10 @@ use std::collections::HashMap;

 use futures::Future;
 use pageserver_api::{
-    control_api::{
+    shard::TenantShardId,
+    upcall_api::{
        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
    },
-    shard::TenantShardId,
 };
 use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -20,10 +20,9 @@ use remote_storage::{GenericRemoteStorage, RemotePath};
 use serde::Deserialize;
 use serde::Serialize;
 use thiserror::Error;
-use tokio;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
-use tracing::{self, debug, error};
+use tracing::{debug, error};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::generation::Generation;
 use utils::id::TimelineId;
@@ -234,7 +233,7 @@ impl DeletionHeader {
        let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
        let header_path = conf.deletion_header_path();
        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
-        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes)
+        VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes)
            .await
            .maybe_fatal_err("save deletion header")?;

@@ -325,7 +324,8 @@ impl DeletionList {
        let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);

        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
-        VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes)
+
+        VirtualFile::crashsafe_overwrite(path, temp_path, bytes)
            .await
            .maybe_fatal_err("save deletion list")
            .map_err(Into::into)
@@ -725,7 +725,7 @@ mod test {
    use camino::Utf8Path;
    use hex_literal::hex;
    use pageserver_api::shard::ShardIndex;
-    use std::{io::ErrorKind, time::Duration};
+    use std::io::ErrorKind;
    use tracing::info;

    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -734,10 +734,7 @@ mod test {
    use crate::{
        control_plane_client::RetryForeverError,
        repository::Key,
-        tenant::{
-            harness::TenantHarness, remote_timeline_client::remote_timeline_path,
-            storage_layer::DeltaFileName,
-        },
+        tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
    };

    use super::*;
@@ -1160,13 +1157,8 @@ mod test {
 pub(crate) mod mock {
    use tracing::info;

-    use crate::tenant::remote_timeline_client::remote_layer_path;
-
    use super::*;
-    use std::sync::{
-        atomic::{AtomicUsize, Ordering},
-        Arc,
-    };
+    use std::sync::atomic::{AtomicUsize, Ordering};

    pub struct ConsumerState {
        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -58,6 +58,7 @@ use utils::{completion, id::TimelineId};

 use crate::{
    config::PageServerConf,
+    metrics::disk_usage_based_eviction::METRICS,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        self,
@@ -65,7 +66,6 @@ use crate::{
        remote_timeline_client::LayerFileMetadata,
        secondary::SecondaryTenant,
        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
-        Timeline,
    },
 };

@@ -409,13 +409,23 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        "running disk usage based eviction due to pressure"
    );

-    let candidates =
+    let (candidates, collection_time) = {
+        let started_at = std::time::Instant::now();
        match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
            EvictionCandidates::Cancelled => {
                return Ok(IterationOutcome::Cancelled);
            }
-            EvictionCandidates::Finished(partitioned) => partitioned,
-        };
+            EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()),
+        }
+    };
+
+    METRICS.layers_collected.inc_by(candidates.len() as u64);
+
+    tracing::info!(
+        elapsed_ms = collection_time.as_millis(),
+        total_layers = candidates.len(),
+        "collection completed"
+    );

    // Debug-log the list of candidates
    let now = SystemTime::now();
@@ -446,9 +456,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.

-    let selection = select_victims(&candidates, usage_pre);
+    let (evicted_amount, usage_planned) =
+        select_victims(&candidates, usage_pre).into_amount_and_planned();

-    let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
+    METRICS.layers_selected.inc_by(evicted_amount as u64);

    // phase2: evict layers

@@ -477,9 +488,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            if let Some(next) = next {
                match next {
                    Ok(Ok(file_size)) => {
+                        METRICS.layers_evicted.inc();
                        usage_assumed.add_available_bytes(file_size);
                    }
-                    Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
+                    Ok(Err((
+                        file_size,
+                        EvictionError::NotFound
+                        | EvictionError::Downloaded
+                        | EvictionError::Timeout,
+                    ))) => {
                        evictions_failed.file_sizes += file_size;
                        evictions_failed.count += 1;
                    }
@@ -495,7 +512,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

            // calling again when consumed_all is fine as evicted is fused.
            let Some((_partition, candidate)) = evicted.next() else {
-                consumed_all = true;
+                if !consumed_all {
+                    tracing::info!("all evictions started, waiting");
+                    consumed_all = true;
+                }
                continue;
            };

@@ -503,11 +523,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                EvictionLayer::Attached(layer) => {
                    let file_size = layer.layer_desc().file_size;
                    js.spawn(async move {
-                        layer
-                            .evict_and_wait()
-                            .await
-                            .map(|()| file_size)
-                            .map_err(|e| (file_size, e))
+                        // have a low eviction waiting timeout because our LRU calculations go stale fast;
+                        // also individual layer evictions could hang because of bugs and we do not want to
+                        // pause disk_usage_based_eviction for such.
+                        let timeout = std::time::Duration::from_secs(5);
+
+                        match layer.evict_and_wait(timeout).await {
+                            Ok(()) => Ok(file_size),
+                            Err(e) => Err((file_size, e)),
+                        }
                    });
                }
                EvictionLayer::Secondary(layer) => {
@@ -529,6 +553,30 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        (usage_assumed, evictions_failed)
    };

+    let started_at = std::time::Instant::now();
+
+    let evict_layers = async move {
+        let mut evict_layers = std::pin::pin!(evict_layers);
+
+        let maximum_expected = std::time::Duration::from_secs(10);
+
+        let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await;
+        let tuple = if let Ok(tuple) = res {
+            tuple
+        } else {
+            let elapsed = started_at.elapsed();
+            tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing");
+            evict_layers.await
+        };
+
+        let elapsed = started_at.elapsed();
+        tracing::info!(elapsed_ms = elapsed.as_millis(), "completed");
+        tuple
+    };
+
+    let evict_layers =
+        evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount));
+
    let (usage_assumed, evictions_failed) = tokio::select! {
        tuple = evict_layers => { tuple },
        _ = cancel.cancelled() => {
@@ -763,6 +811,8 @@ async fn collect_eviction_candidates(
    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
+    const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
+
    // get a snapshot of the list of tenants
    let tenants = tenant::mgr::list_tenants()
        .await
@@ -791,6 +841,8 @@ async fn collect_eviction_candidates(
            continue;
        }

+        let started_at = std::time::Instant::now();
+
        // collect layers from all timelines in this tenant
        //
        // If one of the timelines becomes `!is_active()` during the iteration,
@@ -805,6 +857,7 @@ async fn collect_eviction_candidates(
            }
            let info = tl.get_local_layers_for_disk_usage_eviction().await;
            debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
+
            tenant_candidates.extend(info.resident_layers.into_iter());
            max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));

@@ -870,7 +923,25 @@ async fn collect_eviction_candidates(
                    (partition, candidate)
                });

+        METRICS
+            .tenant_layer_count
+            .observe(tenant_candidates.len() as f64);
+
        candidates.extend(tenant_candidates);
+
+        let elapsed = started_at.elapsed();
+        METRICS
+            .tenant_collection_time
+            .observe(elapsed.as_secs_f64());
+
+        if elapsed > LOG_DURATION_THRESHOLD {
+            tracing::info!(
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                elapsed_ms = elapsed.as_millis(),
+                "collection took longer than threshold"
+            );
+        }
    }

    // Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -885,11 +956,11 @@ async fn collect_eviction_candidates(
        },
    );

-    for secondary_tenant in secondary_tenants {
+    for tenant in secondary_tenants {
        // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
        // to prevent repeated disk usage based evictions from completely draining less often
        // updating secondaries.
-        let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
+        let (mut layer_info, total_layers) = tenant.get_layers_for_eviction();

        debug_assert!(
            total_layers >= layer_info.resident_layers.len(),
@@ -897,6 +968,8 @@ async fn collect_eviction_candidates(
            layer_info.resident_layers.len()
        );

+        let started_at = std::time::Instant::now();
+
        layer_info
            .resident_layers
            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
@@ -918,9 +991,27 @@ async fn collect_eviction_candidates(
                    )
                });

+        METRICS
+            .tenant_layer_count
+            .observe(tenant_candidates.len() as f64);
        candidates.extend(tenant_candidates);

        tokio::task::yield_now().await;
+
+        let elapsed = started_at.elapsed();
+
+        METRICS
+            .tenant_collection_time
+            .observe(elapsed.as_secs_f64());
+
+        if elapsed > LOG_DURATION_THRESHOLD {
+            tracing::info!(
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                elapsed_ms = elapsed.as_millis(),
+                "collection took longer than threshold"
+            );
+        }
    }

    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
@@ -997,30 +1088,6 @@ impl<U: Usage> VictimSelection<U> {
    }
 }

-struct TimelineKey(Arc<Timeline>);
-
-impl PartialEq for TimelineKey {
-    fn eq(&self, other: &Self) -> bool {
-        Arc::ptr_eq(&self.0, &other.0)
-    }
-}
-
-impl Eq for TimelineKey {}
-
-impl std::hash::Hash for TimelineKey {
-    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        Arc::as_ptr(&self.0).hash(state);
-    }
-}
-
-impl std::ops::Deref for TimelineKey {
-    type Target = Timeline;
-
-    fn deref(&self) -> &Self::Target {
-        self.0.as_ref()
-    }
-}
-
 /// A totally ordered f32 subset we can use with sorting functions.
 pub(crate) mod finite_f32 {

--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -567,114 +567,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"
-
-  /v1/tenant/{tenant_id}/attach:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-    post:
-      description: |
-        Schedules attach operation to happen in the background for the given tenant.
-        As soon as the caller sends this request, it must assume the pageserver
-        starts writing to the tenant's S3 state unless it receives one of the
-        distinguished errors below that state otherwise.
-
-        If a client receives a not-distinguished response, e.g., a network timeout,
-        it MUST retry the /attach request and poll again for the tenant's
-        attachment status.
-
-        After the client has received a 202, it MUST poll the tenant's
-        attachment status (field `attachment_status`) to reach state `attached`.
-        If the `attachment_status` is missing, the client MUST retry the `/attach`
-        request (goto previous paragraph). This is a robustness measure in case the tenant
-        status endpoint is buggy, but the attach operation is ongoing.
-
-        There is no way to cancel an in-flight request.
-
-        In any case, the client
-        * MUST NOT ASSUME that the /attach request has been lost in the network,
-        * MUST NOT ASSUME that the request has been lost, based on the observation
-          that a subsequent tenant status request returns 404. The request may
-          still be in flight. It must be retried.
-
-        The client SHOULD supply a `TenantConfig` for the tenant in the request body.
-        Settings specified in the config override the pageserver's defaults.
-        It is guaranteed that the config settings are applied before the pageserver
-        starts operating on the tenant. E.g., if the config specifies a specific
-        PITR interval for a tenant, then that setting will be in effect before the
-        pageserver starts the garbage collection loop. This enables a client to
-        guarantee a specific PITR setting across detach/attach cycles.
-        The pageserver will reject the request if it cannot parse the config, or
-        if there are any unknown fields in it.
-
-        If the client does not supply a config, the pageserver will use its defaults.
-        This behavior is deprecated: https://github.com/neondatabase/neon/issues/4282
-      requestBody:
-        required: false
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/TenantAttachRequest"
-      responses:
-        "202":
-          description: Tenant attaching scheduled
-        "400":
-          description: Bad Request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Timeline not found
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "409":
-          description: |
-            The tenant is already known to Pageserver in some way,
-            and hence this `/attach` call has been rejected.
-
-            Some examples of how this can happen:
-            - tenant was created on this pageserver
-            - tenant attachment was started by an earlier call to `/attach`.
-
-            Callers should poll the tenant status's `attachment_status` field,
-            like for status 202. See the longer description for `POST /attach`
-            for details.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
-
  /v1/tenant/{tenant_id}/location_config:
    parameters:
      - name: tenant_id
@@ -687,6 +579,12 @@ paths:
        required: false
        schema:
          type: integer
+      - name: lazy
+        in: query
+        required: false
+        schema:
+          type: boolean
+        description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default.
    put:
      description: |
        Configures a _tenant location_, that is how a particular pageserver handles
@@ -770,66 +668,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-
-  /v1/tenant/{tenant_id}/detach:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: detach_ignored
-        in: query
-        required: false
-        schema:
-          type: boolean
-        description: |
-          When true, allow to detach a tenant which state is ignored.
-    post:
-      description: |
-        Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
-        Files on the remote storage are not affected.
-      responses:
-        "200":
-          description: Tenant detached
-        "400":
-          description: Error when no tenant id found in path parameters
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Tenant not found
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
-
  /v1/tenant/{tenant_id}/ignore:
    parameters:
      - name: tenant_id
@@ -1464,16 +1302,6 @@ components:
        generation:
          type: integer
          description: Attachment generation number.
-    TenantAttachRequest:
-      type: object
-      required:
-        - config
-      properties:
-        config:
-          $ref: '#/components/schemas/TenantConfig'
-        generation:
-          type: integer
-          description: Attachment generation number.
    TenantConfigRequest:
      allOf:
        - $ref: '#/components/schemas/TenantConfig'
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -661,9 +661,14 @@ async fn timeline_detail_handler(

    // Logical size calculation needs downloading.
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let state = get_state(&request);

    let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id, false)?;
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -696,6 +701,7 @@ async fn get_lsn_by_timestamp_handler(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);

    if !tenant_shard_id.is_zero() {
        // Requires SLRU contents, which are only stored on shard zero
@@ -712,7 +718,10 @@ async fn get_lsn_by_timestamp_handler(
    let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    let result = timeline
        .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
        .await?;
@@ -743,6 +752,7 @@ async fn get_timestamp_of_lsn_handler(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);

    if !tenant_shard_id.is_zero() {
        // Requires SLRU contents, which are only stored on shard zero
@@ -759,7 +769,9 @@ async fn get_timestamp_of_lsn_handler(
        .map_err(ApiError::BadRequest)?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;

    match result {
@@ -804,13 +816,7 @@ async fn tenant_attach_handler(

    let tenant = state
        .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            None,
-            SpawnMode::Normal,
-            &ctx,
-        )
+        .upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx)
        .await?;

    let Some(tenant) = tenant else {
@@ -1159,10 +1165,13 @@ async fn layer_map_info_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let reset: LayerAccessStatsReset =
        parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);
+    let state = get_state(&request);

    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    let layer_map_info = timeline.layer_map_info(reset).await;

    json_response(StatusCode::OK, layer_map_info)
@@ -1176,8 +1185,11 @@ async fn layer_download_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let layer_file_name = get_request_param(&request, "layer_file_name")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    let downloaded = timeline
        .download_layer(layer_file_name)
        .await
@@ -1201,8 +1213,11 @@ async fn evict_timeline_layer_handler(
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let layer_file_name = get_request_param(&request, "layer_file_name")?;
+    let state = get_state(&request);

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    let evicted = timeline
        .evict_layer(layer_file_name)
        .await
@@ -1397,6 +1412,7 @@ async fn put_tenant_location_config_handler(

    let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
    let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
+    let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false);
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
@@ -1427,15 +1443,17 @@ async fn put_tenant_location_config_handler(
    let location_conf =
        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

+    // lazy==true queues the tenant up for activation; as usual, it jumps the queue when a compute
+    // connects, similar to the ordering at startup.
+    let spawn_mode = if lazy {
+        tenant::SpawnMode::Lazy
+    } else {
+        tenant::SpawnMode::Eager
+    };
+
    let attached = state
        .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            flush,
-            tenant::SpawnMode::Normal,
-            &ctx,
-        )
+        .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
        .await?
        .is_some();

@@ -1612,13 +1630,19 @@ async fn timeline_compact_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

+    let state = get_state(&request);
+
    let mut flags = EnumSet::empty();
    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
        flags |= CompactFlags::ForceRepartition;
    }
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
+        flags |= CompactFlags::ForceImageLayerCreation;
+    }
+
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
        timeline
            .compact(&cancel, flags, &ctx)
            .await
@@ -1638,13 +1662,19 @@ async fn timeline_checkpoint_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

+    let state = get_state(&request);
+
    let mut flags = EnumSet::empty();
    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
        flags |= CompactFlags::ForceRepartition;
    }
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
+        flags |= CompactFlags::ForceImageLayerCreation;
+    }
+
    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
        timeline
            .freeze_and_flush()
            .await
@@ -1669,7 +1699,11 @@ async fn timeline_download_remote_layers_handler_post(
    let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let state = get_state(&request);
+
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    match timeline.spawn_download_all_remote_layers(body).await {
        Ok(st) => json_response(StatusCode::ACCEPTED, st),
        Err(st) => json_response(StatusCode::CONFLICT, st),
@@ -1683,8 +1717,11 @@ async fn timeline_download_remote_layers_handler_get(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let state = get_state(&request);

-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
    let info = timeline
        .get_download_all_remote_layers_task_info()
        .context("task never started since last pageserver process start")
@@ -1733,6 +1770,7 @@ async fn getpage_at_lsn_handler(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);

    struct Key(crate::repository::Key);

@@ -1751,7 +1789,7 @@ async fn getpage_at_lsn_handler(

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;

        let page = timeline.get(key.0, lsn, &ctx).await?;

@@ -1774,12 +1812,13 @@ async fn timeline_collect_keyspace(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);

    let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
        let keys = timeline
            .collect_keyspace(at_lsn, &ctx)
@@ -1795,10 +1834,14 @@ async fn timeline_collect_keyspace(
 }

 async fn active_timeline_of_active_tenant(
+    tenant_manager: &TenantManager,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+    let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
+
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
    tenant
        .get_timeline(timeline_id, true)
        .map_err(|e| ApiError::NotFound(e.into()))
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -169,15 +169,6 @@ pub fn is_delete_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
 }

-fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
-    if let Some(e) = e.io_error() {
-        if e.kind() == std::io::ErrorKind::NotFound {
-            return true;
-        }
-    }
-    false
-}
-
 /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
 /// blocking.
 ///
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -642,26 +642,6 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|
    .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
 });

-// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
-// or in testing they estimate how much we would upload if we did.
-static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_created_persistent_files_total",
-        "Number of files created that are meant to be uploaded to cloud storage",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
-static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_written_persistent_bytes_total",
-        "Total bytes written that are meant to be uploaded to cloud storage",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_eviction_iteration_duration_seconds_global",
@@ -1802,8 +1782,6 @@ pub(crate) struct TimelineMetrics {
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
-    pub num_persistent_files_created: IntCounter,
-    pub persistent_bytes_written: IntCounter,
    pub evictions: IntCounter,
    pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
 }
@@ -1885,12 +1863,6 @@ impl TimelineMetrics {
        };
        let directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>> =
            Lazy::new(Box::new(directory_entries_count_gauge_closure));
-        let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
-        let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
        let evictions = EVICTIONS
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
@@ -1912,8 +1884,6 @@ impl TimelineMetrics {
            resident_physical_size_gauge,
            current_logical_size_gauge,
            directory_entries_count_gauge,
-            num_persistent_files_created,
-            persistent_bytes_written,
            evictions,
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
@@ -1923,8 +1893,6 @@ impl TimelineMetrics {

    pub(crate) fn record_new_file_metrics(&self, sz: u64) {
        self.resident_physical_size_add(sz);
-        self.num_persistent_files_created.inc_by(1);
-        self.persistent_bytes_written.inc_by(sz);
    }

    pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
@@ -1947,20 +1915,16 @@ impl Drop for TimelineMetrics {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
-        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
-            let _ =
-                RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        }
-        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
-            let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+            let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        }
-        let _ =
-            NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
-        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
-        let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);

        self.evictions_with_low_residence_duration
            .write()
@@ -2509,6 +2473,64 @@ pub(crate) mod tenant_throttling {
    }
 }

+pub(crate) mod disk_usage_based_eviction {
+    use super::*;
+    /// Histograms and counters for disk usage based eviction, shared through `METRICS`.
+    pub(crate) struct Metrics {
+        pub(crate) tenant_collection_time: Histogram, // seconds spent collecting layers from one tenant
+        pub(crate) tenant_layer_count: Histogram, // number of layers gathered from one tenant
+        pub(crate) layers_collected: IntCounter, // counter: layers collected
+        pub(crate) layers_selected: IntCounter, // counter: layers selected
+        pub(crate) layers_evicted: IntCounter, // counter: layers successfully evicted
+    }
+    /// Registers every metric below; each `unwrap` panics if a registration fails.
+    impl Default for Metrics {
+        fn default() -> Self {
+            let tenant_collection_time = register_histogram!(
+                "pageserver_disk_usage_based_eviction_tenant_collection_seconds",
+                "Time spent collecting layers from a tenant -- not normalized by collected layer amount",
+                vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0] // buckets in seconds: 1 ms up to 10 s
+            )
+            .unwrap();
+
+            let tenant_layer_count = register_histogram!(
+                "pageserver_disk_usage_based_eviction_tenant_collected_layers",
+                "Amount of layers gathered from a tenant",
+                vec![5.0, 50.0, 500.0, 5000.0, 50000.0] // buckets in layers per tenant
+            )
+            .unwrap();
+
+            let layers_collected = register_int_counter!(
+                "pageserver_disk_usage_based_eviction_collected_layers_total",
+                "Amount of layers collected"
+            )
+            .unwrap();
+
+            let layers_selected = register_int_counter!(
+                "pageserver_disk_usage_based_eviction_select_layers_total",
+                "Amount of layers selected"
+            )
+            .unwrap();
+
+            let layers_evicted = register_int_counter!(
+                "pageserver_disk_usage_based_eviction_evicted_layers_total",
+                "Amount of layers successfully evicted"
+            )
+            .unwrap();
+
+            Self {
+                tenant_collection_time,
+                tenant_layer_count,
+                layers_collected,
+                layers_selected,
+                layers_evicted,
+            }
+        }
+    }
+    /// Built on first access via `Metrics::default`; `preinitialize_metrics` forces it with `Lazy::force`.
+    pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
+}
+
 pub fn preinitialize_metrics() {
    // Python tests need these and on some we do alerting.
    //
@@ -2543,6 +2565,7 @@ pub fn preinitialize_metrics() {
    Lazy::force(&TENANT_MANAGER);

    Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
+    Lazy::force(&disk_usage_based_eviction::METRICS);

    // countervecs
    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -73,7 +73,6 @@

 use std::{
    collections::{hash_map::Entry, HashMap},
-    convert::TryInto,
    sync::{
        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
        Arc, Weak,
@@ -262,7 +261,9 @@ pub struct PageCache {
    size_metrics: &'static PageCacheSizeMetrics,
 }

-struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
+struct PinnedSlotsPermit {
+    _permit: tokio::sync::OwnedSemaphorePermit, // underscore-prefixed: presumably kept only to hold the permit until drop -- TODO confirm
+}

 ///
 /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
@@ -558,9 +559,9 @@ impl PageCache {
        )
        .await
        {
-            Ok(res) => Ok(PinnedSlotsPermit(
-                res.expect("this semaphore is never closed"),
-            )),
+            Ok(res) => Ok(PinnedSlotsPermit {
+                _permit: res.expect("this semaphore is never closed"),
+            }),
            Err(_timeout) => {
                crate::metrics::page_cache_errors_inc(
                    crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -27,7 +27,7 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::ShardIndex;
 use pageserver_api::shard::ShardNumber;
-use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
+use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
@@ -44,7 +44,6 @@ use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
-use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::sync::gate::GateGuard;
@@ -1115,7 +1114,10 @@ impl PageServerHandler {
        ctx: &RequestContext,
    ) -> Result<PagestreamBeMessage, PageStreamError> {
        let timeline = match self.get_cached_timeline_for_page(req) {
-            Ok(tl) => tl,
+            Ok(tl) => {
+                set_tracing_field_shard_id(tl);
+                tl
+            }
            Err(key) => {
                match self
                    .load_timeline_for_page(tenant_id, timeline_id, key)
@@ -1140,9 +1142,6 @@ impl PageServerHandler {
            }
        };

-        // load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
-        set_tracing_field_shard_id(timeline);
-
        let _timer = timeline
            .query_metrics
            .start_timer(metrics::SmgrQueryType::GetPageAtLsn);
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -15,7 +15,6 @@ use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
    dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
    rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -36,6 +35,8 @@ use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::{bin_ser::BeSer, lsn::Lsn};

+const MAX_AUX_FILE_DELTAS: usize = 1024;
+
 #[derive(Debug)]
 pub enum LsnForTimestamp {
    /// Found commits both before and after the given timestamp
@@ -157,7 +158,6 @@ impl Timeline {
            pending_updates: HashMap::new(),
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
-            pending_aux_files: None,
            pending_directory_entries: Vec::new(),
            lsn,
        }
@@ -873,11 +873,6 @@ pub struct DatadirModification<'a> {
    pending_deletions: Vec<(Range<Key>, Lsn)>,
    pending_nblocks: i64,

-    // If we already wrote any aux file changes in this modification, stash the latest dir.  If set,
-    // [`Self::put_file`] may assume that it is safe to emit a delta rather than checking
-    // if AUX_FILES_KEY is already set.
-    pending_aux_files: Option<AuxFilesDirectory>,
-
    /// For special "directory" keys that store key-value maps, track the size of the map
    /// if it was updated in this modification.
    pending_directory_entries: Vec<(DirectoryKind, usize)>,
@@ -1401,19 +1396,28 @@ impl<'a> DatadirModification<'a> {
            Some(Bytes::copy_from_slice(content))
        };

-        let dir = if let Some(mut dir) = self.pending_aux_files.take() {
+        let n_files;
+        let mut aux_files = self.tline.aux_files.lock().await;
+        if let Some(mut dir) = aux_files.dir.take() {
            // We already updated aux files in `self`: emit a delta and update our latest value
-
-            self.put(
-                AUX_FILES_KEY,
-                Value::WalRecord(NeonWalRecord::AuxFile {
-                    file_path: file_path.clone(),
-                    content: content.clone(),
-                }),
-            );
-
-            dir.upsert(file_path, content);
-            dir
+            dir.upsert(file_path.clone(), content.clone());
+            n_files = dir.files.len();
+            if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
+                self.put(
+                    AUX_FILES_KEY,
+                    Value::Image(Bytes::from(
+                        AuxFilesDirectory::ser(&dir).context("serialize")?,
+                    )),
+                );
+                aux_files.n_deltas = 0;
+            } else {
+                self.put(
+                    AUX_FILES_KEY,
+                    Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
+                );
+                aux_files.n_deltas += 1;
+            }
+            aux_files.dir = Some(dir);
        } else {
            // Check if the AUX_FILES_KEY is initialized
            match self.get(AUX_FILES_KEY, ctx).await {
@@ -1428,7 +1432,8 @@ impl<'a> DatadirModification<'a> {
                        }),
                    );
                    dir.upsert(file_path, content);
-                    dir
+                    n_files = dir.files.len();
+                    aux_files.dir = Some(dir);
                }
                Err(
                    e @ (PageReconstructError::AncestorStopping(_)
@@ -1455,14 +1460,14 @@ impl<'a> DatadirModification<'a> {
                            AuxFilesDirectory::ser(&dir).context("serialize")?,
                        )),
                    );
-                    dir
+                    n_files = 1;
+                    aux_files.dir = Some(dir);
                }
            }
-        };
+        }

        self.pending_directory_entries
-            .push((DirectoryKind::AuxFiles, dir.files.len()));
-        self.pending_aux_files = Some(dir);
+            .push((DirectoryKind::AuxFiles, n_files));

        Ok(())
    }
@@ -1493,7 +1498,7 @@ impl<'a> DatadirModification<'a> {
            return Ok(());
        }

-        let mut writer = self.tline.writer().await;
+        let writer = self.tline.writer().await;

        // Flush relation and  SLRU data blocks, keep metadata.
        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
@@ -1532,23 +1537,13 @@ impl<'a> DatadirModification<'a> {
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
-        let mut writer = self.tline.writer().await;
+        let writer = self.tline.writer().await;

        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

        if !self.pending_updates.is_empty() {
-            let prev_pending_updates = std::mem::take(&mut self.pending_updates);
-
-            // The put_batch call below expects expects the inputs to be sorted by Lsn,
-            // so we do that first.
-            let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = prev_pending_updates
-                .into_iter()
-                .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
-                .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
-                .collect();
-
-            writer.put_batch(lsn_ordered_batch, ctx).await?;
+            writer.put_batch(&self.pending_updates, ctx).await?;
            self.pending_updates.clear();
        }

--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -37,7 +37,6 @@ impl Value {
 mod test {
    use super::*;

-    use bytes::Bytes;
    use utils::bin_ser::BeSer;

    macro_rules! roundtrip {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -29,7 +29,6 @@ use remote_storage::TimeoutOrCancel;
 use std::fmt;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
-use tokio::runtime::Handle;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
@@ -110,7 +109,6 @@ pub use pageserver_api::models::TenantState;
 use tokio::sync::Semaphore;

 static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
-use toml_edit;
 use utils::{
    crashsafe,
    generation::Generation,
@@ -146,6 +144,7 @@ macro_rules! pausable_failpoint {

 pub mod blob_io;
 pub mod block_io;
+pub mod vectored_blob_io;

 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
@@ -172,9 +171,6 @@ pub(crate) mod throttle;
 pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};

-// re-export for use in remote_timeline_client.rs
-pub use crate::tenant::metadata::save_metadata;
-
 // re-export for use in walreceiver
 pub use crate::tenant::timeline::WalReceiverInfo;

@@ -230,7 +226,11 @@ pub(crate) struct TenantPreload {
 /// When we spawn a tenant, there is a special mode for tenant creation that
 /// avoids trying to read anything from remote storage.
 pub(crate) enum SpawnMode {
-    Normal,
+    /// Activate as soon as possible
+    Eager,
+    /// Lazy activation in the background, with the option to skip the queue if the need comes up
+    Lazy,
+    /// Tenant has been created during the lifetime of this process
    Create,
 }

@@ -703,41 +703,37 @@ impl Tenant {
                    .and_then(|x| x.initial_tenant_load_remote.take());

                enum AttachType<'a> {
-                    // During pageserver startup, we are attaching this tenant lazily in the background
-                    Warmup(tokio::sync::SemaphorePermit<'a>),
-                    // During pageserver startup, we are attaching this tenant as soon as we can,
-                    // because a client tried to access it.
+                    /// We are attaching this tenant lazily in the background.
+                    Warmup {
+                        _permit: tokio::sync::SemaphorePermit<'a>,
+                        during_startup: bool
+                    },
+                    /// We are attaching this tenant as soon as we can, because for example an
+                    /// endpoint tried to access it.
                    OnDemand,
-                    // During normal operations after startup, we are attaching a tenant.
+                    /// During normal operations after startup, we are attaching a tenant, and
+                    /// eager attach was requested.
                    Normal,
                }

-                // Before doing any I/O, wait for either or:
-                // - A client to attempt to access to this tenant (on-demand loading)
-                // - A permit to become available in the warmup semaphore (background warmup)
-                //
-                // Some-ness of init_order is how we know if we're attaching during startup or later
-                // in process lifetime.
-                let attach_type = if init_order.is_some() {
+                let attach_type = if matches!(mode, SpawnMode::Lazy) {
+                    // Before doing any I/O, wait for at least one of:
+                    // - A client attempting to access to this tenant (on-demand loading)
+                    // - A permit becoming available in the warmup semaphore (background warmup)
+
                    tokio::select!(
-                        _ = tenant_clone.activate_now_sem.acquire() => {
+                        permit = tenant_clone.activate_now_sem.acquire() => {
+                            let _ = permit.expect("activate_now_sem is never closed");
                            tracing::info!("Activating tenant (on-demand)");
                            AttachType::OnDemand
                        },
-                        permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
-                            match permit_result {
-                                Ok(p) => {
-                                    tracing::info!("Activating tenant (warmup)");
-                                    AttachType::Warmup(p)
-                                }
-                                Err(_) => {
-                                    // This is unexpected: the warmup semaphore should stay alive
-                                    // for the lifetime of init_order.  Log a warning and proceed.
-                                    tracing::warn!("warmup_limit semaphore unexpectedly closed");
-                                    AttachType::Normal
-                                }
+                        permit = conf.concurrent_tenant_warmup.inner().acquire() => {
+                            let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed");
+                            tracing::info!("Activating tenant (warmup)");
+                            AttachType::Warmup {
+                                _permit,
+                                during_startup: init_order.is_some()
                            }
-
                        }
                        _ = tenant_clone.cancel.cancelled() => {
                            // This is safe, but should be pretty rare: it is interesting if a tenant
@@ -752,6 +748,8 @@ impl Tenant {
                        },
                    )
                } else {
+                    // SpawnMode::{Create,Eager} always cause jumping ahead of the
+                    // concurrent_tenant_warmup queue
                    AttachType::Normal
                };

@@ -759,7 +757,7 @@ impl Tenant {
                    (SpawnMode::Create, _) => {
                        None
                    },
-                    (SpawnMode::Normal, Some(remote_storage)) => {
+                    (SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
                        let _preload_timer = TENANT.preload.start_timer();
                        let res = tenant_clone
                            .preload(remote_storage, task_mgr::shutdown_token())
@@ -772,7 +770,7 @@ impl Tenant {
                            }
                        }
                    }
-                    (SpawnMode::Normal, None) => {
+                    (_, None) => {
                        let _preload_timer = TENANT.preload.start_timer();
                        None
                    }
@@ -831,7 +829,7 @@ impl Tenant {
                let attached = {
                    let _attach_timer = match mode {
                        SpawnMode::Create => None,
-                        SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
+                        SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()),
                    };
                    tenant_clone.attach(preload, mode, &ctx).await
                };
@@ -853,7 +851,7 @@ impl Tenant {
                // It also prevents the warmup proccess competing with the concurrency limit on
                // logical size calculations: if logical size calculation semaphore is saturated,
                // then warmup will wait for that before proceeding to the next tenant.
-                if let AttachType::Warmup(_permit) = attach_type {
+                if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) {
                    let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
                    tracing::info!("Waiting for initial logical sizes while warming up...");
                    while futs.next().await.is_some() {}
@@ -926,7 +924,7 @@ impl Tenant {
                deleting: false,
                timelines: HashMap::new(),
            },
-            (None, SpawnMode::Normal) => {
+            (None, _) => {
                anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
            }
        };
@@ -1151,17 +1149,6 @@ impl Tenant {
            None
        };

-        // timeline loading after attach expects to find metadata file for each metadata
-        save_metadata(
-            self.conf,
-            &self.tenant_shard_id,
-            &timeline_id,
-            &remote_metadata,
-        )
-        .await
-        .context("save_metadata")
-        .map_err(LoadLocalTimelineError::Load)?;
-
        self.timeline_init_and_sync(
            timeline_id,
            resources,
@@ -2396,7 +2383,7 @@ impl Tenant {
            self.tenant_shard_id,
            self.generation,
            self.shard_identity,
-            self.walredo_mgr.as_ref().map(Arc::clone),
+            self.walredo_mgr.clone(),
            resources,
            pg_version,
            state,
@@ -2588,19 +2575,24 @@ impl Tenant {
        legacy_config_path: &Utf8Path,
        location_conf: &LocationConf,
    ) -> anyhow::Result<()> {
-        // Forward compat: write out an old-style configuration that old versions can read, in case we roll back
-        Self::persist_tenant_config_legacy(
-            tenant_shard_id,
-            legacy_config_path,
-            &location_conf.tenant_conf,
-        )
-        .await?;
-
        if let LocationMode::Attached(attach_conf) = &location_conf.mode {
-            // Once we use LocationMode, generations are mandatory.  If we aren't using generations,
-            // then drop out after writing legacy-style config.
+            // The modern-style LocationConf config file requires a generation to be set. In case someone
+            // is running a pageserver without the infrastructure to set generations, write out the legacy-style
+            // config file that only contains TenantConf.
+            //
+            // This will eventually be removed in https://github.com/neondatabase/neon/issues/5388
+
            if attach_conf.generation.is_none() {
-                tracing::debug!("Running without generations, not writing new-style LocationConf");
+                tracing::info!(
+                    "Running without generations, writing legacy-style tenant config file"
+                );
+                Self::persist_tenant_config_legacy(
+                    tenant_shard_id,
+                    legacy_config_path,
+                    &location_conf.tenant_conf,
+                )
+                .await?;
+
                return Ok(());
            }
        }
@@ -2623,17 +2615,10 @@ impl Tenant {

        let tenant_shard_id = *tenant_shard_id;
        let config_path = config_path.to_owned();
-        tokio::task::spawn_blocking(move || {
-            Handle::current().block_on(async move {
-                let conf_content = conf_content.into_bytes();
-                VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content)
-                    .await
-                    .with_context(|| {
-                        format!("write tenant {tenant_shard_id} config to {config_path}")
-                    })
-            })
-        })
-        .await??;
+        let conf_content = conf_content.into_bytes();
+        VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content)
+            .await
+            .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?;

        Ok(())
    }
@@ -2660,17 +2645,12 @@ impl Tenant {

        let tenant_shard_id = *tenant_shard_id;
        let target_config_path = target_config_path.to_owned();
-        tokio::task::spawn_blocking(move || {
-            Handle::current().block_on(async move {
-                let conf_content = conf_content.into_bytes();
-                VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
-                    .await
-                    .with_context(|| {
-                        format!("write tenant {tenant_shard_id} config to {target_config_path}")
-                    })
-            })
-        })
-        .await??;
+        let conf_content = conf_content.into_bytes();
+        VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content)
+            .await
+            .with_context(|| {
+                format!("write tenant {tenant_shard_id} config to {target_config_path}")
+            })?;
        Ok(())
    }

@@ -3293,10 +3273,7 @@ impl Tenant {

        timeline_struct.init_empty_layer_map(start_lsn);

-        if let Err(e) = self
-            .create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
-            .await
-        {
+        if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await {
            error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
            cleanup_timeline_directory(uninit_mark);
            return Err(e);
@@ -3313,26 +3290,13 @@ impl Tenant {
        ))
    }

-    async fn create_timeline_files(
-        &self,
-        timeline_path: &Utf8Path,
-        new_timeline_id: &TimelineId,
-        new_metadata: &TimelineMetadata,
-    ) -> anyhow::Result<()> {
+    async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
        crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;

        fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
            anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
        });

-        save_metadata(
-            self.conf,
-            &self.tenant_shard_id,
-            new_timeline_id,
-            new_metadata,
-        )
-        .await
-        .context("Failed to create timeline metadata")?;
        Ok(())
    }

@@ -3499,9 +3463,8 @@ impl Tenant {
            // Run each timeline's flush in a task holding the timeline's gate: this
            // means that if this function's future is cancelled, the Timeline shutdown
            // will still wait for any I/O in here to complete.
-            let gate = match timeline.gate.enter() {
-                Ok(g) => g,
-                Err(_) => continue,
+            let Ok(gate) = timeline.gate.enter() else {
+                continue;
            };
            let jh = tokio::task::spawn(async move { flush_timeline(gate, timeline).await });
            results.push(jh);
@@ -3629,25 +3592,18 @@ pub async fn dump_layerfile_from_path(
 #[cfg(test)]
 pub(crate) mod harness {
    use bytes::{Bytes, BytesMut};
-    use camino::Utf8PathBuf;
    use once_cell::sync::OnceCell;
    use pageserver_api::models::ShardParameters;
    use pageserver_api::shard::ShardIndex;
-    use std::fs;
-    use std::sync::Arc;
    use utils::logging;
-    use utils::lsn::Lsn;

    use crate::deletion_queue::mock::MockDeletionQueue;
    use crate::walredo::apply_neon;
-    use crate::{
-        config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
-    };
+    use crate::{repository::Key, walrecord::NeonWalRecord};

    use super::*;
-    use crate::tenant::config::{TenantConf, TenantConfOpt};
    use hex_literal::hex;
-    use utils::id::{TenantId, TimelineId};
+    use utils::id::TenantId;

    pub const TIMELINE_ID: TimelineId =
        TimelineId::from_array(hex!("11223344556677881122334455667788"));
@@ -3671,6 +3627,7 @@ pub(crate) mod harness {
                compaction_target_size: Some(tenant_conf.compaction_target_size),
                compaction_period: Some(tenant_conf.compaction_period),
                compaction_threshold: Some(tenant_conf.compaction_threshold),
+                compaction_algorithm: Some(tenant_conf.compaction_algorithm),
                gc_horizon: Some(tenant_conf.gc_horizon),
                gc_period: Some(tenant_conf.gc_period),
                image_creation_threshold: Some(tenant_conf.image_creation_threshold),
@@ -3684,7 +3641,6 @@ pub(crate) mod harness {
                evictions_low_residence_duration_metric_threshold: Some(
                    tenant_conf.evictions_low_residence_duration_metric_threshold,
                ),
-                gc_feedback: Some(tenant_conf.gc_feedback),
                heatmap_period: Some(tenant_conf.heatmap_period),
                lazy_slru_download: Some(tenant_conf.lazy_slru_download),
                timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
@@ -3807,7 +3763,7 @@ pub(crate) mod harness {
            let preload = tenant
                .preload(&self.remote_storage, CancellationToken::new())
                .await?;
-            tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
+            tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?;

            tenant.state.send_replace(TenantState::Active);
            for timeline in tenant.timelines.lock().unwrap().values() {
@@ -3876,10 +3832,8 @@ mod tests {
    use crate::DEFAULT_PG_VERSION;
    use bytes::BytesMut;
    use hex_literal::hex;
-    use once_cell::sync::Lazy;
    use pageserver_api::keyspace::KeySpace;
    use rand::{thread_rng, Rng};
-    use tokio_util::sync::CancellationToken;

    static TEST_KEY: Lazy<Key> =
        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -3891,7 +3845,7 @@ mod tests {
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -3903,7 +3857,7 @@ mod tests {
        writer.finish_write(Lsn(0x10));
        drop(writer);

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -3969,7 +3923,7 @@ mod tests {
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;

        #[allow(non_snake_case)]
        let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
@@ -4003,7 +3957,7 @@ mod tests {
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");
-        let mut new_writer = newtline.writer().await;
+        let new_writer = newtline.writer().await;
        new_writer
            .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
            .await?;
@@ -4035,7 +3989,7 @@ mod tests {
    ) -> anyhow::Result<()> {
        let mut lsn = start_lsn;
        {
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
            // Create a relation on the timeline
            writer
                .put(
@@ -4060,7 +4014,7 @@ mod tests {
        }
        tline.freeze_and_flush().await?;
        {
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
            writer
                .put(
                    *TEST_KEY,
@@ -4423,7 +4377,7 @@ mod tests {
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -4440,7 +4394,7 @@ mod tests {
            .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -4457,7 +4411,7 @@ mod tests {
            .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -4474,7 +4428,7 @@ mod tests {
            .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -4531,7 +4485,7 @@ mod tests {
        for _ in 0..repeat {
            for _ in 0..key_count {
                test_key.field6 = blknum;
-                let mut writer = timeline.writer().await;
+                let writer = timeline.writer().await;
                writer
                    .put(
                        test_key,
@@ -4702,7 +4656,7 @@ mod tests {
        for blknum in 0..NUM_KEYS {
            lsn = Lsn(lsn.0 + 0x10);
            test_key.field6 = blknum as u32;
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
            writer
                .put(
                    test_key,
@@ -4723,7 +4677,7 @@ mod tests {
                lsn = Lsn(lsn.0 + 0x10);
                let blknum = thread_rng().gen_range(0..NUM_KEYS);
                test_key.field6 = blknum as u32;
-                let mut writer = tline.writer().await;
+                let writer = tline.writer().await;
                writer
                    .put(
                        test_key,
@@ -4791,7 +4745,7 @@ mod tests {
        for blknum in 0..NUM_KEYS {
            lsn = Lsn(lsn.0 + 0x10);
            test_key.field6 = blknum as u32;
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
            writer
                .put(
                    test_key,
@@ -4820,7 +4774,7 @@ mod tests {
                lsn = Lsn(lsn.0 + 0x10);
                let blknum = thread_rng().gen_range(0..NUM_KEYS);
                test_key.field6 = blknum as u32;
-                let mut writer = tline.writer().await;
+                let writer = tline.writer().await;
                writer
                    .put(
                        test_key,
@@ -4897,7 +4851,7 @@ mod tests {
                lsn = Lsn(lsn.0 + 0x10);
                let blknum = thread_rng().gen_range(0..NUM_KEYS);
                test_key.field6 = blknum as u32;
-                let mut writer = tline.writer().await;
+                let writer = tline.writer().await;
                writer
                    .put(
                        test_key,
--- a/Show More
+++ b/Show More