Compare commits


1 Commit

Author SHA1 Message Date
Anna Khanova
a104c4bd5b proxy: introduce is cold start for analytics 2024-02-26 10:12:04 +01:00
180 changed files with 2190 additions and 8891 deletions

View File

@@ -62,7 +62,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -214,7 +214,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
# Increase timeout to 8h, default timeout is 6h
@@ -362,7 +362,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -461,7 +461,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -558,7 +558,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:

View File

@@ -1,105 +0,0 @@
name: Build build-tools image
on:
workflow_call:
inputs:
image-tag:
description: "build-tools image tag"
required: true
type: string
outputs:
image-tag:
description: "build-tools tag"
value: ${{ inputs.image-tag }}
image:
description: "build-tools image"
value: neondatabase/build-tools:${{ inputs.image-tag }}
defaults:
run:
shell: bash -euo pipefail {0}
concurrency:
group: build-build-tools-image-${{ inputs.image-tag }}
# No permissions for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
check-image:
uses: ./.github/workflows/check-build-tools-image.yml
# This job uses an older version of GitHub Actions because it runs on gen2 runners, which don't support Node 20 (required by newer action versions)
build-image:
needs: [ check-image ]
if: needs.check-image.outputs.found == 'false'
strategy:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
env:
IMAGE_TAG: ${{ inputs.image-tag }}
steps:
- name: Check `inputs.image-tag` is correct
env:
INPUTS_IMAGE_TAG: ${{ inputs.image-tag }}
CHECK_IMAGE_TAG: ${{ needs.check-image.outputs.image-tag }}
run: |
if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then
echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the tag of the latest build-tools image 'inputs.image-tag' (${CHECK_IMAGE_TAG})"
exit 1
fi
- uses: actions/checkout@v3
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p /tmp/.docker-custom
echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2
- uses: docker/login-action@v2
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/build-push-action@v4
with:
context: .
provenance: false
push: true
pull: true
file: Dockerfile.build-tools
cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }}
cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
- name: Remove custom docker config directory
run: |
rm -rf /tmp/.docker-custom
merge-images:
needs: [ build-image ]
runs-on: ubuntu-latest
env:
IMAGE_TAG: ${{ inputs.image-tag }}
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch image
run: |
docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \
neondatabase/build-tools:${IMAGE_TAG}-x64 \
neondatabase/build-tools:${IMAGE_TAG}-arm64

View File

@@ -0,0 +1,124 @@
name: Build and Push Docker Image
on:
workflow_call:
inputs:
dockerfile-path:
required: true
type: string
image-name:
required: true
type: string
outputs:
build-tools-tag:
description: "tag generated for build tools"
value: ${{ jobs.tag.outputs.build-tools-tag }}
jobs:
check-if-build-tools-dockerfile-changed:
runs-on: ubuntu-latest
outputs:
docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
steps:
- name: Check if Dockerfile.buildtools has changed
id: dockerfile
run: |
if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
echo "docker_file_changed=false" >> $GITHUB_OUTPUT
exit
fi
updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
echo "docker_file_changed=true" >> $GITHUB_OUTPUT
fi
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
tag:
runs-on: ubuntu-latest
needs: [ check-if-build-tools-dockerfile-changed ]
outputs:
build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
steps:
- name: Get buildtools tag
env:
DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
run: |
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
IMAGE_TAG=$GITHUB_RUN_ID
else
IMAGE_TAG=pinned
fi
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
shell: bash
id: buildtools-tag
kaniko:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
needs: [ tag, check-if-build-tools-dockerfile-changed ]
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
kaniko-arm:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
needs: [ tag, check-if-build-tools-dockerfile-changed ]
runs-on: [ self-hosted, dev, arm64 ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
manifest:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
name: 'manifest'
runs-on: [ self-hosted, dev, x64 ]
needs:
- tag
- kaniko
- kaniko-arm
- check-if-build-tools-dockerfile-changed
steps:
- name: Create manifest
run: |
docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
- name: Push manifest
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}

View File

@@ -77,25 +77,19 @@ jobs:
shell: bash
id: build-tag
check-build-tools-image:
build-buildtools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/check-build-tools-image.yml
build-build-tools-image:
needs: [ check-build-tools-image ]
uses: ./.github/workflows/build-build-tools-image.yml
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
dockerfile-path: Dockerfile.buildtools
image-name: build-tools
secrets: inherit
check-codestyle-python:
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
@@ -124,13 +118,10 @@ jobs:
run: poetry run mypy .
check-codestyle-rust:
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
@@ -194,13 +185,10 @@ jobs:
run: cargo deny check --hide-inclusion-graph
build-neon:
needs: [ check-permissions, tag, build-build-tools-image ]
needs: [ check-permissions, tag, build-buildtools-image ]
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# Raise locked memory limit for tokio-epoll-uring.
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
# io_uring will account the memory of the CQ and SQ as locked.
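
For reference, the `--ulimit memlock` value used further down in this file is exactly 64 MiB; a quick check of the arithmetic:

```bash
# 64 MiB in bytes, matching the memlock soft:hard limit used below.
echo $((64 * 1024 * 1024))   # prints 67108864
```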
@@ -438,13 +426,10 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
strategy:
@@ -488,13 +473,10 @@ jobs:
get-benchmarks-durations:
outputs:
json: ${{ steps.get-benchmark-durations.outputs.json }}
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
steps:
@@ -521,13 +503,10 @@ jobs:
echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
benchmarks:
needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -559,15 +538,12 @@ jobs:
# while coverage is currently collected for the debug ones
create-test-report:
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
@@ -608,13 +584,10 @@ jobs:
})
coverage-report:
needs: [ check-permissions, regress-tests, build-build-tools-image ]
needs: [ check-permissions, regress-tests, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
strategy:
fail-fast: false
@@ -718,7 +691,7 @@ jobs:
secrets: inherit
neon-image:
needs: [ check-permissions, build-build-tools-image, tag ]
needs: [ check-permissions, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
steps:
@@ -753,7 +726,8 @@ jobs:
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
provenance: false
push: true
pull: true
@@ -769,8 +743,61 @@ jobs:
run: |
rm -rf .docker-custom
compute-tools-image:
runs-on: [ self-hosted, gen3, large ]
needs: [ check-permissions, build-buildtools-image, tag ]
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- uses: docker/build-push-action@v5
with:
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{needs.tag.outputs.build-tag}}
TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
provenance: false
push: true
pull: true
file: Dockerfile.compute-tools
cache-from: type=registry,ref=neondatabase/compute-tools:cache
cache-to: type=registry,ref=neondatabase/compute-tools:cache,mode=max
tags: |
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
compute-node-image:
needs: [ check-permissions, build-build-tools-image, tag ]
needs: [ check-permissions, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
strategy:
@@ -810,15 +837,15 @@ jobs:
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Build compute-node image
uses: docker/build-push-action@v5
- uses: docker/build-push-action@v5
with:
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
PG_VERSION=${{ matrix.version }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
BUILD_TAG=${{needs.tag.outputs.build-tag}}
TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
provenance: false
push: true
pull: true
@@ -829,25 +856,6 @@ jobs:
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Build compute-tools image
# compute-tools is Postgres-independent, so build it only once
if: ${{ matrix.version == 'v16' }}
uses: docker/build-push-action@v5
with:
target: compute-tools-image
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
provenance: false
push: true
pull: true
file: Dockerfile.compute-node
tags: |
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
- name: Remove custom docker config directory
if: always()
run: |
@@ -895,7 +903,7 @@ jobs:
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ check-permissions, tag, neon-image, compute-node-image ]
needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
runs-on: [ self-hosted, gen3, small ]
steps:
@@ -929,8 +937,7 @@ jobs:
fi
- name: Verify docker-compose example
timeout-minutes: 20
run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
- name: Print logs and clean up
if: always()
@@ -1210,11 +1217,3 @@ jobs:
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
done
pin-build-tools-image:
needs: [ build-build-tools-image, promote-images, regress-tests ]
if: github.ref_name == 'main'
uses: ./.github/workflows/pin-build-tools-image.yml
with:
from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
secrets: inherit

View File

@@ -1,58 +0,0 @@
name: Check build-tools image
on:
workflow_call:
outputs:
image-tag:
description: "build-tools image tag"
value: ${{ jobs.check-image.outputs.tag }}
found:
description: "Whether the image is found in the registry"
value: ${{ jobs.check-image.outputs.found }}
defaults:
run:
shell: bash -euo pipefail {0}
# No permissions for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
check-image:
runs-on: ubuntu-latest
outputs:
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
found: ${{ steps.check-image.outputs.found }}
steps:
- name: Get build-tools image tag for the current commit
id: get-build-tools-tag
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
LAST_BUILD_TOOLS_SHA=$(
gh api \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
--method GET \
--field path=Dockerfile.build-tools \
--field sha=${COMMIT_SHA} \
--field per_page=1 \
--jq ".[0].sha" \
"/repos/${GITHUB_REPOSITORY}/commits"
)
echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT
- name: Check if such a tag is found in the registry
id: check-image
env:
IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}
run: |
if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then
found=true
else
found=false
fi
echo "found=${found}" | tee -a $GITHUB_OUTPUT

View File

@@ -1,32 +0,0 @@
# A workflow from
# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries
name: cleanup caches by a branch
on:
pull_request:
types:
- closed
jobs:
cleanup:
runs-on: ubuntu-latest
steps:
- name: Cleanup
run: |
gh extension install actions/gh-actions-cache
echo "Fetching list of cache key"
cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 )
## Setting this to not fail the workflow while deleting cache keys.
set +e
echo "Deleting caches..."
for cacheKey in $cacheKeysForPR
do
gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm
done
echo "Done"
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge
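
The same cleanup can be performed by hand with the commands the workflow runs; a hedged sketch (the PR number and cache key are illustrative placeholders):

```bash
# Manually list and delete Actions caches for a merged PR's branch.
gh extension install actions/gh-actions-cache
gh actions-cache list -R neondatabase/neon -B refs/pull/1234/merge -L 100
# Then, for each cache key printed above:
gh actions-cache delete <cache-key> -R neondatabase/neon -B refs/pull/1234/merge --confirm
```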

View File

@@ -26,17 +26,6 @@ jobs:
with:
github-event-name: ${{ github.event_name}}
check-build-tools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/check-build-tools-image.yml
build-build-tools-image:
needs: [ check-build-tools-image ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
secrets: inherit
check-macos-build:
needs: [ check-permissions ]
if: |
@@ -134,7 +123,7 @@ jobs:
run: ./run_clippy.sh
check-linux-arm-build:
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions ]
timeout-minutes: 90
runs-on: [ self-hosted, dev, arm64 ]
@@ -148,10 +137,7 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init
steps:
@@ -258,15 +244,12 @@ jobs:
cargo nextest run --package remote_storage --test test_real_azure
check-codestyle-rust-arm:
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions ]
timeout-minutes: 90
runs-on: [ self-hosted, dev, arm64 ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -333,17 +316,14 @@ jobs:
run: cargo deny check
gather-rust-build-stats:
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions ]
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
env:

View File

@@ -1,72 +0,0 @@
name: 'Pin build-tools image'
on:
workflow_dispatch:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
workflow_call:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
defaults:
run:
shell: bash -euo pipefail {0}
concurrency:
group: pin-build-tools-image-${{ inputs.from-tag }}
permissions: {}
jobs:
tag-image:
runs-on: ubuntu-latest
env:
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: pinned
steps:
- name: Check if we really need to pin the image
id: check-manifests
run: |
docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json
docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json
if diff ${FROM_TAG}.json ${TO_TAG}.json; then
skip=true
else
skip=false
fi
echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
- uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
if: steps.check-manifests.outputs.skip == 'false'
run: |
docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}
- uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
if: steps.check-manifests.outputs.skip == 'false'
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}

View File

@@ -0,0 +1,70 @@
name: 'Update build tools image tag'
# This workflow is used to update the tag of the build-tools image in ECR and Docker Hub.
# The most common use case is adding/moving the `pinned` tag to the `${GITHUB_RUN_ID}` image.
on:
workflow_dispatch:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
to-tag:
description: 'Destination tag'
required: true
type: string
default: 'pinned'
defaults:
run:
shell: bash -euo pipefail {0}
permissions: {}
jobs:
tag-image:
runs-on: [ self-hosted, gen3, small ]
env:
ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
steps:
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/login-action@v2
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v2
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- uses: actions/setup-go@v5
with:
go-version: '1.21'
- name: Install crane
run: |
go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
- name: Copy images
run: |
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
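
After a run, the copy can be checked by comparing digests with `crane digest`, part of the same go-containerregistry toolset the workflow installs (tag values are illustrative):

```bash
# The two digests should match if `crane copy` succeeded.
crane digest 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:6254913013
crane digest 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
```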

.gitignore
View File

@@ -9,7 +9,6 @@ test_output/
neon.iml
/.neon
/integration_tests/.neon
compaction-suite-results.*
# Coverage
*.profraw

View File

@@ -74,11 +74,16 @@ We're using the following approach to make it work:
For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
## How do I make build-tools image "pinned"
## How do I add the "pinned" tag to a buildtools image?
We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup; currently, adding the `pinned` tag is a manual operation.
It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow.
You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
or using GitHub CLI:
```bash
gh workflow -R neondatabase/neon run pin-build-tools-image.yml \
-f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e
```

```bash
gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-f from-tag=6254913013 \
-f to-tag=pinned
# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
```

Cargo.lock
View File

@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "ahash"
version = "0.8.9"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
dependencies = [
"cfg-if",
"const-random",
@@ -1389,9 +1389,9 @@ dependencies = [
[[package]]
name = "crc32c"
version = "0.6.5"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e"
dependencies = [
"rustc_version",
]
@@ -3498,7 +3498,6 @@ dependencies = [
"num_cpus",
"once_cell",
"pageserver_api",
"pageserver_compaction",
"pin-project-lite",
"postgres",
"postgres-protocol",
@@ -3589,53 +3588,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pageserver_compaction"
version = "0.1.0"
dependencies = [
"anyhow",
"async-compression",
"async-stream",
"async-trait",
"byteorder",
"bytes",
"chrono",
"clap",
"const_format",
"consumption_metrics",
"criterion",
"crossbeam-utils",
"either",
"fail",
"flate2",
"futures",
"git-version",
"hex",
"hex-literal",
"humantime",
"humantime-serde",
"itertools",
"metrics",
"once_cell",
"pageserver_api",
"pin-project-lite",
"rand 0.8.5",
"smallvec",
"svg_fmt",
"sync_wrapper",
"thiserror",
"tokio",
"tokio-io-timeout",
"tokio-util",
"tracing",
"tracing-error",
"tracing-subscriber",
"url",
"utils",
"walkdir",
"workspace_hack",
]
[[package]]
name = "parking"
version = "2.1.1"

View File

@@ -5,7 +5,6 @@ members = [
"control_plane",
"control_plane/attachment_service",
"pageserver",
"pageserver/compaction",
"pageserver/ctl",
"pageserver/client",
"pageserver/pagebench",
@@ -200,7 +199,6 @@ consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }

View File

@@ -891,17 +891,7 @@ ENV BUILD_TAG=$BUILD_TAG
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Final compute-tools image
#
#########################################################################################
FROM debian:bullseye-slim AS compute-tools-image
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#

Dockerfile.compute-tools
View File

@@ -0,0 +1,32 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG TAG=pinned
ARG BUILD_TAG
FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
WORKDIR /home/nonroot
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
# cachepot falls back to the local filesystem if S3 is misconfigured, rather than failing the build.
ARG RUSTC_WRAPPER=cachepot
ENV AWS_REGION=eu-central-1
ENV CACHEPOT_S3_KEY_PREFIX=cachepot
ARG CACHEPOT_BUCKET=neon-github-dev
#ARG AWS_ACCESS_KEY_ID
#ARG AWS_SECRET_ACCESS_KEY
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
COPY . .
RUN set -e \
&& mold -run cargo build -p compute_tools --locked --release \
&& cachepot -s
# Final image that only has one binary
FROM debian:bullseye-slim
COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl

View File

@@ -5,7 +5,7 @@
Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
## Quick start
Try the [Neon Free Tier](https://neon.tech/github) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
Try the [Neon Free Tier](https://neon.tech) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
Alternatively, compile and run the project [locally](#running-local-installation).
@@ -230,12 +230,6 @@ postgres=# select * from t;
> cargo neon stop
```
More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md).
#### Handling build failures
If you encounter errors while setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems and start the setup again.
## Running tests
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
@@ -265,12 +259,6 @@ You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or th
> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
## Cleanup
For cleaning up the source tree from build artifacts, run `make clean` in the source directory.
For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned!
## Documentation
The [docs](/docs) directory contains a top-level overview of all available markdown documentation.

View File

@@ -18,6 +18,8 @@ use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use postgres::{Client, NoTls};
use tokio;
use tokio_postgres;
use tracing::{debug, error, info, instrument, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;

View File

@@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json
}
}
*/
use anyhow::Result;
use anyhow::{self, Result};
use anyhow::{bail, Context};
use bytes::Bytes;
use compute_api::spec::RemoteExtSpec;

View File

@@ -13,6 +13,8 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use num_cpus;
use serde_json;
use tokio::task;
use tracing::{error, info, warn};
use tracing_utils::http::OtelName;

View File

@@ -676,15 +676,8 @@ pub fn handle_grants(
GRANT CREATE ON SCHEMA public TO web_access;\n\
END IF;\n\
END IF;\n\
IF EXISTS(\n\
SELECT nspname\n\
FROM pg_catalog.pg_namespace\n\
WHERE nspname = 'public'\n\
)\n\
THEN\n\
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
END IF;\n\
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
END\n\
$$;"
.to_string();

View File

@@ -1,26 +0,0 @@
# Control Plane and Neon Local
This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command.
## Example: Start with Postgres 16
To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands.
```shell
cargo neon init --pg-version 16
cargo neon start
cargo neon tenant create --set-default --pg-version 16
cargo neon endpoint create main --pg-version 16
cargo neon endpoint start main
```
## Example: Create Test User and Database
By default, `cargo neon` starts an endpoint with `cloud_admin` and `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint.
```shell
cargo neon endpoint create main --pg-version 16 --update-catalog true
cargo neon endpoint start main --create-test-user true
```
The first command creates `neon_superuser` and necessary roles. The second command creates `test` user and `neondb` database. You will see a connection string that connects you to the test user after running the second command.

View File

@@ -1,2 +0,0 @@
ALTER TABLE tenant_shards ALTER generation SET NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL;

View File

@@ -1,4 +0,0 @@
ALTER TABLE tenant_shards ALTER generation DROP NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL;

View File

@@ -1,9 +0,0 @@
use utils::auth::{AuthError, Claims, Scope};
pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> {
if claims.scope != required_scope {
return Err(AuthError("Scope mismatch. Permission denied".into()));
}
Ok(())
}

View File

@@ -1,18 +1,17 @@
use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use crate::PlacementPolicy;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use std::sync::Arc;
use std::time::{Duration, Instant};
use utils::auth::{Scope, SwappableJwtAuth};
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
use utils::auth::SwappableJwtAuth;
use utils::http::endpoint::{auth_middleware, request_span};
use utils::http::request::{must_get_query_param, parse_request_param};
use utils::id::{TenantId, TimelineId};
@@ -26,12 +25,12 @@ use utils::{
id::NodeId,
};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
};
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};
use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
use control_plane::attachment_service::{
AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
TenantShardMigrateRequest,
};
/// State available to HTTP request handlers
#[derive(Clone)]
@@ -65,8 +64,6 @@ fn get_state(request: &Request<Body>) -> &HttpState {
/// Pageserver calls into this on startup, to learn which tenants it should attach
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::GenerationsApi)?;
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
@@ -75,8 +72,6 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
/// Pageserver calls into this before doing deletions, to confirm that it still
/// holds the latest generation for the tenants with deletions enqueued
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::GenerationsApi)?;
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.validate(validate_req))
@@ -86,8 +81,6 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
/// (in the real control plane this is unnecessary, because the same program is managing
/// generation numbers and doing attachments).
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
let state = get_state(&req);
@@ -102,8 +95,6 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
}
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
let state = get_state(&req);
@@ -115,17 +106,10 @@ async fn handle_tenant_create(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::PageServerApi)?;
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
// have no expectation of HA).
let placement_policy = PlacementPolicy::Single;
json_response(
StatusCode::CREATED,
service.tenant_create(create_req, placement_policy).await?,
service.tenant_create(create_req).await?,
)
}
@@ -180,8 +164,6 @@ async fn handle_tenant_location_config(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
json_response(
StatusCode::OK,
@@ -191,34 +173,11 @@ async fn handle_tenant_location_config(
)
}
async fn handle_tenant_config_set(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::PageServerApi)?;
let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
}
async fn handle_tenant_config_get(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
}
async fn handle_tenant_time_travel_remote_storage(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
let timestamp_raw = must_get_query_param(&req, "travel_to")?;
@@ -243,15 +202,7 @@ async fn handle_tenant_time_travel_remote_storage(
done_if_after_raw,
)
.await?;
json_response(StatusCode::OK, ())
}
async fn handle_tenant_secondary_download(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
service.tenant_secondary_download(tenant_id).await?;
json_response(StatusCode::OK, ())
}
@@ -260,7 +211,6 @@ async fn handle_tenant_delete(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
deletion_wrapper(service, move |service| async move {
service.tenant_delete(tenant_id).await
@@ -273,8 +223,6 @@ async fn handle_tenant_timeline_create(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
json_response(
StatusCode::CREATED,
@@ -289,8 +237,6 @@ async fn handle_tenant_timeline_delete(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
deletion_wrapper(service, move |service| async move {
@@ -304,7 +250,6 @@ async fn handle_tenant_timeline_passthrough(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let Some(path) = req.uri().path_and_query() else {
// This should never happen, our request router only calls us if there is a path
@@ -348,15 +293,11 @@ async fn handle_tenant_locate(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
let state = get_state(&req);
state.service.node_register(register_req).await?;
@@ -364,23 +305,17 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
}
async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.node_list().await?)
}
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
}
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let node_id: NodeId = parse_request_param(&req, "node_id")?;
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
if node_id != config_req.node_id {
@@ -400,8 +335,6 @@ async fn handle_tenant_shard_split(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
@@ -415,8 +348,6 @@ async fn handle_tenant_shard_migrate(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
json_response(
@@ -429,30 +360,22 @@ async fn handle_tenant_shard_migrate(
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
}
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
state.service.tenants_dump()
}
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
state.service.scheduler_dump()
}
async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.consistency_check().await?)
@@ -509,12 +432,6 @@ where
.await
}
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
check_permission_with(request, |claims| {
crate::auth::check_permission(claims, required_scope)
})
}
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
@@ -586,21 +503,12 @@ pub fn make_router(
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete)
})
.put("/v1/tenant/config", |r| {
tenant_service_handler(r, handle_tenant_config_set)
})
.get("/v1/tenant/:tenant_id/config", |r| {
tenant_service_handler(r, handle_tenant_config_get)
})
.put("/v1/tenant/:tenant_id/location_config", |r| {
tenant_service_handler(r, handle_tenant_location_config)
})
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
})
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
tenant_service_handler(r, handle_tenant_secondary_download)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_delete)

View File

@@ -1,7 +1,6 @@
use serde::{Deserialize, Serialize};
use utils::seqwait::MonotonicCounter;
mod auth;
mod compute_hook;
pub mod http;
pub mod metrics;
@@ -13,20 +12,14 @@ mod schema;
pub mod service;
mod tenant_state;
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
#[derive(Clone, Serialize, Deserialize, Debug)]
enum PlacementPolicy {
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
/// Create one secondary mode location. This is useful when onboarding
/// a tenant, or for an idle tenant that we might want to bring online quickly.
Secondary,
/// Do not attach to any pageservers. This is appropriate for tenants that
/// have been idle for a long time, where we do not mind some delay in making
/// them available in future.
/// Do not attach to any pageservers
Detached,
}

View File

@@ -9,7 +9,7 @@ use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use aws_config::{BehaviorVersion, Region};
use aws_config::{self, BehaviorVersion, Region};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
@@ -79,38 +79,13 @@ impl Secrets {
"neon-storage-controller-control-plane-jwt-token";
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
/// Load secrets from, in order of preference:
/// - CLI args if database URL is provided on the CLI
/// - Environment variables if DATABASE_URL is set.
/// - AWS Secrets Manager secrets
async fn load(args: &Cli) -> anyhow::Result<Self> {
match &args.database_url {
Some(url) => Self::load_cli(url, args),
None => match std::env::var(Self::DATABASE_URL_ENV) {
Ok(database_url) => Self::load_env(database_url),
Err(_) => Self::load_aws_sm().await,
},
None => Self::load_aws_sm().await,
}
}
fn load_env(database_url: String) -> anyhow::Result<Self> {
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
Err(_) => None,
};
Ok(Self {
database_url,
public_key,
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
})
}
async fn load_aws_sm() -> anyhow::Result<Self> {
let Ok(region) = std::env::var("AWS_REGION") else {
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");

View File

@@ -1,4 +1,4 @@
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use serde::Serialize;
use utils::id::NodeId;

View File

@@ -6,12 +6,10 @@ use std::time::Duration;
use self::split_state::SplitState;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use control_plane::attachment_service::NodeSchedulingPolicy;
use diesel::pg::PgConnection;
use diesel::{
Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl,
Selectable, SelectableHelper,
};
use pageserver_api::controller_api::NodeSchedulingPolicy;
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use serde::{Deserialize, Serialize};
@@ -333,15 +331,7 @@ impl Persistence {
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
};
let Some(g) = tsp.generation else {
// If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL:
// we only set generation_pageserver when setting generation.
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
));
};
result.insert(tenant_shard_id, Generation::new(g as u32));
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
}
Ok(result)
@@ -374,85 +364,7 @@ impl Persistence {
})
.await?;
// Generation is always non-null in the result: if the generation column had been NULL, then we
// should have experienced an SQL Conflict error while executing a query that tries to increment it.
debug_assert!(updated.generation.is_some());
let Some(g) = updated.generation else {
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
)
.into());
};
Ok(Generation::new(g as u32))
}
/// For use when updating a persistent property of a tenant, such as its config or placement_policy.
///
/// Do not use this for setting generation, unless in the special onboarding code path (/location_config)
/// API: use [`Self::increment_generation`] instead. Setting the generation via this route is a one-time thing
/// that we only do the first time a tenant is set to an attached policy via /location_config.
pub(crate) async fn update_tenant_shard(
&self,
tenant_shard_id: TenantShardId,
input_placement_policy: PlacementPolicy,
input_config: TenantConfig,
input_generation: Option<Generation>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
let query = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
if let Some(input_generation) = input_generation {
// Update includes generation column
query
.set((
generation.eq(Some(input_generation.into().unwrap() as i32)),
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} else {
// Update does not include generation column
query
.set((
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
}
Ok(())
})
.await?;
Ok(())
}
pub(crate) async fn update_tenant_config(
&self,
input_tenant_id: TenantId,
input_config: TenantConfig,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
.execute(conn)?;
Ok(())
})
.await?;
Ok(())
Ok(Generation::new(updated.generation as u32))
}
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
@@ -463,7 +375,7 @@ impl Persistence {
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.set((
generation_pageserver.eq(Option::<i64>::None),
generation_pageserver.eq(i64::MAX),
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
))
.execute(conn)?;
@@ -589,15 +501,12 @@ pub(crate) struct TenantShardPersistence {
pub(crate) shard_stripe_size: i32,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching.
//
// Generation is only None when first onboarding a tenant, where it may
// be in PlacementPolicy::Secondary and therefore have no valid generation state.
pub(crate) generation: Option<i32>,
// and use the incremented number when attaching
pub(crate) generation: i32,
// Currently attached pageserver
#[serde(rename = "pageserver")]
pub(crate) generation_pageserver: Option<i64>,
pub(crate) generation_pageserver: i64,
#[serde(default)]
pub(crate) placement_policy: String,
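The i64::MAX value above acts as a sentinel for "no pageserver", where the other side of this diff used Option<i64>. A minimal standalone sketch of the conversion (function name is hypothetical):
fn pageserver_from_column(generation_pageserver: i64) -> Option<u64> {
    if generation_pageserver == i64::MAX {
        None // sentinel: no generation has been issued to any pageserver yet
    } else {
        Some(generation_pageserver as u64)
    }
}

fn main() {
    assert_eq!(pageserver_from_column(i64::MAX), None);
    assert_eq!(pageserver_from_column(7), Some(7));
}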

View File

@@ -1,6 +1,6 @@
use crate::persistence::Persistence;
use crate::service;
use pageserver_api::controller_api::NodeAvailability;
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
@@ -26,7 +26,7 @@ pub(super) struct Reconciler {
/// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
pub(crate) generation: Option<Generation>,
pub(crate) generation: Generation,
pub(crate) intent: TargetState,
pub(crate) config: TenantConfig,
pub(crate) observed: ObservedState,
@@ -312,7 +312,7 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedStale,
self.generation,
Some(self.generation),
None,
);
self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
@@ -335,17 +335,16 @@ impl Reconciler {
}
// Increment generation before attaching to new pageserver
self.generation = Some(
self.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?,
);
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?;
let dest_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::AttachedMulti,
self.generation,
Some(self.generation),
None,
);
@@ -402,7 +401,7 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedSingle,
self.generation,
Some(self.generation),
None,
);
self.location_config(dest_ps_id, dest_final_conf.clone(), None)
@@ -434,62 +433,22 @@ impl Reconciler {
// If the attached pageserver is not attached, do so now.
if let Some(node_id) = self.intent.attached {
// If we are in an attached policy, then generation must have been set (null generations
// are only present when a tenant is initially loaded with a secondary policy)
debug_assert!(self.generation.is_some());
let Some(generation) = self.generation else {
return Err(ReconcileError::Other(anyhow::anyhow!(
"Attempted to attach with NULL generation"
)));
};
let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
let mut wanted_conf =
attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
tracing::info!(%node_id, "Observed configuration already correct.")
}
observed => {
_ => {
// In all cases other than a matching observed configuration, we will
// reconcile this location. This includes locations with different configurations, as well
// as locations with unknown (None) observed state.
// The general case is to increment the generation. However, there are cases
// where this is not necessary:
// - if we are only updating the TenantConf part of the location
// - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
// and the location was already in the correct generation
let increment_generation = match observed {
None => true,
Some(ObservedStateLocation { conf: None }) => true,
Some(ObservedStateLocation {
conf: Some(observed),
}) => {
let generations_match = observed.generation == wanted_conf.generation;
use LocationConfigMode::*;
let mode_transition_requires_gen_inc =
match (observed.mode, wanted_conf.mode) {
// Usually the short-lived attachment modes (multi and stale) are only used
// in the case of [`Self::live_migrate`], but it is simple to handle them correctly
// here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
(AttachedSingle, AttachedStale) => false,
(AttachedMulti, AttachedSingle) => false,
(lhs, rhs) => lhs != rhs,
};
!generations_match || mode_transition_requires_gen_inc
}
};
if increment_generation {
let generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
self.generation = Some(generation);
wanted_conf.generation = generation.into();
}
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
wanted_conf.generation = self.generation.into();
tracing::info!(%node_id, "Observed configuration requires update.");
self.location_config(node_id, wanted_conf, None).await?;
self.compute_notify().await?;
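A standalone model of the increment decision spelled out in the comments above; the enum and function are illustrative stand-ins for the real types:
#[derive(Clone, Copy, PartialEq)]
enum Mode {
    AttachedSingle,
    AttachedMulti,
    AttachedStale,
}

fn needs_generation_increment(observed: Option<(Mode, u32)>, wanted: (Mode, u32)) -> bool {
    use Mode::*;
    match observed {
        // Unknown observed state: always increment to be safe.
        None => true,
        Some((obs_mode, obs_gen)) => {
            let generations_match = obs_gen == wanted.1;
            // Single->Stale and Multi->Single may stay in the same generation.
            let mode_transition_requires_inc = match (obs_mode, wanted.0) {
                (AttachedSingle, AttachedStale) => false,
                (AttachedMulti, AttachedSingle) => false,
                (lhs, rhs) => lhs != rhs,
            };
            !generations_match || mode_transition_requires_inc
        }
    }
}

fn main() {
    assert!(!needs_generation_increment(
        Some((Mode::AttachedSingle, 3)),
        (Mode::AttachedStale, 3)
    ));
    assert!(needs_generation_increment(None, (Mode::AttachedSingle, 3)));
}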

View File

@@ -255,7 +255,7 @@ impl Scheduler {
pub(crate) mod test_utils {
use crate::node::Node;
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use std::collections::HashMap;
use utils::id::NodeId;
/// Test helper: synthesize the requested number of nodes, all in active state.
@@ -284,6 +284,7 @@ pub(crate) mod test_utils {
#[cfg(test)]
mod tests {
use super::*;
use utils::id::NodeId;
use crate::tenant_state::IntentState;
#[test]

View File

@@ -17,8 +17,8 @@ diesel::table! {
shard_number -> Int4,
shard_count -> Int4,
shard_stripe_size -> Int4,
generation -> Nullable<Int4>,
generation_pageserver -> Nullable<Int8>,
generation -> Int4,
generation_pageserver -> Int8,
placement_policy -> Varchar,
splitting -> Int2,
config -> Text,

View File

@@ -9,20 +9,19 @@ use std::{
use anyhow::Context;
use control_plane::attachment_service::{
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, NodeAvailability,
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, TenantCreateResponse,
TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
TenantShardMigrateRequest, TenantShardMigrateResponse,
};
use diesel::result::DatabaseErrorKind;
use futures::{stream::FuturesUnordered, StreamExt};
use hyper::StatusCode;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
control_api::{
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
ValidateResponse, ValidateResponseTenant,
},
models::TenantConfigRequest,
};
use pageserver_api::{
models::{
self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters,
TenantConfig, TenantCreateRequest, TenantLocationConfigRequest,
@@ -30,10 +29,6 @@ use pageserver_api::{
TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo,
},
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
upcall_api::{
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
ValidateResponse, ValidateResponseTenant,
},
};
use pageserver_client::mgmt_api;
use tokio_util::sync::CancellationToken;
@@ -68,11 +63,6 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
// some data in it.
const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
// If we receive a call using Secondary mode initially, it will omit generation. We will initialize
// tenant shards into this generation, and as long as it remains in this generation, we will accept
// input generation from future requests as authoritative.
const INITIAL_GENERATION: Generation = Generation::new(0);
/// How long [`Service::startup_reconcile`] is allowed to take before it should give
/// up on unresponsive pageservers and proceed.
pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
@@ -175,21 +165,6 @@ impl From<ReconcileWaitError> for ApiError {
}
}
#[allow(clippy::large_enum_variant)]
enum TenantCreateOrUpdate {
Create((TenantCreateRequest, PlacementPolicy)),
Update(Vec<ShardUpdate>),
}
struct ShardUpdate {
tenant_shard_id: TenantShardId,
placement_policy: PlacementPolicy,
tenant_config: TenantConfig,
/// If this is None, generation is not updated.
generation: Option<Generation>,
}
impl Service {
pub fn get_config(&self) -> &Config {
&self.config
@@ -594,9 +569,6 @@ impl Service {
// the shard so that a future [`TenantState::maybe_reconcile`] will try again.
tenant.pending_compute_notification = result.pending_compute_notification;
// Let the TenantState know it is idle.
tenant.reconcile_complete(result.sequence);
match result.result {
Ok(()) => {
for (node_id, loc) in &result.observed.locations {
@@ -687,8 +659,8 @@ impl Service {
// after when pageservers start up and register.
let mut node_ids = HashSet::new();
for tsp in &tenant_shard_persistence {
if let Some(node_id) = tsp.generation_pageserver {
node_ids.insert(node_id);
if tsp.generation_pageserver != i64::MAX {
node_ids.insert(tsp.generation_pageserver);
}
}
for node_id in node_ids {
@@ -725,15 +697,18 @@ impl Service {
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
// it with what we can infer: the node for which a generation was most recently issued.
let mut intent = IntentState::new();
if let Some(generation_pageserver) = tsp.generation_pageserver {
intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
if tsp.generation_pageserver != i64::MAX {
intent.set_attached(
&mut scheduler,
Some(NodeId(tsp.generation_pageserver as u64)),
);
}
let new_tenant = TenantState {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: tsp.generation.map(|g| Generation::new(g as u32)),
generation: Generation::new(tsp.generation as u32),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
@@ -813,8 +788,8 @@ impl Service {
shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: 0,
generation: Some(0),
generation_pageserver: None,
generation: 0,
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
splitting: SplitState::default(),
@@ -869,7 +844,7 @@ impl Service {
.expect("Checked for existence above");
if let Some(new_generation) = new_generation {
tenant_state.generation = Some(new_generation);
tenant_state.generation = new_generation;
} else {
// This is a detach notification. We must update placement policy to avoid re-attaching
// during background scheduling/reconciliation, or during attachment service restart.
@@ -919,7 +894,7 @@ impl Service {
node_id,
ObservedStateLocation {
conf: Some(attached_location_conf(
tenant_state.generation.unwrap(),
tenant_state.generation,
&tenant_state.shard,
&tenant_state.config,
)),
@@ -933,7 +908,7 @@ impl Service {
Ok(AttachHookResponse {
gen: attach_req
.node_id
.map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
.map(|_| tenant_state.generation.into().unwrap()),
})
}
@@ -946,7 +921,7 @@ impl Service {
attachment: tenant_state.and_then(|s| {
s.intent
.get_attached()
.map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps))
.map(|ps| (s.generation.into().unwrap(), ps))
}),
}
}
@@ -996,17 +971,7 @@ impl Service {
continue;
};
// If [`Persistence::re_attach`] selected this shard, it must have already
// had a generation set.
debug_assert!(shard_state.generation.is_some());
let Some(old_gen) = shard_state.generation else {
// Should never happen: would only return incremented generation
// for a tenant that already had a non-null generation.
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Generation must be set while re-attaching"
)));
};
shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
shard_state.generation = std::cmp::max(shard_state.generation, new_gen);
if let Some(observed) = shard_state
.observed
.locations
@@ -1036,7 +1001,7 @@ impl Service {
for req_tenant in validate_req.tenants {
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen));
let valid = tenant_state.generation == Generation::new(req_tenant.gen);
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {:?})",
req_tenant.id,
@@ -1063,9 +1028,8 @@ impl Service {
pub(crate) async fn tenant_create(
&self,
create_req: TenantCreateRequest,
placement_policy: PlacementPolicy,
) -> Result<TenantCreateResponse, ApiError> {
let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?;
let (response, waiters) = self.do_tenant_create(create_req).await?;
self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
Ok(response)
@@ -1074,7 +1038,6 @@ impl Service {
pub(crate) async fn do_tenant_create(
&self,
create_req: TenantCreateRequest,
placement_policy: PlacementPolicy,
) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
// This service expects to handle sharding itself: it is an error to try and directly create
// a particular shard here.
@@ -1100,27 +1063,9 @@ impl Service {
})
.collect::<Vec<_>>();
// If the caller specifies a None generation, it means "start from default". This is different
// to [`Self::tenant_location_config`], where a None generation is used to represent
// an incompletely-onboarded tenant.
let initial_generation = if matches!(placement_policy, PlacementPolicy::Secondary) {
tracing::info!(
"tenant_create: secondary mode, generation is_some={}",
create_req.generation.is_some()
);
create_req.generation.map(Generation::new)
} else {
tracing::info!(
"tenant_create: not secondary mode, generation is_some={}",
create_req.generation.is_some()
);
Some(
create_req
.generation
.map(Generation::new)
.unwrap_or(INITIAL_GENERATION),
)
};
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
// have no expectation of HA).
let placement_policy: PlacementPolicy = PlacementPolicy::Single;
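A standalone sketch of the initial-generation rule above (stand-in types): secondary-mode creation may legitimately carry no generation, while attached-mode creation defaults to generation 0 when the caller omits one:
const INITIAL_GENERATION: u32 = 0;

fn initial_generation(secondary: bool, requested: Option<u32>) -> Option<u32> {
    if secondary {
        requested // may stay None until the tenant is first attached
    } else {
        Some(requested.unwrap_or(INITIAL_GENERATION))
    }
}

fn main() {
    assert_eq!(initial_generation(true, None), None);
    assert_eq!(initial_generation(false, None), Some(0));
}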
// Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller
// to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart
@@ -1132,10 +1077,8 @@ impl Service {
shard_number: tenant_shard_id.shard_number.0 as i32,
shard_count: tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
generation: initial_generation.map(|g| g.into().unwrap() as i32),
// The pageserver is not known until scheduling happens: we will set this column when
// incrementing the generation the first time we attach to a pageserver.
generation_pageserver: None,
generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
config: serde_json::to_string(&create_req.config).unwrap(),
splitting: SplitState::default(),
@@ -1175,17 +1118,15 @@ impl Service {
))
})?;
if let Some(node_id) = entry.get().intent.get_attached() {
let generation = entry
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: entry
.get()
.generation
.expect("Generation is set when in attached mode");
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: *node_id,
generation: generation.into().unwrap(),
});
}
.intent
.get_attached()
.expect("We just set pageserver if it was None"),
generation: entry.get().generation.into().unwrap(),
});
continue;
}
@@ -1199,7 +1140,9 @@ impl Service {
placement_policy.clone(),
);
state.generation = initial_generation;
if let Some(create_gen) = create_req.generation {
state.generation = Generation::new(create_gen);
}
state.config = create_req.config.clone();
state.schedule(scheduler).map_err(|e| {
@@ -1208,18 +1151,14 @@ impl Service {
))
})?;
// Only include shards in result if we are attaching: the purpose
// of the response is to tell the caller where the shards are attached.
if let Some(node_id) = state.intent.get_attached() {
let generation = state
.generation
.expect("Generation is set when in attached mode");
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: *node_id,
generation: generation.into().unwrap(),
});
}
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: state
.intent
.get_attached()
.expect("We just set pageserver if it was None"),
generation: state.generation.into().unwrap(),
});
entry.insert(state)
}
};
@@ -1273,114 +1212,12 @@ impl Service {
Ok(())
}
/// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
/// and transform it into either a tenant creation or a series of shard updates.
fn tenant_location_config_prepare(
&self,
tenant_id: TenantId,
req: TenantLocationConfigRequest,
) -> TenantCreateOrUpdate {
let mut updates = Vec::new();
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, _scheduler) = locked.parts_mut();
// Use location config mode as an indicator of policy.
let placement_policy = match req.config.mode {
LocationConfigMode::Detached => PlacementPolicy::Detached,
LocationConfigMode::Secondary => PlacementPolicy::Secondary,
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
if nodes.len() > 1 {
PlacementPolicy::Double(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
PlacementPolicy::Single
}
}
};
let mut create = true;
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
// Saw an existing shard: this is not a creation
create = false;
// Shards may have initially been created by a Secondary request, where we
// would have left generation as None.
//
// We only update generation the first time we see an attached-mode request,
// and if there is no existing generation set. The caller is responsible for
// ensuring that no non-storage-controller pageserver ever uses a higher
// generation than they passed in here.
use LocationConfigMode::*;
let set_generation = match req.config.mode {
AttachedMulti | AttachedSingle | AttachedStale if shard.generation.is_none() => {
req.config.generation.map(Generation::new)
}
_ => None,
};
if shard.policy != placement_policy
|| shard.config != req.config.tenant_conf
|| set_generation.is_some()
{
updates.push(ShardUpdate {
tenant_shard_id: *shard_id,
placement_policy: placement_policy.clone(),
tenant_config: req.config.tenant_conf.clone(),
generation: set_generation,
});
}
}
if create {
use LocationConfigMode::*;
let generation = match req.config.mode {
AttachedMulti | AttachedSingle | AttachedStale => req.config.generation,
// If a caller provided a generation in a non-attached request, ignore it
// and leave our generation as None: this enables a subsequent update to set
// the generation when setting an attached mode for the first time.
_ => None,
};
TenantCreateOrUpdate::Create(
// Synthesize a creation request
(
TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation,
shard_parameters: ShardParameters {
// Must preserve the incoming shard_count to distinguish unsharded (0)
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
count: req.tenant_id.shard_count,
// We only import un-sharded or single-sharded tenants, so stripe
// size can be made up arbitrarily here.
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
},
config: req.config.tenant_conf,
},
placement_policy,
),
)
} else {
TenantCreateOrUpdate::Update(updates)
}
}
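A standalone sketch of the mode-to-policy mapping used above (stand-in enums, not the real types):
enum Mode {
    Detached,
    Secondary,
    AttachedMulti,
    AttachedSingle,
    AttachedStale,
}

#[derive(Debug, PartialEq)]
enum Policy {
    Detached,
    Secondary,
    Double(usize),
    Single,
}

fn policy_for(mode: Mode, node_count: usize) -> Policy {
    match mode {
        Mode::Detached => Policy::Detached,
        Mode::Secondary => Policy::Secondary,
        Mode::AttachedMulti | Mode::AttachedSingle | Mode::AttachedStale => {
            if node_count > 1 {
                Policy::Double(1) // HA attached mode when multiple pageservers exist
            } else {
                Policy::Single // dev/test convenience with one pageserver
            }
        }
    }
}

fn main() {
    assert_eq!(policy_for(Mode::AttachedSingle, 3), Policy::Double(1));
    assert_eq!(policy_for(Mode::AttachedSingle, 1), Policy::Single);
}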
/// This API is used by the cloud control plane to migrate unsharded tenants that it created
/// directly with pageservers into this service.
///
/// Cloud control plane MUST NOT continue issuing GENERATION NUMBERS for this tenant once it
/// has attempted to call this API. Failure to comply with this rule may lead to S3 corruption.
/// Think of the first attempt to call this API as a transfer of absolute authority over the
/// tenant's source of generation numbers.
///
/// The mode in this request provides coarse-grained control of tenants:
/// This API is used by the cloud control plane to do coarse-grained control of tenants:
/// - Call with mode Attached* to upsert the tenant.
/// - Call with mode Secondary to either onboard a tenant without attaching it, or
/// to set an existing tenant to PolicyMode::Secondary
/// - Call with mode Detached to switch to PolicyMode::Detached
///
/// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
/// secondary locations.
pub(crate) async fn tenant_location_config(
&self,
tenant_id: TenantId,
@@ -1392,96 +1229,131 @@ impl Service {
)));
}
// First check if this is a creation or an update
let create_or_update = self.tenant_location_config_prepare(tenant_id, req);
let mut waiters = Vec::new();
let mut result = TenantLocationConfigResponse { shards: Vec::new() };
let waiters = match create_or_update {
TenantCreateOrUpdate::Create((create_req, placement_policy)) => {
let (create_resp, waiters) =
self.do_tenant_create(create_req, placement_policy).await?;
result.shards = create_resp
.shards
.into_iter()
.map(|s| TenantShardLocation {
node_id: s.node_id,
shard_id: s.shard_id,
})
.collect();
waiters
}
TenantCreateOrUpdate::Update(updates) => {
// Persist updates
// Ordering: write to the database before applying changes in-memory, so that
// we will not appear time-travel backwards on a restart.
for ShardUpdate {
tenant_shard_id,
placement_policy,
tenant_config,
generation,
} in &updates
{
self.persistence
.update_tenant_shard(
*tenant_shard_id,
placement_policy.clone(),
tenant_config.clone(),
*generation,
)
.await?;
}
let maybe_create = {
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, scheduler) = locked.parts_mut();
// Apply updates in-memory
let mut waiters = Vec::new();
{
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, scheduler) = locked.parts_mut();
// Maybe we have existing shards
let mut create = true;
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
// Saw an existing shard: this is not a creation
create = false;
for ShardUpdate {
tenant_shard_id,
placement_policy,
tenant_config,
generation: update_generation,
} in updates
{
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
tracing::warn!("Shard {tenant_shard_id} removed while updating");
continue;
};
// Note that for existing tenants we do _not_ respect the generation in the request: this is likely
// to be stale. Once a tenant is created in this service, our view of generation is authoritative, and
// callers' generations may be ignored. This represents a one-way migration of tenants from the outer
// cloud control plane into this service.
shard.policy = placement_policy;
shard.config = tenant_config;
if let Some(generation) = update_generation {
shard.generation = Some(generation);
}
shard.schedule(scheduler)?;
let maybe_waiter = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
}
if let Some(node_id) = shard.intent.get_attached() {
result.shards.push(TenantShardLocation {
shard_id: tenant_shard_id,
node_id: *node_id,
})
// Use location config mode as an indicator of policy: if they ask for
// attached we go to default HA attached mode. If they ask for secondary
// we go to secondary-only mode. If they ask for detached we detach.
match req.config.mode {
LocationConfigMode::Detached => {
shard.policy = PlacementPolicy::Detached;
}
LocationConfigMode::Secondary => {
// TODO: implement secondary-only mode.
todo!();
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// TODO: persistence for changes in policy
if nodes.len() > 1 {
shard.policy = PlacementPolicy::Double(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
shard.policy = PlacementPolicy::Single
}
}
}
waiters
shard.schedule(scheduler)?;
let maybe_waiter = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
}
if let Some(node_id) = shard.intent.get_attached() {
result.shards.push(TenantShardLocation {
shard_id: *shard_id,
node_id: *node_id,
})
}
}
if create {
// Validate request mode
match req.config.mode {
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
// When using this API to onboard an existing tenant to this service, it must start in
// an attached state, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Imported tenant must be in attached mode"
)));
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// Pass
}
}
// Validate request generation
let Some(generation) = req.config.generation else {
// We can only import attached tenants, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Generation is mandatory when importing tenant"
)));
};
// Synthesize a creation request
Some(TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: Some(generation),
shard_parameters: ShardParameters {
// Must preserve the incoming shard_count to distinguish unsharded (0)
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
count: req.tenant_id.shard_count,
// We only import un-sharded or single-sharded tenants, so stripe
// size can be made up arbitrarily here.
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
},
config: req.config.tenant_conf,
})
} else {
None
}
};
let waiters = if let Some(create_req) = maybe_create {
let (create_resp, waiters) = self.do_tenant_create(create_req).await?;
result.shards = create_resp
.shards
.into_iter()
.map(|s| TenantShardLocation {
node_id: s.node_id,
shard_id: s.shard_id,
})
.collect();
waiters
} else {
waiters
};
if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
@@ -1501,91 +1373,6 @@ impl Service {
Ok(result)
}
pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
let tenant_id = req.tenant_id;
let config = req.config;
self.persistence
.update_tenant_config(req.tenant_id, config.clone())
.await?;
let waiters = {
let mut waiters = Vec::new();
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, _scheduler) = locked.parts_mut();
for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
shard.config = config.clone();
if let Some(waiter) = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
) {
waiters.push(waiter);
}
}
waiters
};
if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
// Treat this as success because we have stored the configuration. If e.g.
// a node was unavailable at this time, it should not stop us accepting a
// configuration change.
tracing::warn!(%tenant_id, "Accepted configuration update but reconciliation failed: {e}");
}
Ok(())
}
pub(crate) fn tenant_config_get(
&self,
tenant_id: TenantId,
) -> Result<HashMap<&str, serde_json::Value>, ApiError> {
let config = {
let locked = self.inner.read().unwrap();
match locked
.tenants
.range(TenantShardId::tenant_range(tenant_id))
.next()
{
Some((_tenant_shard_id, shard)) => shard.config.clone(),
None => {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant not found").into(),
))
}
}
};
// Unlike the pageserver, we do not have a set of global defaults: the config is
// entirely per-tenant. Therefore the distinction between `tenant_specific_overrides`
// and `effective_config` in the response is meaningless, but we retain that syntax
// in order to remain compatible with the pageserver API.
let response = HashMap::from([
(
"tenant_specific_overrides",
serde_json::to_value(&config)
.context("serializing tenant specific overrides")
.map_err(ApiError::InternalServerError)?,
),
(
"effective_config",
serde_json::to_value(&config)
.context("serializing effective config")
.map_err(ApiError::InternalServerError)?,
),
]);
Ok(response)
}
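A minimal sketch of the response shape described above, assuming the serde_json crate: the same config is serialized under both keys purely for pageserver API compatibility:
use std::collections::HashMap;

fn main() {
    let config = serde_json::json!({ "compaction_period": "20s" });
    // Both keys carry identical content; there are no global defaults here.
    let response = HashMap::from([
        ("tenant_specific_overrides", config.clone()),
        ("effective_config", config),
    ]);
    println!("{}", serde_json::to_string_pretty(&response).unwrap());
}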
pub(crate) async fn tenant_time_travel_remote_storage(
&self,
time_travel_req: &TenantTimeTravelRequest,
@@ -1671,60 +1458,6 @@ impl Service {
})?;
}
}
Ok(())
}
pub(crate) async fn tenant_secondary_download(
&self,
tenant_id: TenantId,
) -> Result<(), ApiError> {
// Acquire lock and yield the collection of shard-node tuples to which we will send requests
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
for node_id in shard.intent.get_secondary() {
let node = locked
.nodes
.get(node_id)
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
}
}
targets
};
// TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long-running
// downloads, they can return a clean 202 response instead of the HTTP client timing out.
// Issue concurrent requests to all shards' locations
let mut futs = FuturesUnordered::new();
for (tenant_shard_id, node) in targets {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
futs.push(async move {
let result = client.tenant_secondary_download(tenant_shard_id).await;
(result, node)
})
}
// Handle any errors returned by pageservers. This includes cases like this request racing with
// a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as
// well as more general cases like 503s, 500s, or timeouts.
while let Some((result, node)) = futs.next().await {
let Err(e) = result else { continue };
// Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever
// is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache
// than they had hoped for.
tracing::warn!(
"Ignoring tenant secondary download error from pageserver {}: {e}",
node.id,
);
}
Ok(())
}
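A standalone sketch of the advisory fan-out pattern above, assuming the tokio and futures crates; secondary_download here is a hypothetical stand-in for the pageserver client call:
use futures::{stream::FuturesUnordered, StreamExt};

async fn secondary_download(node_id: u64) -> Result<(), String> {
    // Simulate one node failing with a transient error.
    if node_id == 2 {
        Err("503 Service Unavailable".into())
    } else {
        Ok(())
    }
}

#[tokio::main]
async fn main() {
    let mut futs = FuturesUnordered::new();
    for node_id in [1u64, 2, 3] {
        futs.push(async move { (secondary_download(node_id).await, node_id) });
    }
    while let Some((result, node_id)) = futs.next().await {
        let Err(e) = result else { continue };
        // Advisory: log and keep going rather than failing the whole request.
        eprintln!("Ignoring tenant secondary download error from pageserver {node_id}: {e}");
    }
    // Fall through to success: the failures above were advisory only.
}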
@@ -2304,8 +2037,8 @@ impl Service {
// Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
// populate the correct generation as part of its transaction, to protect us
// against racing with changes in the state of the parent.
generation: None,
generation_pageserver: Some(target.node.id.0 as i64),
generation: 0,
generation_pageserver: target.node.id.0 as i64,
placement_policy: serde_json::to_string(&policy).unwrap(),
// TODO: get the config out of the map
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
@@ -2426,8 +2159,7 @@ impl Service {
.expect("It was present, we just split it");
let old_attached = old_state.intent.get_attached().unwrap();
old_state.intent.clear(scheduler);
let generation = old_state.generation.expect("Shard must have been attached");
(old_attached, generation, old_state.config.clone())
(old_attached, old_state.generation, old_state.config.clone())
};
for child in child_ids {
@@ -2448,7 +2180,7 @@ impl Service {
child_state.observed = ObservedState {
locations: child_observed,
};
child_state.generation = Some(generation);
child_state.generation = generation;
child_state.config = config.clone();
// The child's TenantState::splitting is intentionally left at the default value of Idle,
@@ -2513,7 +2245,6 @@ impl Service {
match shard.policy {
PlacementPolicy::Single => {
shard.intent.clear_secondary(scheduler);
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
}
PlacementPolicy::Double(_n) => {
// If our new attached node was a secondary, it no longer should be.
@@ -2523,12 +2254,6 @@ impl Service {
if let Some(old_attached) = old_attached {
shard.intent.push_secondary(scheduler, old_attached);
}
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
}
PlacementPolicy::Secondary => {
shard.intent.clear(scheduler);
shard.intent.push_secondary(scheduler, migrate_req.node_id);
}
PlacementPolicy::Detached => {
return Err(ApiError::BadRequest(anyhow::anyhow!(
@@ -2536,6 +2261,9 @@ impl Service {
)))
}
}
shard
.intent
.set_attached(scheduler, Some(migrate_req.node_id));
tracing::info!("Migrating: new intent {:?}", shard.intent);
shard.sequence = shard.sequence.next();
@@ -2863,7 +2591,7 @@ impl Service {
observed_loc.conf = None;
}
if tenant_state.intent.demote_attached(config_req.node_id) {
if tenant_state.intent.notify_offline(config_req.node_id) {
tenant_state.sequence = tenant_state.sequence.next();
match tenant_state.schedule(scheduler) {
Err(e) => {
@@ -2930,9 +2658,6 @@ impl Service {
/// Helper for methods that will try and call pageserver APIs for
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
/// is attached somewhere.
///
/// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
/// an attached policy. We should error out if it isn't.
fn ensure_attached_schedule(
&self,
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,

View File

@@ -1,7 +1,7 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use crate::{metrics, persistence::TenantShardPersistence};
use pageserver_api::controller_api::NodeAvailability;
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
@@ -53,11 +53,8 @@ pub(crate) struct TenantState {
pub(crate) sequence: Sequence,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching.
//
// None represents an incompletely onboarded tenant via the [`Service::location_config`]
// API, where this tenant may only run in PlacementPolicy::Secondary.
pub(crate) generation: Option<Generation>,
// and use the incremented number when attaching
pub(crate) generation: Generation,
// High level description of how the tenant should be set up. Provided
// externally.
@@ -184,13 +181,6 @@ impl IntentState {
}
}
/// Remove the last secondary node from the list of secondaries
pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
if let Some(node_id) = self.secondary.pop() {
scheduler.node_dec_ref(node_id);
}
}
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
if let Some(old_attached) = self.attached.take() {
scheduler.node_dec_ref(old_attached);
@@ -218,13 +208,11 @@ impl IntentState {
&self.secondary
}
/// If the node is in use as the attached location, demote it into
/// the list of secondary locations. This is used when a node goes offline,
/// and we want to use a different node for attachment, but not permanently
/// forget the location on the offline node.
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
// need to call into it here.
@@ -327,7 +315,7 @@ pub(crate) struct ReconcileResult {
pub(crate) result: Result<(), ReconcileError>,
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) generation: Option<Generation>,
pub(crate) generation: Generation,
pub(crate) observed: ObservedState,
/// Set [`TenantState::pending_compute_notification`] from this flag
@@ -352,7 +340,7 @@ impl TenantState {
tenant_shard_id,
policy,
intent: IntentState::default(),
generation: Some(Generation::new(0)),
generation: Generation::new(0),
shard,
observed: ObservedState::default(),
config: TenantConfig::default(),
@@ -450,16 +438,10 @@ impl TenantState {
// more work on the same pageservers we're already using.
let mut modified = false;
// Add/remove nodes to fulfil policy
use PlacementPolicy::*;
match self.policy {
Single => {
// Should have exactly one attached, and zero secondaries
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
@@ -469,23 +451,6 @@ impl TenantState {
}
}
Double(secondary_count) => {
let retain_secondaries = if self.intent.attached.is_none()
&& scheduler.node_preferred(&self.intent.secondary).is_some()
{
// If we have no attached, and one of the secondaries is eligible to be promoted, retain
// one more secondary than we usually would, as one of them will become attached further down this function.
secondary_count + 1
} else {
secondary_count
};
while self.intent.secondary.len() > retain_secondaries {
// We have no particular preference for one secondary location over another: just
// arbitrarily drop from the end
self.intent.pop_secondary(scheduler);
modified = true;
}
// Should have exactly one attached, and N secondaries
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
@@ -498,28 +463,15 @@ impl TenantState {
modified = true;
}
}
Secondary => {
if let Some(node_id) = self.intent.get_attached() {
// Populate secondary by demoting the attached node
self.intent.demote_attached(*node_id);
modified = true;
} else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node
let node_id = scheduler.schedule_shard(&[])?;
self.intent.push_secondary(scheduler, node_id);
modified = true;
}
while self.intent.secondary.len() > 1 {
// We have no particular preference for one secondary location over another: just
// arbitrarily drop from the end
self.intent.pop_secondary(scheduler);
modified = true;
}
}
Detached => {
// Never add locations in this mode
if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() {
self.intent.clear(scheduler);
// Should have no attached or secondary pageservers
if self.intent.attached.is_some() {
self.intent.set_attached(scheduler, None);
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
}
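A standalone model of the secondary-retention rule in the Double branch above (illustrative names, not the real scheduler):
fn retain_secondaries(
    attached: Option<u64>,
    promotable_secondary: Option<u64>,
    secondary_count: usize,
) -> usize {
    if attached.is_none() && promotable_secondary.is_some() {
        // One secondary will be promoted to attached, so keep an extra one.
        secondary_count + 1
    } else {
        secondary_count
    }
}

fn main() {
    assert_eq!(retain_secondaries(None, Some(3), 1), 2);
    assert_eq!(retain_secondaries(Some(1), Some(3), 1), 1);
}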
@@ -566,12 +518,7 @@ impl TenantState {
fn dirty(&self) -> bool {
if let Some(node_id) = self.intent.attached {
// Maybe panic: it is a severe bug if we try to attach while generation is null.
let generation = self
.generation
.expect("Attempted to enter attached state without a generation");
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {
@@ -649,10 +596,6 @@ impl TenantState {
// Reconcile already in flight for the current sequence?
if let Some(handle) = &self.reconciler {
if handle.sequence == self.sequence {
tracing::info!(
"Reconciliation already in progress for sequence {:?}",
self.sequence,
);
return Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
@@ -672,10 +615,6 @@ impl TenantState {
return None;
};
// Advance the sequence before spawning a reconciler, so that sequence waiters
// can distinguish between before+after the reconcile completes.
self.sequence = self.sequence.next();
let reconciler_cancel = cancel.child_token();
let mut reconciler = Reconciler {
tenant_shard_id: self.tenant_shard_id,
@@ -777,17 +716,6 @@ impl TenantState {
})
}
/// Called when a ReconcileResult has been emitted and the service is updating
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
/// the handle to indicate there is no longer a reconciliation in progress.
pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) {
if let Some(reconcile_handle) = &self.reconciler {
if reconcile_handle.sequence <= sequence {
self.reconciler = None;
}
}
}
// If we had any state at all referring to this node ID, drop it. Does not
// attempt to reschedule.
pub(crate) fn deref_node(&mut self, node_id: NodeId) {
@@ -808,8 +736,13 @@ impl TenantState {
shard_number: self.tenant_shard_id.shard_number.0 as i32,
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: self.shard.stripe_size.0 as i32,
generation: self.generation.map(|g| g.into().unwrap_or(0) as i32),
generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64),
generation: self.generation.into().unwrap_or(0) as i32,
generation_pageserver: self
.intent
.get_attached()
.map(|n| n.0 as i64)
.unwrap_or(i64::MAX),
placement_policy: serde_json::to_string(&self.policy).unwrap(),
config: serde_json::to_string(&self.config).unwrap(),
splitting: SplitState::default(),
@@ -872,10 +805,8 @@ pub(crate) mod tests {
assert_ne!(attached_node_id, secondary_node_id);
// Notifying the attached node is offline should demote it to a secondary
let changed = tenant_state.intent.demote_attached(attached_node_id);
let changed = tenant_state.intent.notify_offline(attached_node_id);
assert!(changed);
assert!(tenant_state.intent.attached.is_none());
assert_eq!(tenant_state.intent.secondary.len(), 2);
// Update the scheduler state to indicate the node is offline
nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;

View File

@@ -2,12 +2,8 @@ use crate::{background_process, local_env::LocalEnv};
use camino::{Utf8Path, Utf8PathBuf};
use hyper::Method;
use pageserver_api::{
controller_api::{
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
TenantShardMigrateRequest, TenantShardMigrateResponse,
},
models::{
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
TimelineCreateRequest, TimelineInfo,
},
shard::TenantShardId,
@@ -15,12 +11,12 @@ use pageserver_api::{
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{fs, str::FromStr};
use std::str::FromStr;
use tokio::process::Command;
use tracing::instrument;
use url::Url;
use utils::{
auth::{encode_from_key_file, Claims, Scope},
auth::{Claims, Scope},
id::{NodeId, TenantId},
};
@@ -28,7 +24,7 @@ pub struct AttachmentService {
env: LocalEnv,
listen: String,
path: Utf8PathBuf,
private_key: Option<Vec<u8>>,
jwt_token: Option<String>,
public_key: Option<String>,
postgres_port: u16,
client: reqwest::Client,
@@ -59,6 +55,126 @@ pub struct InspectResponse {
pub attachment: Option<(u32, NodeId)>,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub generation: u32,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponse {
pub shards: Vec<TenantCreateResponseShard>,
}
#[derive(Serialize, Deserialize)]
pub struct NodeRegisterRequest {
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct NodeConfigureRequest {
pub node_id: NodeId,
pub availability: Option<NodeAvailability>,
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantLocateResponse {
pub shards: Vec<TenantLocateResponseShard>,
pub shard_params: ShardParameters,
}
/// Explicitly migrating a particular shard is a low level operation
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeAvailability {
// Normal, happy state
Active,
// Offline: Tenants shouldn't try to attach here, but they may assume that their
// secondary locations on this node still exist. Newly added nodes are in this
// state until we successfully contact them.
Offline,
}
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
/// type needs to be defined with diesel traits in there.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeSchedulingPolicy {
Active,
Filling,
Pause,
Draining,
}
impl FromStr for NodeSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"filling" => Ok(Self::Filling),
"pause" => Ok(Self::Pause),
"draining" => Ok(Self::Draining),
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
}
}
}
impl From<NodeSchedulingPolicy> for String {
fn from(value: NodeSchedulingPolicy) -> String {
use NodeSchedulingPolicy::*;
match value {
Active => "active",
Filling => "filling",
Pause => "pause",
Draining => "draining",
}
.to_string()
}
}
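A hypothetical round-trip check for the FromStr/From<String> pair above, assuming these types and the anyhow crate are in scope:
use std::str::FromStr;

fn main() -> anyhow::Result<()> {
    // String -> enum -> String should round-trip for every variant.
    let policy = NodeSchedulingPolicy::from_str("draining")?;
    assert_eq!(String::from(policy), "draining");
    // Unknown inputs are rejected rather than defaulted.
    assert!(NodeAvailability::from_str("bogus").is_err());
    Ok(())
}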
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
impl AttachmentService {
pub fn from_env(env: &LocalEnv) -> Self {
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
@@ -88,11 +204,12 @@ impl AttachmentService {
.pageservers
.first()
.expect("Config is validated to contain at least one pageserver");
let (private_key, public_key) = match ps_conf.http_auth_type {
let (jwt_token, public_key) = match ps_conf.http_auth_type {
AuthType::Trust => (None, None),
AuthType::NeonJWT => {
let private_key_path = env.get_private_key_path();
let private_key = fs::read(private_key_path).expect("failed to read private key");
let jwt_token = env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap();
// If pageserver auth is enabled, this implicitly enables auth for this service,
// using the same credentials.
@@ -118,7 +235,7 @@ impl AttachmentService {
} else {
std::fs::read_to_string(&public_key_path).expect("Can't read public key")
};
(Some(private_key), Some(public_key))
(Some(jwt_token), Some(public_key))
}
};
@@ -126,7 +243,7 @@ impl AttachmentService {
env: env.clone(),
path,
listen,
private_key,
jwt_token,
public_key,
postgres_port,
client: reqwest::ClientBuilder::new()
@@ -200,7 +317,7 @@ impl AttachmentService {
"localhost",
"-p",
&format!("{}", self.postgres_port),
DB_NAME,
&DB_NAME,
])
.output()
.await
@@ -280,10 +397,7 @@ impl AttachmentService {
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
if let Some(private_key) = &self.private_key {
let claims = Claims::new(None, Scope::PageServerApi);
let jwt_token =
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
if let Some(jwt_token) = &self.jwt_token {
args.push(format!("--jwt-token={jwt_token}"));
}
@@ -308,7 +422,7 @@ impl AttachmentService {
)],
background_process::InitialPidFile::Create(self.pid_file()),
|| async {
match self.ready().await {
match self.status().await {
Ok(_) => Ok(true),
Err(_) => Ok(false),
}
@@ -354,20 +468,6 @@ impl AttachmentService {
Ok(())
}
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
let category = match path.find('/') {
Some(idx) => &path[..idx],
None => path,
};
match category {
"status" | "ready" => Ok(None),
"control" | "debug" => Ok(Some(Claims::new(None, Scope::Admin))),
"v1" => Ok(Some(Claims::new(None, Scope::PageServerApi))),
_ => Err(anyhow::anyhow!("Failed to determine claims for {}", path)),
}
}
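A standalone model of the path-to-claims routing shown above; Scope here is an illustrative stand-in for the real claims type:
#[derive(Debug, PartialEq)]
enum Scope {
    Admin,
    PageServerApi,
}

fn claims_for_path(path: &str) -> Result<Option<Scope>, String> {
    // Route on the first path segment, mirroring the function above.
    let category = match path.find('/') {
        Some(idx) => &path[..idx],
        None => path,
    };
    match category {
        "status" | "ready" => Ok(None), // unauthenticated health endpoints
        "control" | "debug" => Ok(Some(Scope::Admin)),
        "v1" => Ok(Some(Scope::PageServerApi)),
        _ => Err(format!("Failed to determine claims for {path}")),
    }
}

fn main() {
    assert_eq!(claims_for_path("status"), Ok(None));
    assert_eq!(claims_for_path("v1/tenant"), Ok(Some(Scope::PageServerApi)));
    assert!(claims_for_path("unknown").is_err());
}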
/// Simple HTTP request wrapper for calling into attachment service
async fn dispatch<RQ, RS>(
&self,
@@ -393,16 +493,11 @@ impl AttachmentService {
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(private_key) = &self.private_key {
println!("Getting claims for path {}", path);
if let Some(required_claims) = Self::get_claims_for_path(&path)? {
println!("Got claims {:?} for path {}", required_claims, path);
let jwt_token = encode_from_key_file(&required_claims, private_key)?;
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await?;
@@ -522,8 +617,8 @@ impl AttachmentService {
}
#[instrument(skip(self))]
pub async fn ready(&self) -> anyhow::Result<()> {
self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None)
pub async fn status(&self) -> anyhow::Result<()> {
self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
.await
}

View File

@@ -8,15 +8,14 @@
use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
use compute_api::spec::ComputeMode;
use control_plane::attachment_service::AttachmentService;
use control_plane::attachment_service::{
AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
};
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::{InitForceMode, LocalEnv};
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::{broker, local_env};
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
};
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};

View File

@@ -605,7 +605,7 @@ impl Endpoint {
let conn_str = self.connstr("cloud_admin", "postgres");
println!("Starting postgres node at '{}'", conn_str);
if create_test_user {
let conn_str = self.connstr("test", "neondb");
let conn_str = self.connstr("user", "neondb");
println!("Also at '{}'", conn_str);
}
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));

View File

@@ -412,17 +412,14 @@ impl LocalEnv {
// this function is used only for testing purposes in the CLI, e.g. to generate tokens during init
pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
let private_key_path = self.get_private_key_path();
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
}
pub fn get_private_key_path(&self) -> PathBuf {
if self.private_key_path.is_absolute() {
let private_key_path = if self.private_key_path.is_absolute() {
self.private_key_path.to_path_buf()
} else {
self.base_data_dir.join(&self.private_key_path)
}
};
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
}
//

View File

@@ -17,7 +17,6 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::controller_api::NodeRegisterRequest;
use pageserver_api::models::{
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
};
@@ -31,7 +30,7 @@ use utils::{
lsn::Lsn,
};
use crate::attachment_service::AttachmentService;
use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv};
@@ -116,7 +115,7 @@ impl PageServerNode {
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
let jwt_token = self
.env
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap();
overrides.push(format!("control_plane_api_token='{}'", jwt_token));
}
@@ -353,11 +352,6 @@ impl PageServerNode {
.remove("compaction_threshold")
.map(|x| x.parse::<usize>())
.transpose()?,
compaction_algorithm: settings
.remove("compaction_algorithm")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
@@ -461,11 +455,6 @@ impl PageServerNode {
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'compaction_threshold' as an integer")?,
compaction_algorithm: settings
.remove("compactin_algorithm")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())

View File

@@ -70,9 +70,6 @@ Should only be used e.g. for status check/tenant creation/list.
Should only be used e.g. for status check.
Currently also used for connection from any pageserver to any safekeeper.
"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
"admin": Provides access to the control plane and admin APIs of the attachment service.
### CLI
CLI generates a key pair during call to `neon_local init` with the following commands:

View File

@@ -1,129 +0,0 @@
use std::str::FromStr;
/// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server
/// in [`attachment_service::http`]
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
use crate::{models::ShardParameters, shard::TenantShardId};
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub generation: u32,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponse {
pub shards: Vec<TenantCreateResponseShard>,
}
#[derive(Serialize, Deserialize)]
pub struct NodeRegisterRequest {
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct NodeConfigureRequest {
pub node_id: NodeId,
pub availability: Option<NodeAvailability>,
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantLocateResponse {
pub shards: Vec<TenantLocateResponseShard>,
pub shard_params: ShardParameters,
}
/// Explicitly migrating a particular shard is a low level operation
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeAvailability {
// Normal, happy state
Active,
// Offline: Tenants shouldn't try to attach here, but they may assume that their
// secondary locations on this node still exist. Newly added nodes are in this
// state until we successfully contact them.
Offline,
}
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
/// type needs to be defined with diesel traits in there.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeSchedulingPolicy {
Active,
Filling,
Pause,
Draining,
}
impl FromStr for NodeSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"filling" => Ok(Self::Filling),
"pause" => Ok(Self::Pause),
"draining" => Ok(Self::Draining),
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
}
}
}
impl From<NodeSchedulingPolicy> for String {
fn from(value: NodeSchedulingPolicy) -> String {
use NodeSchedulingPolicy::*;
match value {
Active => "active",
Filling => "filling",
Pause => "pause",
Draining => "draining",
}
.to_string()
}
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
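These are plain serde types, so an API body is just their JSON encoding. A minimal sketch, assuming `NodeId` is the `u64` newtype from `utils::id` (the field values are invented for illustration):
```
// Illustrative values only; nothing here comes from this repository.
let req = NodeRegisterRequest {
    node_id: NodeId(1),
    listen_pg_addr: "127.0.0.1".to_string(),
    listen_pg_port: 5432,
    listen_http_addr: "127.0.0.1".to_string(),
    listen_http_port: 9898,
};
println!("{}", serde_json::to_string_pretty(&req).unwrap());
```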

View File

@@ -307,7 +307,6 @@ impl KeySpaceRandomAccum {
}
}
#[inline(always)]
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
let start = key_range.start;
let end = key_range.end;

View File

@@ -2,14 +2,13 @@
#![deny(clippy::undocumented_unsafe_blocks)]
use const_format::formatcp;
pub mod controller_api;
/// Public API types
pub mod control_api;
pub mod key;
pub mod keyspace;
pub mod models;
pub mod reltag;
pub mod shard;
/// Public API types
pub mod upcall_api;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
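`formatcp!` assembles the string at compile time, so the constant is an ordinary `&'static str`:
```
assert_eq!(DEFAULT_PG_LISTEN_ADDR, "127.0.0.1:64000");
```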

View File

@@ -14,6 +14,7 @@ use byteorder::{BigEndian, ReadBytesExt};
use postgres_ffi::BLCKSZ;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use strum_macros;
use utils::{
completion,
history_buffer::HistoryBufferWithDropCounter,
@@ -271,8 +272,6 @@ pub struct TenantConfig {
pub compaction_target_size: Option<u64>,
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
// defer parsing compaction_algorithm, like eviction_policy
pub compaction_algorithm: Option<CompactionAlgorithm>,
pub gc_horizon: Option<u64>,
pub gc_period: Option<String>,
pub image_creation_threshold: Option<usize>,
@@ -307,13 +306,6 @@ impl EvictionPolicy {
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum CompactionAlgorithm {
Legacy,
Tiered,
}
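With `#[serde(tag = "kind")]`, the variant name travels under a `"kind"` field, which is the JSON shape the `serde_json::from_str` call in the CLI settings parsing above expects. A round-trip sketch:
```
let alg: CompactionAlgorithm = serde_json::from_str(r#"{"kind":"Tiered"}"#)?;
assert_eq!(alg, CompactionAlgorithm::Tiered);
assert_eq!(serde_json::to_string(&alg)?, r#"{"kind":"Tiered"}"#);
```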
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
#[serde(with = "humantime_serde")]
@@ -1076,6 +1068,7 @@ impl PagestreamBeMessage {
#[cfg(test)]
mod tests {
use bytes::Buf;
use serde_json::json;
use super::*;

View File

@@ -6,6 +6,7 @@ use crate::{
};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use thiserror;
use utils::id::TenantId;
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
@@ -655,7 +656,10 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
#[cfg(test)]
mod tests {
use utils::Hex;
use std::str::FromStr;
use bincode;
use utils::{id::TenantId, Hex};
use super::*;

View File

@@ -623,7 +623,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
mod fs_tests {
use super::*;
use bytes::Bytes;
use camino_tempfile::tempdir;
use futures_util::Stream;
use std::{collections::HashMap, io::Write};
async fn read_and_check_metadata(

View File

@@ -1040,7 +1040,7 @@ mod tests {
Some("test/prefix/"),
Some("/test/prefix/"),
];
let expected_outputs = [
let expected_outputs = vec![
vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path"],
vec![

View File

@@ -1,6 +1,7 @@
// For details about authentication see docs/authentication.md
use arc_swap::ArcSwap;
use serde;
use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
use anyhow::Result;
@@ -31,8 +32,6 @@ pub enum Scope {
// The scope used by pageservers in upcalls to storage controller and cloud control plane
#[serde(rename = "generations_api")]
GenerationsApi,
// Allows access to control plane management API and some storage controller endpoints.
Admin,
}
/// JWT payload. See docs/authentication.md for the format
@@ -205,11 +204,12 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
// "scope": "tenant",
// "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
// "iss": "neon.controlplane",
// "exp": 1709200879,
// "iat": 1678442479
// }
// ```
//
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw";
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
// Check it can be validated with the public key
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);

View File

@@ -4,9 +4,7 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
///
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion {
_token: TaskTrackerToken,
}
pub struct Completion(TaskTrackerToken);
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
@@ -51,5 +49,5 @@ pub fn channel() -> (Completion, Barrier) {
tracker.close();
let token = tracker.token();
(Completion { _token: token }, Barrier(tracker))
(Completion(token), Barrier(tracker))
}
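A minimal usage sketch, assuming the `channel()` constructor above and a `Barrier::wait()` method (not shown in this hunk):
```
// The barrier resolves only after every clone of the completion is dropped.
let (completion, barrier) = channel();
let task = tokio::spawn(async move {
    let _guard = completion; // keep the token alive while working
    // ... do work ...
});
barrier.wait().await; // unblocks once the spawned task drops its guard
task.await.unwrap();
```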

View File

@@ -45,7 +45,7 @@ impl Generation {
Self::Broken
}
pub const fn new(v: u32) -> Self {
pub fn new(v: u32) -> Self {
Self::Valid(v)
}

View File

@@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tracing::{debug, info, info_span, warn, Instrument};
use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future;
use std::str::FromStr;
@@ -156,10 +156,6 @@ pub struct ChannelWriter {
buffer: BytesMut,
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
/// Time spent waiting for the channel to make progress. It is not the same as time to upload a
/// buffer because we cannot know anything about that, but this should allow us to understand
/// the actual time taken without the time spent `std::thread::park`ed.
wait_time: std::time::Duration,
}
impl ChannelWriter {
@@ -172,7 +168,6 @@ impl ChannelWriter {
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
wait_time: std::time::Duration::ZERO,
}
}
@@ -185,8 +180,6 @@ impl ChannelWriter {
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
let wait_started_at = std::time::Instant::now();
// not ideal to call block_on from blocking code, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
@@ -199,9 +192,6 @@ impl ChannelWriter {
// sending it to the client.
Ok(())
});
self.wait_time += wait_started_at.elapsed();
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
@@ -212,10 +202,6 @@ impl ChannelWriter {
pub fn flushed_bytes(&self) -> usize {
self.written
}
pub fn wait_time(&self) -> std::time::Duration {
self.wait_time
}
}
impl std::io::Write for ChannelWriter {
@@ -266,52 +252,22 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
let span = info_span!("blocking");
tokio::task::spawn_blocking(move || {
// there are situations where we lose scraped metrics under load, try to gather some clues
// since all nodes are queried for this, keep the message count low.
let spawned_at = std::time::Instant::now();
let _span = span.entered();
let metrics = metrics::gather();
let gathered_at = std::time::Instant::now();
let res = encoder
.encode(&metrics, &mut writer)
.and_then(|_| writer.flush().map_err(|e| e.into()));
// this instant is not when we finally got the full response sent, sending is done by hyper
// in another task.
let encoded_at = std::time::Instant::now();
let spawned_in = spawned_at - started_at;
let collected_in = gathered_at - spawned_at;
// remove the wait time here in case the tcp connection was clogged
let encoded_in = encoded_at - gathered_at - writer.wait_time();
let total = encoded_at - started_at;
match res {
Ok(()) => {
tracing::info!(
bytes = writer.flushed_bytes(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
elapsed_ms = started_at.elapsed().as_millis(),
"responded /metrics"
);
}
Err(e) => {
// there is a chance that this error is not the BrokenPipe we generate in the writer
// for "closed connection", but it is highly unlikely.
tracing::warn!(
after_bytes = writer.flushed_bytes(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
"failed to write out /metrics response: {e:?}"
);
tracing::warn!("failed to write out /metrics response: {e:#}");
// semantics of this error are quite... unclear. we want to error the stream out to
// abort the response to somehow notify the client that we failed.
//

View File

@@ -415,6 +415,7 @@ mod tests {
use super::*;
use serde::ser::Serialize;
use serde_assert::{Deserializer, Serializer, Token, Tokens};
#[test]

View File

@@ -1,6 +1,6 @@
#![warn(missing_docs)]
use std::cmp::{Eq, Ordering};
use std::cmp::{Eq, Ordering, PartialOrd};
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::mem;
@@ -249,6 +249,7 @@ where
mod tests {
use super::*;
use std::sync::Arc;
use std::time::Duration;
impl MonotonicCounter<i32> for i32 {
fn cnt_advance(&mut self, val: i32) {

View File

@@ -221,7 +221,7 @@ impl RcuWaitList {
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Mutex;
use std::sync::{Arc, Mutex};
use std::time::Duration;
#[tokio::test]

View File

@@ -239,6 +239,7 @@ mod tests {
use std::{
convert::Infallible,
pin::{pin, Pin},
sync::atomic::{AtomicUsize, Ordering},
time::Duration,
};

View File

@@ -73,7 +73,6 @@ url.workspace = true
walkdir.workspace = true
metrics.workspace = true
pageserver_api.workspace = true
pageserver_compaction.workspace = true
postgres_connection.workspace = true
postgres_ffi.workspace = true
pq_proto.workspace = true

View File

@@ -1,54 +0,0 @@
[package]
name = "pageserver_compaction"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
[dependencies]
anyhow.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true
bytes.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
const_format.workspace = true
consumption_metrics.workspace = true
crossbeam-utils.workspace = true
either.workspace = true
flate2.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
itertools.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pin-project-lite.workspace = true
rand.workspace = true
smallvec = { workspace = true, features = ["write"] }
svg_fmt.workspace = true
sync_wrapper.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-io-timeout.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true
utils.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
criterion.workspace = true
hex-literal.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

View File

@@ -1,51 +0,0 @@
# TODO
- If the key space can be perfectly partitioned at some key, perform planning on each
partition separately. For example, if we are compacting a level with layers like this:
```
:
+--+ +----+ : +------+
| | | | : | |
+--+ +----+ : +------+
:
+-----+ +-+ : +--------+
| | | | : | |
+-----+ +-+ : +--------+
:
```
At the dotted line, there is a natural split in the key space, such that all
layers are either on the left or the right of it. We can compact the
partitions separately. We could choose to create image layers for one
partition but not the other one, for example.
- Not all the layers have to be exactly the same size; we can choose to cut a
layer short or stretch it a little larger than the target size, if it helps
the overall system. We can help perfect partitions (see previous bullet point)
happen more frequently, by choosing the cut points wisely. For example, try
to cut layers at boundaries of underlying image layers. And "snap to grid",
i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0 (see the
sketch after this list).
- Avoid rewriting layers when we'd just create an identical layer to an input
layer.
- Parallelism. The code is already split up into planning and execution, so that
we first split up the compaction work into "Jobs", and then execute them.
It would be straightforward to execute multiple jobs in parallel.
- Materialize extra pages in delta layers during compaction. This would reduce
read amplification. There has been the idea of partial image layers. Materializing
extra pages in the delta layers achieves the same goal, without introducing a new
concept.
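A minimal sketch of the "snap to grid" idea above; the helper name and grid size are invented for illustration:
```
// Hypothetical helper: snap a candidate cut key down to a grid boundary so
// that layer edges line up across compaction runs. GRID is an assumed
// tuning constant, not a value from this repository.
const GRID: u64 = 10_000;

fn snap_cut_key(candidate: u64, range_start: u64) -> u64 {
    let snapped = candidate - candidate % GRID;
    // Never snap below the start of the range being cut.
    snapped.max(range_start)
}
```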
## Simulator
- Expand the simulator for more workloads
- Automate a test suite that runs the simulator with different workloads and
spits out a table of results
- Model read amplification
- More sanity checking. One idea is to keep a reference count of each
MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if
a MockRecord that is newer than PITR horizon is completely dropped. That would
indicate that the record was lost.

View File

@@ -1,214 +0,0 @@
use clap::{Parser, Subcommand};
use pageserver_compaction::simulator::MockTimeline;
use rand::Rng;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use utils::project_git_version;
project_git_version!(GIT_VERSION);
#[derive(Parser)]
#[command(
version = GIT_VERSION,
about = "Neon Pageserver compaction simulator",
long_about = "A developer tool to visualize and test compaction"
)]
#[command(propagate_version = true)]
struct CliOpts {
#[command(subcommand)]
command: Commands,
}
#[derive(Subcommand)]
enum Commands {
RunSuite,
Simulate(SimulateCmd),
}
#[derive(Clone, clap::ValueEnum)]
enum Distribution {
Uniform,
HotCold,
}
/// Read and update pageserver metadata file
#[derive(Parser)]
struct SimulateCmd {
distribution: Distribution,
/// Number of records to digest
num_records: u64,
/// Record length
record_len: u64,
/// Logical database size in MB
logical_size: u64,
}
async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> {
let mut executor = MockTimeline::new();
// Convert the logical size in MB into a key range.
let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
//let key_range = u64::MIN..u64::MAX;
println!(
"starting simulation with key range {:016X}-{:016X}",
key_range.start, key_range.end
);
// helper function to print progress indicator
let print_progress = |i| -> anyhow::Result<()> {
if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 {
print!(
"\ringested {} / {} records, {} MiB / {} MiB...",
i + 1,
cmd.num_records,
(i + 1) * cmd.record_len / (1_000_000),
cmd.num_records * cmd.record_len / (1_000_000),
);
std::io::stdout().flush()?;
}
Ok(())
};
match cmd.distribution {
Distribution::Uniform => {
for i in 0..cmd.num_records {
executor.ingest_uniform(1, cmd.record_len, &key_range)?;
executor.compact_if_needed().await?;
print_progress(i)?;
}
}
Distribution::HotCold => {
let splitpoint = key_range.start + (key_range.end - key_range.start) / 10;
let hot_key_range = 0..splitpoint;
let cold_key_range = splitpoint..key_range.end;
for i in 0..cmd.num_records {
let chosen_range = if rand::thread_rng().gen_bool(0.9) {
&hot_key_range
} else {
&cold_key_range
};
executor.ingest_uniform(1, cmd.record_len, chosen_range)?;
executor.compact_if_needed().await?;
print_progress(i)?;
}
}
}
println!("done!");
executor.flush_l0();
executor.compact_if_needed().await?;
let stats = executor.stats()?;
// Print the stats to stdout, and also to a file
print!("{stats}");
std::fs::write(results_path.join("stats.txt"), stats)?;
let animation_path = results_path.join("compaction-animation.html");
executor.draw_history(std::fs::File::create(&animation_path)?)?;
println!(
"animation: file://{}",
animation_path.canonicalize()?.display()
);
Ok(())
}
async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> {
std::fs::create_dir(results_path)?;
set_log_file(File::create(results_path.join("log"))?);
let result = simulate(workload, results_path).await;
set_log_stdout();
result
}
async fn run_suite() -> anyhow::Result<()> {
let top_results_path = PathBuf::from(format!(
"compaction-suite-results.{}",
std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs()
));
std::fs::create_dir(&top_results_path)?;
let workload = SimulateCmd {
distribution: Distribution::Uniform,
// Generate 20 GB of WAL
record_len: 1_000,
num_records: 20_000_000,
// Logical size 5 GB
logical_size: 5_000,
};
run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?;
println!(
"All tests finished. Results in {}",
top_results_path.display()
);
Ok(())
}
use std::fs::File;
use std::io::Stdout;
use std::sync::Mutex;
use tracing_subscriber::fmt::writer::EitherWriter;
use tracing_subscriber::fmt::MakeWriter;
static LOG_FILE: OnceLock<Mutex<EitherWriter<File, Stdout>>> = OnceLock::new();
fn get_log_output() -> &'static Mutex<EitherWriter<File, Stdout>> {
LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout())))
}
fn set_log_file(f: File) {
*get_log_output().lock().unwrap() = EitherWriter::A(f);
}
fn set_log_stdout() {
*get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout());
}
fn init_logging() -> anyhow::Result<()> {
// We fall back to printing all spans at info-level or above if
// the RUST_LOG environment variable is not set.
let rust_log_env_filter = || {
tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
};
// NB: the order of the with() calls does not matter.
// See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
use tracing_subscriber::prelude::*;
tracing_subscriber::registry()
.with({
let log_layer = tracing_subscriber::fmt::layer()
.with_target(false)
.with_ansi(false)
.with_writer(|| get_log_output().make_writer());
log_layer.with_filter(rust_log_env_filter())
})
.init();
Ok(())
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = CliOpts::parse();
init_logging()?;
match cli.command {
Commands::Simulate(cmd) => {
simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?;
}
Commands::RunSuite => {
run_suite().await?;
}
};
Ok(())
}
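For reference, clap's derive lowercases subcommands and value enums, so a run matching the `run_suite()` workload above would be invoked roughly as `<binary> simulate uniform 20000000 1000 5000`, with positionals in the order distribution, num_records, record_len, logical_size (the binary name is not shown in this diff).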

View File

@@ -1,866 +0,0 @@
//! # Tiered compaction algorithm.
//!
//! Read all the input delta files, and write a new set of delta files that
//! include all the input WAL records. See retile_deltas().
//!
//! In a "normal" LSM tree, you get to remove any values that are overwritten by
//! later values, but in our system, we keep all the history. So the reshuffling
//! doesn't remove any garbage, it just reshuffles the records to reduce read
//! amplification, i.e. the number of files that you need to access to find the
//! WAL records for a given key.
//!
//! If the new delta files would be very "narrow", i.e. each file would cover
//! only a narrow key range, then we create a new set of image files
//! instead. The current threshold is that if the estimated total size of the
//! image layers is smaller than the size of the deltas, then we create image
//! layers. That amounts to 2x storage amplification, and it means that the
//! distance of image layers in LSN dimension is roughly equal to the logical
//! database size. For example, if the logical database size is 10 GB, we would
//! generate new image layers every 10 GB of WAL.
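A sketch of the decision rule described above; the function name and signature are invented here, and the real check lives in `divide_job()` further down:
```
// Choose images when materializing the whole keyspace would be cheaper
// than carrying the accumulated deltas (roughly 2x storage amplification).
fn prefer_images(keyspace_size_bytes: u64, delta_wal_bytes: u64) -> bool {
    keyspace_size_bytes < delta_wal_bytes
}
```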
use futures::StreamExt;
use tracing::{debug, info};
use std::collections::{HashSet, VecDeque};
use std::ops::Range;
use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
use crate::interface::*;
use utils::lsn::Lsn;
use crate::identify_levels::identify_level;
/// Main entry point to compaction.
///
/// The starting point is a cutoff LSN (`end_lsn`). Compaction runs on
/// everything below that point that needs compaction. The cutoff LSN must
/// partition the layers so that no layer spans across it. To start compaction
/// at the top of the tree, pass the end LSN of the last written L0 layer.
pub async fn compact_tiered<E: CompactionJobExecutor>(
executor: &mut E,
end_lsn: Lsn,
target_file_size: u64,
fanout: u64,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
assert!(fanout >= 2);
// Start at L0
let mut current_level_no = 0;
let mut current_level_target_height = target_file_size;
loop {
// end LSN +1 to include possible image layers exactly at 'end_lsn'.
let all_layers = executor
.get_layers(
&(E::Key::MIN..E::Key::MAX),
&(Lsn(u64::MIN)..end_lsn + 1),
ctx,
)
.await?;
info!(
"Compacting L{}, total # of layers: {}",
current_level_no,
all_layers.len()
);
// Identify the range of LSNs that belong to this level. We assume that
// each file in this level span an LSN range up to 1.75x target file
// size. That should give us enough slop that if we created a slightly
// oversized L0 layer, e.g. because flushing the in-memory layer was
// delayed for some reason, we don't consider the oversized layer to
// belong to L1. But not too much slop, that we don't accidentally
// "skip" levels.
let max_height = (current_level_target_height as f64 * 1.75) as u64;
let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else {
break;
};
// Calculate the height of this level. If the # of tiers exceeds the
// fanout parameter, it's time to compact it.
let depth = level.depth();
info!(
"Level {} identified as LSN range {}-{}: depth {}",
current_level_no, level.lsn_range.start, level.lsn_range.end, depth
);
for l in &level.layers {
debug!("LEVEL {} layer: {}", current_level_no, l.short_id());
}
if depth < fanout {
debug!(
level = current_level_no,
depth = depth,
fanout,
"too few deltas to compact"
);
break;
}
compact_level(
&level.lsn_range,
&level.layers,
executor,
target_file_size,
ctx,
)
.await?;
if target_file_size == u64::MAX {
break;
}
current_level_no += 1;
current_level_target_height = current_level_target_height.saturating_mul(fanout);
}
Ok(())
}
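For intuition: the per-level target height is multiplied by `fanout` on every iteration of the loop above. A small illustration (helper name invented):
```
// level_heights(1 << 20, 4, 3) == [1048576, 4194304, 16777216]
fn level_heights(target_file_size: u64, fanout: u64, levels: usize) -> Vec<u64> {
    let mut h = target_file_size;
    (0..levels)
        .map(|_| {
            let cur = h;
            h = h.saturating_mul(fanout);
            cur
        })
        .collect()
}
```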
async fn compact_level<E: CompactionJobExecutor>(
lsn_range: &Range<Lsn>,
layers: &[E::Layer],
executor: &mut E,
target_file_size: u64,
ctx: &E::RequestContext,
) -> anyhow::Result<bool> {
let mut layer_fragments = Vec::new();
for l in layers {
layer_fragments.push(LayerFragment::new(l.clone()));
}
let mut state = LevelCompactionState {
target_file_size,
_lsn_range: lsn_range.clone(),
layers: layer_fragments,
jobs: Vec::new(),
job_queue: Vec::new(),
next_level: false,
executor,
};
let first_job = CompactionJob {
key_range: E::Key::MIN..E::Key::MAX,
lsn_range: lsn_range.clone(),
strategy: CompactionStrategy::Divide,
input_layers: state
.layers
.iter()
.enumerate()
.map(|i| LayerId(i.0))
.collect(),
completed: false,
};
state.jobs.push(first_job);
state.job_queue.push(JobId(0));
state.execute(ctx).await?;
info!(
"compaction completed! Need to process next level: {}",
state.next_level
);
Ok(state.next_level)
}
/// Blackboard that keeps track of the state of all the jobs and work remaining
struct LevelCompactionState<'a, E>
where
E: CompactionJobExecutor,
{
// parameters
target_file_size: u64,
_lsn_range: Range<Lsn>,
layers: Vec<LayerFragment<E>>,
// job queue
jobs: Vec<CompactionJob<E>>,
job_queue: Vec<JobId>,
/// If false, no need to compact levels below this
next_level: bool,
/// Interface to the outside world
executor: &'a mut E,
}
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
struct LayerId(usize);
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
struct JobId(usize);
struct PendingJobSet {
pending: HashSet<JobId>,
completed: HashSet<JobId>,
}
impl PendingJobSet {
fn new() -> Self {
PendingJobSet {
pending: HashSet::new(),
completed: HashSet::new(),
}
}
fn complete_job(&mut self, job_id: JobId) {
self.pending.remove(&job_id);
self.completed.insert(job_id);
}
fn all_completed(&self) -> bool {
self.pending.is_empty()
}
}
// When we decide to rewrite a set of layers, LayerFragment is used to keep
// track which new layers supersede an old layer. When all the stakeholder jobs
// have completed, this layer can be deleted.
struct LayerFragment<E>
where
E: CompactionJobExecutor,
{
layer: E::Layer,
// If we will write new layers to replace this one, this keeps track of the
// jobs that need to complete before this layer can be deleted. As the jobs
// complete, they are moved from 'pending' to 'completed' set. Once the
// 'pending' set becomes empty, the layer can be deleted.
//
// If None, this layer is not rewritten and must not be deleted.
deletable_after: Option<PendingJobSet>,
deleted: bool,
}
impl<E> LayerFragment<E>
where
E: CompactionJobExecutor,
{
fn new(layer: E::Layer) -> Self {
LayerFragment {
layer,
deletable_after: None,
deleted: false,
}
}
}
#[derive(PartialEq)]
enum CompactionStrategy {
Divide,
CreateDelta,
CreateImage,
}
#[allow(dead_code)] // Todo
struct CompactionJob<E: CompactionJobExecutor> {
key_range: Range<E::Key>,
lsn_range: Range<Lsn>,
strategy: CompactionStrategy,
input_layers: Vec<LayerId>,
completed: bool,
}
impl<'a, E> LevelCompactionState<'a, E>
where
E: CompactionJobExecutor,
{
/// Main loop of the executor.
///
/// In each iteration, we take the next job from the queue, and execute it.
/// The execution might add new jobs to the queue. Keep going until the
/// queue is empty.
///
/// Initially, the job queue consists of one Divide job over the whole
/// level. On first call, it is divided into smaller jobs.
async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> {
// TODO: this would be pretty straightforward to parallelize with FuturesUnordered
while let Some(next_job_id) = self.job_queue.pop() {
info!("executing job {}", next_job_id.0);
self.execute_job(next_job_id, ctx).await?;
}
// all done!
Ok(())
}
async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
match job.strategy {
CompactionStrategy::Divide => {
self.divide_job(job_id, ctx).await?;
Ok(())
}
CompactionStrategy::CreateDelta => {
let mut deltas: Vec<E::DeltaLayer> = Vec::new();
let mut layer_ids: Vec<LayerId> = Vec::new();
for layer_id in &job.input_layers {
let layer = &self.layers[layer_id.0].layer;
if let Some(dl) = self.executor.downcast_delta_layer(layer).await? {
deltas.push(dl.clone());
layer_ids.push(*layer_id);
}
}
self.executor
.create_delta(&job.lsn_range, &job.key_range, &deltas, ctx)
.await?;
self.jobs[job_id.0].completed = true;
// did we complete any fragments?
for layer_id in layer_ids {
let l = &mut self.layers[layer_id.0];
if let Some(deletable_after) = l.deletable_after.as_mut() {
deletable_after.complete_job(job_id);
if deletable_after.all_completed() {
self.executor.delete_layer(&l.layer, ctx).await?;
l.deleted = true;
}
}
}
self.next_level = true;
Ok(())
}
CompactionStrategy::CreateImage => {
self.executor
.create_image(job.lsn_range.end, &job.key_range, ctx)
.await?;
self.jobs[job_id.0].completed = true;
// TODO: we could check if any layers < PITR horizon became deletable
Ok(())
}
}
}
fn push_job(&mut self, job: CompactionJob<E>) -> JobId {
let job_id = JobId(self.jobs.len());
self.jobs.push(job);
self.job_queue.push(job_id);
job_id
}
/// Take a partition of the key space, and decide how to compact it.
///
/// TODO: Currently, this is called exactly once for the level, and we
/// decide whether to create new image layers to cover the whole level, or
/// write a new set of delta. In the future, this should try to partition
/// the key space, and make the decision separately for each partition.
async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// Check for dummy cases
if job.input_layers.is_empty() {
return Ok(());
}
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// Would it be better to create images for this partition?
// Decide based on the average density of the level
let keyspace_size = keyspace_total_size(
&self
.executor
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
.await?,
) * 8192;
let wal_size = job
.input_layers
.iter()
.filter(|layer_id| self.layers[layer_id.0].layer.is_delta())
.map(|layer_id| self.layers[layer_id.0].layer.file_size())
.sum::<u64>();
if keyspace_size < wal_size {
// seems worth it
info!(
"covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}",
keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size
);
self.cover_with_images(job_id, ctx).await
} else {
// do deltas
info!(
"coverage not worth it, keyspace_size {}, wal_size {}",
keyspace_size, wal_size
);
self.retile_deltas(job_id, ctx).await
}
}
// LSN
// ^
// |
// | ###|###|#####
// | +--+-----+--+ +--+-----+--+
// | | | | | | | | |
// | +--+--+--+--+ +--+--+--+--+
// | | | | | | |
// | +---+-+-+---+ ==> +---+-+-+---+
// | | | | | | | | |
// | +---+-+-++--+ +---+-+-++--+
// | | | | | | | | |
// | +-----+--+--+ +-----+--+--+
// |
// +--------------> key
//
async fn cover_with_images(
&mut self,
job_id: JobId,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// XXX: do we still need the "holes" stuff?
let mut new_jobs = Vec::new();
// Slide a window through the keyspace
let keyspace = self
.executor
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
.await?;
let mut window = KeyspaceWindow::new(
E::Key::MIN..E::Key::MAX,
keyspace,
self.target_file_size / 8192,
);
while let Some(key_range) = window.choose_next_image() {
new_jobs.push(CompactionJob::<E> {
key_range,
lsn_range: job.lsn_range.clone(),
strategy: CompactionStrategy::CreateImage,
input_layers: Vec::new(), // XXX: Is it OK for this to be empty for image layer?
completed: false,
});
}
for j in new_jobs.into_iter().rev() {
let _job_id = self.push_job(j);
// TODO: image layers don't let us delete anything. unless < PITR horizon
//let j = &self.jobs[job_id.0];
// for layer_id in j.input_layers.iter() {
// self.layers[layer_id.0].pending_stakeholders.insert(job_id);
//}
}
Ok(())
}
// Merge the contents of all the input delta layers into a new set
// of delta layers, based on the current partitioning.
//
// We split the new delta layers on the key dimension. We iterate through
// the key space, and for each key, check if including the next key to the
// current output layer we're building would cause the layer to become too
// large. If so, dump the current output layer and start new one. It's
// possible that there is a single key with so many page versions that
// storing all of them in a single layer file would be too large. In that
// case, we also split on the LSN dimension.
//
// LSN
// ^
// |
// | +-----------+ +--+--+--+--+
// | | | | | | | |
// | +-----------+ | | | | |
// | | | | | | | |
// | +-----------+ ==> | | | | |
// | | | | | | | |
// | +-----------+ | | | | |
// | | | | | | | |
// | +-----------+ +--+--+--+--+
// |
// +--------------> key
//
//
// If one key (X) has a lot of page versions:
//
// LSN
// ^
// | (X)
// | +-----------+ +--+--+--+--+
// | | | | | | | |
// | +-----------+ | | +--+ |
// | | | | | | | |
// | +-----------+ ==> | | | | |
// | | | | | +--+ |
// | +-----------+ | | | | |
// | | | | | | | |
// | +-----------+ +--+--+--+--+
// |
// +--------------> key
//
// TODO: this actually divides the layers into fixed-size chunks, not
// based on the partitioning.
//
// TODO: we should also opportunistically materialize and
// garbage collect what we can.
async fn retile_deltas(
&mut self,
job_id: JobId,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// Sweep the key space left to right, running an estimate of how much
// disk size and keyspace we have accumulated
//
// Once the disk size reaches the target threshold, stop and think.
// If we have accumulated only a narrow band of keyspace, create an
// image layer. Otherwise write a delta layer.
// FIXME: deal with the case of lots of values for same key
// FIXME: we are ignoring images here. Did we already divide the work
// so that we won't encounter them here?
let mut deltas: Vec<E::DeltaLayer> = Vec::new();
for layer_id in &job.input_layers {
let l = &self.layers[layer_id.0];
if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? {
deltas.push(dl.clone());
}
}
// Open stream
let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
let mut new_jobs = Vec::new();
// Slide a window through the keyspace
let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
let mut all_in_window: bool = false;
let mut window = Window::new();
loop {
if all_in_window && window.elems.is_empty() {
// All done!
break;
}
if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
{
let batch_layers: Vec<LayerId> = job
.input_layers
.iter()
.filter(|layer_id| {
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
})
.cloned()
.collect();
assert!(!batch_layers.is_empty());
new_jobs.push(CompactionJob {
key_range,
lsn_range: job.lsn_range.clone(),
strategy: CompactionStrategy::CreateDelta,
input_layers: batch_layers,
completed: false,
});
} else {
assert!(!all_in_window);
if let Some(next_key) = key_accum.next().await.transpose()? {
window.feed(next_key.key, next_key.size);
} else {
all_in_window = true;
}
}
}
// All the input files are rewritten. Set up the tracking for when they can
// be deleted.
for layer_id in job.input_layers.iter() {
let l = &mut self.layers[layer_id.0];
assert!(l.deletable_after.is_none());
l.deletable_after = Some(PendingJobSet::new());
}
for j in new_jobs.into_iter().rev() {
let job_id = self.push_job(j);
let j = &self.jobs[job_id.0];
for layer_id in j.input_layers.iter() {
self.layers[layer_id.0]
.deletable_after
.as_mut()
.unwrap()
.pending
.insert(job_id);
}
}
Ok(())
}
}
// Sliding window through keyspace and values
// This is used by cover_with_images to decide on good split points
struct KeyspaceWindow<K> {
head: KeyspaceWindowHead<K>,
start_pos: KeyspaceWindowPos<K>,
}
struct KeyspaceWindowHead<K> {
// overall key range to cover
key_range: Range<K>,
keyspace: Vec<Range<K>>,
target_keysize: u64,
}
#[derive(Clone)]
struct KeyspaceWindowPos<K> {
end_key: K,
keyspace_idx: usize,
accum_keysize: u64,
}
impl<K: CompactionKey> KeyspaceWindowPos<K> {
fn reached_end(&self, w: &KeyspaceWindowHead<K>) -> bool {
self.keyspace_idx == w.keyspace.len()
}
// Advance the cursor until it reaches 'target_keysize'.
fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
while self.accum_keysize < max_size && !self.reached_end(w) {
let curr_range = &w.keyspace[self.keyspace_idx];
if self.end_key < curr_range.start {
// skip over any unused space
self.end_key = curr_range.start;
}
// We're now within 'curr_range'. Can we advance past it completely?
let distance = K::key_range_size(&(self.end_key..curr_range.end));
if (self.accum_keysize + distance as u64) < max_size {
// oh yeah, it fits
self.end_key = curr_range.end;
self.keyspace_idx += 1;
self.accum_keysize += distance as u64;
} else {
// advance within the range
let skip_key = self.end_key.skip_some();
let distance = K::key_range_size(&(self.end_key..skip_key));
if (self.accum_keysize + distance as u64) < max_size {
self.end_key = skip_key;
self.accum_keysize += distance as u64;
} else {
self.end_key = self.end_key.next();
self.accum_keysize += 1;
}
}
}
}
}
impl<K> KeyspaceWindow<K>
where
K: CompactionKey,
{
fn new(key_range: Range<K>, keyspace: CompactionKeySpace<K>, target_keysize: u64) -> Self {
assert!(keyspace.first().unwrap().start >= key_range.start);
let start_key = key_range.start;
let start_pos = KeyspaceWindowPos::<K> {
end_key: start_key,
keyspace_idx: 0,
accum_keysize: 0,
};
Self {
head: KeyspaceWindowHead::<K> {
key_range,
keyspace,
target_keysize,
},
start_pos,
}
}
fn choose_next_image(&mut self) -> Option<Range<K>> {
if self.start_pos.keyspace_idx == self.head.keyspace.len() {
// we've reached the end
return None;
}
let mut next_pos = self.start_pos.clone();
next_pos.advance_until_size(
&self.head,
self.start_pos.accum_keysize + self.head.target_keysize,
);
// See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
// 1.25x target size
let mut end_pos = next_pos.clone();
end_pos.advance_until_size(
&self.head,
self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
);
if end_pos.reached_end(&self.head) {
// gobble up any unused keyspace between the last used key and end of the range
assert!(end_pos.end_key <= self.head.key_range.end);
end_pos.end_key = self.head.key_range.end;
next_pos = end_pos;
}
let start_key = self.start_pos.end_key;
self.start_pos = next_pos;
Some(start_key..self.start_pos.end_key)
}
}
// Sliding window through keyspace and values
//
// This is used to decide what layer to write next, from the beginning of the window.
//
// Candidates:
//
// 1. Create an image layer, snapping to previous images
// 2. Create a delta layer, snapping to previous images
// 3. Create an image layer, snapping to
//
//
// Take previous partitioning, based on the image layers below.
//
// Candidate is at the front:
//
// Consider stretching an image layer to next divider? If it's close enough,
// that's the image candidate
//
// If it's too far, consider splitting at a reasonable point
//
// Is the image candidate smaller than the equivalent delta? If so,
// split off the image. Otherwise, split off one delta.
// Try to snap off the delta at a reasonable point
struct WindowElement<K> {
start_key: K, // inclusive
last_key: K, // inclusive
accum_size: u64,
}
struct Window<K> {
elems: VecDeque<WindowElement<K>>,
// last key that was split off, inclusive
splitoff_key: Option<K>,
splitoff_size: u64,
}
impl<K> Window<K>
where
K: CompactionKey,
{
fn new() -> Self {
Self {
elems: VecDeque::new(),
splitoff_key: None,
splitoff_size: 0,
}
}
fn feed(&mut self, key: K, size: u64) {
let last_size;
if let Some(last) = self.elems.back_mut() {
assert!(last.last_key <= key);
if key == last.last_key {
last.accum_size += size;
return;
}
last_size = last.accum_size;
} else {
last_size = 0;
}
// This is a new key.
let elem = WindowElement {
start_key: key,
last_key: key,
accum_size: last_size + size,
};
self.elems.push_back(elem);
}
fn remain_size(&self) -> u64 {
self.elems.back().unwrap().accum_size - self.splitoff_size
}
fn peek_size(&self) -> u64 {
self.elems.front().unwrap().accum_size - self.splitoff_size
}
fn commit_upto(&mut self, mut upto: usize) {
while upto > 1 {
let popped = self.elems.pop_front().unwrap();
self.elems.front_mut().unwrap().start_key = popped.start_key;
upto -= 1;
}
}
fn find_size_split(&self, target_size: u64) -> usize {
self.elems
.partition_point(|elem| elem.accum_size - self.splitoff_size < target_size)
}
fn pop(&mut self) {
let first = self.elems.pop_front().unwrap();
self.splitoff_size = first.accum_size;
self.splitoff_key = Some(first.last_key);
}
// the difference between delta and image is that an image covers
// any unused keyspace before and after, while a delta tries to
// minimize that. TODO: difference not implemented
fn pop_delta(&mut self) -> Range<K> {
let first = self.elems.front().unwrap();
let key_range = first.start_key..first.last_key.next();
self.pop();
key_range
}
// Prerequisite: we have enough input in the window
//
// On return None, the caller should feed more data and call again
fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option<Range<K>> {
if has_more && self.elems.is_empty() {
// Starting up
return None;
}
// If we still have an undersized candidate, just keep going
while self.peek_size() < target_size {
if self.elems.len() > 1 {
self.commit_upto(2);
} else if has_more {
return None;
} else {
break;
}
}
// Ensure we have enough input in the window to make a good decision
if has_more && self.remain_size() < target_size * 5 / 4 {
return None;
}
// The candidate on the front is now large enough, for a delta.
// And we have enough data in the window to decide.
// If we're willing to stretch it up to 1.25 target size, could we
// gobble up the rest of the work? This avoids creating very small
// "tail" layers at the end of the keyspace
if !has_more && self.remain_size() < target_size * 5 / 3 {
self.commit_upto(self.elems.len());
} else {
let delta_split_at = self.find_size_split(target_size);
self.commit_upto(delta_split_at);
// If it's still not large enough, request the caller to fill the window
if self.elems.len() == 1 && has_more {
return None;
}
}
Some(self.pop_delta())
}
}

View File

@@ -1,242 +0,0 @@
//! This file contains generic utility functions over the interface types,
//! which could be handy for any compaction implementation.
use crate::interface::*;
use futures::future::BoxFuture;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use pin_project_lite::pin_project;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
use std::future::Future;
use std::ops::{DerefMut, Range};
use std::pin::Pin;
use std::task::{ready, Poll};
pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
where
K: CompactionKey,
{
keyspace.iter().map(|r| K::key_range_size(r) as u64).sum()
}
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
!(a.end <= b.start || b.end <= a.start)
}
pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
let x = std::mem::take(a);
let mut all_ranges_iter = [x.into_iter(), b.into_iter()]
.into_iter()
.kmerge_by(|a, b| a.start < b.start);
let mut ranges = Vec::new();
if let Some(first) = all_ranges_iter.next() {
let (mut start, mut end) = (first.start, first.end);
for r in all_ranges_iter {
assert!(r.start >= start);
if r.start > end {
ranges.push(start..end);
start = r.start;
end = r.end;
} else if r.end > end {
end = r.end;
}
}
ranges.push(start..end);
}
*a = ranges
}
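A usage sketch for `union_to_keyspace`, assuming `CompactionKeySpace<K>` is the `Vec<Range<K>>` alias these helpers operate on:
```
// Overlapping input ranges are coalesced into a minimal sorted cover.
let mut a: Vec<std::ops::Range<u64>> = vec![0..10, 20..30];
let b = vec![5..25, 40..50];
union_to_keyspace(&mut a, b);
assert_eq!(a, vec![0..30, 40..50]);
```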
pub fn intersect_keyspace<K: Ord + Clone + Copy>(
a: &CompactionKeySpace<K>,
r: &Range<K>,
) -> CompactionKeySpace<K> {
let mut ranges: Vec<Range<K>> = Vec::new();
for x in a.iter() {
if x.end <= r.start {
continue;
}
if x.start >= r.end {
break;
}
ranges.push(x.clone())
}
// trim the ends
if let Some(first) = ranges.first_mut() {
first.start = std::cmp::max(first.start, r.start);
}
if let Some(last) = ranges.last_mut() {
last.end = std::cmp::min(last.end, r.end);
}
ranges
}
/// Create a stream that iterates through all DeltaEntrys among all input
/// layers, in key-lsn order.
///
/// This is public because the create_delta() implementation likely wants to use this too
/// TODO: move to a more shared place
pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
layers: &'a [E::DeltaLayer],
ctx: &'a E::RequestContext,
) -> MergeDeltaKeys<'a, E> {
// Use a binary heap to merge the layers. Each input layer is initially
// represented by a LazyLoadLayer::Unloaded element, which uses the start of
// the layer's key range as the key. The first time a layer reaches the top
// of the heap, all the keys of the layer are loaded into a sorted vector.
//
// This helps to keep the memory usage reasonable: we only need to hold in
// memory the DeltaEntrys of the layers that overlap with the "current" key.
let mut heap: BinaryHeap<LazyLoadLayer<'a, E>> = BinaryHeap::new();
for l in layers {
heap.push(LazyLoadLayer::Unloaded(l));
}
MergeDeltaKeys {
heap,
ctx,
load_future: None,
}
}
enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
Unloaded(&'a E::DeltaLayer),
}
impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
fn key(&self) -> E::Key {
match self {
Self::Loaded(entries) => entries.front().unwrap().key(),
Self::Unloaded(dl) => dl.key_range().start,
}
}
}
impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
// reverse order so that we get a min-heap
other.key().cmp(&self.key())
}
}
impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
fn eq(&self, other: &Self) -> bool {
self.key().eq(&other.key())
}
}
impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
// Stream returned by `merge_delta_keys`
pin_project! {
#[allow(clippy::type_complexity)]
pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> {
heap: BinaryHeap<LazyLoadLayer<'a, E>>,
#[pin]
load_future: Option<LoadFuture<'a, <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>,
ctx: &'a E::RequestContext,
}
}
impl<'a, E> Stream for MergeDeltaKeys<'a, E>
where
E: CompactionJobExecutor + 'a,
{
type Item = anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>;
fn poll_next(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<std::option::Option<<Self as futures::Stream>::Item>> {
let mut this = self.project();
loop {
if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() {
// We are waiting for loading the keys to finish
match ready!(load_future.as_mut().poll(cx)) {
Ok(entries) => {
this.load_future.set(None);
*this.heap.peek_mut().unwrap() =
LazyLoadLayer::Loaded(VecDeque::from(entries));
}
Err(e) => {
return Poll::Ready(Some(Err(e)));
}
}
}
// If the topmost layer in the heap hasn't been loaded yet, start
// loading it. Otherwise return the next entry from it and update
// the layer's position in the heap (this decreaseKey operation is
// performed implicitly when `top` is dropped).
if let Some(mut top) = this.heap.peek_mut() {
match top.deref_mut() {
LazyLoadLayer::Unloaded(ref mut l) => {
let fut = l.load_keys(this.ctx);
this.load_future.set(Some(fut));
continue;
}
LazyLoadLayer::Loaded(ref mut entries) => {
let result = entries.pop_front().unwrap();
if entries.is_empty() {
std::collections::binary_heap::PeekMut::pop(top);
}
return Poll::Ready(Some(Ok(result)));
}
}
} else {
return Poll::Ready(None);
}
}
}
}
// Accumulate values at key boundaries
pub struct KeySize<K> {
pub key: K,
pub num_values: u64,
pub size: u64,
}
pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
where
K: Eq,
I: Stream<Item = Result<D, E>>,
D: CompactionDeltaEntry<'a, K>,
{
async_stream::try_stream! {
// Initialize the state from the first value
let mut input = std::pin::pin!(input);
if let Some(first) = input.next().await {
let first = first?;
let mut accum: KeySize<K> = KeySize {
key: first.key(),
num_values: 1,
size: first.size(),
};
while let Some(this) = input.next().await {
let this = this?;
if this.key() == accum.key {
accum.size += this.size();
accum.num_values += 1;
} else {
yield accum;
accum = KeySize {
key: this.key(),
num_values: 1,
size: this.size(),
};
}
}
yield accum;
}
}
}
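For example, feeding `accum_key_values` entries (k1, size 10), (k1, size 5), (k2, size 7) yields `KeySize { key: k1, num_values: 2, size: 15 }` followed by `KeySize { key: k2, num_values: 1, size: 7 }`.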

View File

@@ -1,376 +0,0 @@
//! An LSM tree consists of multiple levels, each exponentially larger than the
//! previous level. And each level consists of multiple "tiers". With tiered
//! compaction, a level is compacted when it has accumulated more than N tiers,
//! forming one tier on the next level.
//!
//! In the pageserver, we don't explicitly track the levels and tiers. Instead,
//! we identify them by looking at the shapes of the layers. It's an easy task
//! for a human, but it's not straightforward to come up with the exact
//! rules. Especially if there are cases like interrupted, half-finished
//! compactions, or highly skewed data distributions that have let us "skip"
//! some levels. It's not critical to classify all cases correctly; at worst we
//! delay some compaction work, and suffer from more read amplification, or we
//! perform some unnecessary compaction work.
//!
//! `identify_level` performs that shape-matching.
//!
//! It returns a Level struct, which has `depth()` function to count the number
//! of "tiers" in the level. The tier count is the max depth of stacked layers
//! within the level. That's a good measure, because the point of compacting is
//! to reduce read amplification, and the depth is what determines that.
//!
//! One interesting effect of this is that if we generate very small delta
//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
//! because they reach the target size, the L0 compaction will combine them to
//! one larger file. But if the combined file is still smaller than the target
//! file size, the file will still be considered to be part of L0 at the next
//! iteration.
use anyhow::bail;
use std::collections::BTreeSet;
use std::ops::Range;
use utils::lsn::Lsn;
use crate::interface::*;
use tracing::{info, trace};
pub struct Level<L> {
pub lsn_range: Range<Lsn>,
pub layers: Vec<L>,
}
/// Identify an LSN below `end_lsn` that partitions the LSN space, so that there are
/// no layers that cross the boundary LSN.
///
/// A further restriction is that all layers in the returned partition cover at
/// most 'lsn_max_size' LSN bytes.
pub async fn identify_level<K, L>(
all_layers: Vec<L>,
end_lsn: Lsn,
lsn_max_size: u64,
) -> anyhow::Result<Option<Level<L>>>
where
K: CompactionKey,
L: CompactionLayer<K> + Clone,
{
// filter out layers that are above the `end_lsn`, they are completely irrelevant.
let mut layers = Vec::new();
for l in all_layers {
if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
// shouldn't happen. Indicates that the caller passed a bogus
// end_lsn.
bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
}
// include image layers sitting exactly at `end_lsn`.
let is_image = !l.is_delta();
if (is_image && l.lsn_range().start > end_lsn)
|| (!is_image && l.lsn_range().start >= end_lsn)
{
continue;
}
layers.push(l);
}
// All the remaining layers either belong to this level, or are below it.
info!(
"identify level at {}, size {}, num layers below: {}",
end_lsn,
lsn_max_size,
layers.len()
);
if layers.is_empty() {
return Ok(None);
}
// Walk the ranges in LSN order.
//
// ----- end_lsn
// |
// |
// v
//
layers.sort_by_key(|l| l.lsn_range().end);
let mut candidate_start_lsn = end_lsn;
let mut candidate_layers: Vec<L> = Vec::new();
let mut current_best_start_lsn = end_lsn;
let mut current_best_layers: Vec<L> = Vec::new();
let mut iter = layers.into_iter();
loop {
let Some(l) = iter.next_back() else {
// Reached end. Accept the last candidate
current_best_start_lsn = candidate_start_lsn;
current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
break;
};
trace!(
"inspecting {} for candidate {}, current best {}",
l.short_id(),
candidate_start_lsn,
current_best_start_lsn
);
let r = l.lsn_range();
// Image layers don't restrict our choice of cutoff LSN
if l.is_delta() {
// Is this candidate workable? In other words, are there any
// delta layers that span across this LSN
//
// Valid: Not valid:
// + +
// | | +
// + <- candidate + | <- candidate
// + +
// |
// +
if r.end <= candidate_start_lsn {
// Hooray, there are no crossing LSNs. And we have visited
// through all the layers within candidate..end_lsn. The
// current candidate can be accepted.
current_best_start_lsn = r.end;
current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
candidate_start_lsn = r.start;
}
// Is it small enough to be considered part of this level?
if r.end.0 - r.start.0 > lsn_max_size {
// Too large, this layer belongs to next level. Stop.
trace!(
"too large {}, size {} vs {}",
l.short_id(),
r.end.0 - r.start.0,
lsn_max_size
);
break;
}
// If this crosses the candidate lsn, push it down.
if r.start < candidate_start_lsn {
trace!(
"layer {} prevents from stopping at {}",
l.short_id(),
candidate_start_lsn
);
candidate_start_lsn = r.start;
}
}
// Include this layer in our candidate
candidate_layers.push(l);
}
Ok(if current_best_start_lsn == end_lsn {
// empty level
None
} else {
Some(Level {
lsn_range: current_best_start_lsn..end_lsn,
layers: current_best_layers,
})
})
}
// helper struct used in depth()
struct Event<K> {
key: K,
layer_idx: usize,
start: bool,
}
impl<L> Level<L> {
/// Count the number of deltas stacked on each other.
pub fn depth<K>(&self) -> u64
where
K: CompactionKey,
L: CompactionLayer<K>,
{
let mut events: Vec<Event<K>> = Vec::new();
for (idx, l) in self.layers.iter().enumerate() {
events.push(Event {
key: l.key_range().start,
layer_idx: idx,
start: true,
});
events.push(Event {
key: l.key_range().end,
layer_idx: idx,
start: false,
});
}
events.sort_by_key(|e| (e.key, e.start));
// Sweep the key space left to right. Stop at each distinct key, and
// count the number of deltas on top of the highest image at that key.
//
// This is a little inefficient, as we walk through the active_set on
// every key. We could increment/decrement a counter on each step
// instead, but that'd require a bit more complex bookkeeping.
let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
let mut max_depth = 0;
let mut events_iter = events.iter().peekable();
while let Some(e) = events_iter.next() {
let l = &self.layers[e.layer_idx];
let is_image = !l.is_delta();
// update the active set
if e.start {
active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
} else {
active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
}
// recalculate depth if this was the last event at this point
let more_events_at_this_key = events_iter
.peek()
.map_or(false, |next_e| next_e.key == e.key);
if !more_events_at_this_key {
let mut active_depth = 0;
for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
if *is_image {
break;
}
active_depth += 1;
}
if active_depth > max_depth {
max_depth = active_depth;
}
}
}
max_depth
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
use std::sync::{Arc, Mutex};
fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
MockLayer::Delta(Arc::new(MockDeltaLayer {
key_range,
lsn_range,
// identify_level() doesn't pay attention to the rest of the fields
file_size: 0,
deleted: Mutex::new(false),
records: vec![],
}))
}
fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
MockLayer::Image(Arc::new(MockImageLayer {
key_range,
lsn_range: lsn..(lsn + 1),
// identify_level() doesn't pay attention to the rest of the fields
file_size: 0,
deleted: Mutex::new(false),
}))
}
#[tokio::test]
async fn test_identify_level() -> anyhow::Result<()> {
let layers = vec![
delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
];
// All layers fit in the max file size
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.depth(), 6);
// Same LSN with smaller max file size. The second layer from the top is larger
// and belongs to next level.
let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
.await?
.unwrap();
assert_eq!(level.depth(), 1);
// Call with a smaller LSN
let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
.await?
.unwrap();
assert_eq!(level.depth(), 2);
// Call with an LSN that doesn't partition the space
let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
assert!(result.is_err());
Ok(())
}
#[tokio::test]
async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
// The files' LSN ranges overlap, so even though there are more files that
// fit under the file size, they are not included in the level because they
// overlap so that we'd need to include the oldest file, too, which is
// larger
let layers = vec![
delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
.await?
.unwrap();
assert_eq!(level.depth(), 1);
Ok(())
}
#[tokio::test]
async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
// The key ranges don't overlap, so depth is only 1.
let layers = vec![
delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.layers.len(), 3);
assert_eq!(level.depth(), 1);
// Staggered. The 1st and 3rd layer don't overlap with each other.
let layers = vec![
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.layers.len(), 3);
assert_eq!(level.depth(), 2);
Ok(())
}
#[tokio::test]
async fn test_depth_images() -> anyhow::Result<()> {
let layers: Vec<MockLayer> = vec![
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
// This covers the same key range as the 2nd delta layer. The depth
// in that key range is therefore 0.
image(1500..2500, Lsn(0x9000)),
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.layers.len(), 4);
assert_eq!(level.depth(), 1);
Ok(())
}
}

View File

@@ -1,167 +0,0 @@
//! This is what the compaction implementation needs to know about
//! layers, keyspace etc.
//!
//! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides.
use async_trait::async_trait;
use pageserver_api::{key::Key, keyspace::key_range_size};
use std::ops::Range;
use utils::lsn::Lsn;
/// Public interface. This is the main thing that the implementor needs to provide
#[async_trait]
pub trait CompactionJobExecutor {
// Type system.
//
// We assume that there are two kinds of layers, deltas and images. The
// compaction doesn't distinguish whether they are stored locally or
// remotely.
//
// The keyspace is defined by CompactionKey trait.
//
type Key: CompactionKey;
type Layer: CompactionLayer<Self::Key> + Clone;
type DeltaLayer: CompactionDeltaLayer<Self> + Clone;
type ImageLayer: CompactionImageLayer<Self> + Clone;
// This is passed through to all the interface functions. The compaction
// implementation doesn't do anything with it, but it might be useful for
// the interface implementation.
type RequestContext: CompactionRequestContext;
// ----
// Functions that the planner uses to support its decisions
// ----
/// Return all layers that overlap the given bounding box.
async fn get_layers(
&mut self,
key_range: &Range<Self::Key>,
lsn_range: &Range<Lsn>,
ctx: &Self::RequestContext,
) -> anyhow::Result<Vec<Self::Layer>>;
async fn get_keyspace(
&mut self,
key_range: &Range<Self::Key>,
lsn: Lsn,
ctx: &Self::RequestContext,
) -> anyhow::Result<CompactionKeySpace<Self::Key>>;
/// NB: This is a pretty expensive operation. In the real pageserver
/// implementation, it downloads the layer, and keeps it resident
/// until the DeltaLayer is dropped.
async fn downcast_delta_layer(
&self,
layer: &Self::Layer,
) -> anyhow::Result<Option<Self::DeltaLayer>>;
// ----
// Functions to execute the plan
// ----
/// Create a new image layer, materializing all the values in the key range,
/// at given 'lsn'.
async fn create_image(
&mut self,
lsn: Lsn,
key_range: &Range<Self::Key>,
ctx: &Self::RequestContext,
) -> anyhow::Result<()>;
/// Create a new delta layer, containing all the values from 'input_layers'
/// in the given key and LSN range.
async fn create_delta(
&mut self,
lsn_range: &Range<Lsn>,
key_range: &Range<Self::Key>,
input_layers: &[Self::DeltaLayer],
ctx: &Self::RequestContext,
) -> anyhow::Result<()>;
/// Delete a layer. The compaction implementation will call this only after
/// all the create_image() or create_delta() calls that deletion of this
/// layer depends on have finished. But if the implementor has extra lazy
/// background tasks, like uploading the index json file to remote storage,
/// it is the implementation's responsibility to track those.
async fn delete_layer(
&mut self,
layer: &Self::Layer,
ctx: &Self::RequestContext,
) -> anyhow::Result<()>;
}
pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
const MIN: Self;
const MAX: Self;
/// Calculate distance between key_range.start and key_range.end.
///
/// This returns u32, for compatibility with Repository::key. If the
/// distance is larger than u32::MAX, return u32::MAX.
fn key_range_size(key_range: &Range<Self>) -> u32;
// return "self + 1"
fn next(&self) -> Self;
// return "self + <some decent amount to skip>". The amount to skip
// is left to the implementation.
// FIXME: why not just "add(u32)" ? This is hard to use
fn skip_some(&self) -> Self;
}
impl CompactionKey for Key {
const MIN: Self = Self::MIN;
const MAX: Self = Self::MAX;
fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
key_range_size(r)
}
fn next(&self) -> Key {
(self as &Key).next()
}
fn skip_some(&self) -> Key {
self.add(128)
}
}
/// Contiguous ranges of keys that belong to the key space. In key order, and
/// with no overlap.
pub type CompactionKeySpace<K> = Vec<Range<K>>;
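// For example (hypothetical values), a keyspace covering keys 0-9 and 20-29
// would be represented as:
//
//   let ks: CompactionKeySpace<u64> = vec![0..10, 20..30];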
/// Functions needed from all layers.
pub trait CompactionLayer<K: CompactionKey + ?Sized> {
fn key_range(&self) -> &Range<K>;
fn lsn_range(&self) -> &Range<Lsn>;
fn file_size(&self) -> u64;
/// For debugging, short human-readable representation of the layer. E.g. filename.
fn short_id(&self) -> String;
fn is_delta(&self) -> bool;
}
#[async_trait]
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
where
Self: 'a;
/// Return all keys in this delta layer.
async fn load_keys<'a>(
&self,
ctx: &E::RequestContext,
) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
}
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
pub trait CompactionDeltaEntry<'a, K> {
fn key(&self) -> K;
fn lsn(&self) -> Lsn;
fn size(&self) -> u64;
}
pub trait CompactionRequestContext {}
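// Illustrative sketch, not part of the original file: a minimal
// CompactionKey implementation for a toy u32 key type. The real
// implementations for pageserver's Key (above) and the simulator's u64
// appear elsewhere in this diff.
impl CompactionKey for u32 {
    const MIN: Self = u32::MIN;
    const MAX: Self = u32::MAX;
    fn key_range_size(key_range: &Range<Self>) -> u32 {
        key_range.end.saturating_sub(key_range.start)
    }
    fn next(&self) -> Self {
        self + 1
    }
    fn skip_some(&self) -> Self {
        // any "decent amount" satisfies the contract; saturate near MAX
        self.saturating_add(128)
    }
}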

View File

@@ -1,12 +0,0 @@
// The main module implementing the compaction algorithm
pub mod compact_tiered;
pub(crate) mod identify_levels;
// Traits that the caller of the compaction needs to implement
pub mod interface;
// Utility functions, useful for the implementation
pub mod helpers;
// A simulator with mock implementations of 'interface'
pub mod simulator;

View File

@@ -1,613 +0,0 @@
mod draw;
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
use async_trait::async_trait;
use futures::StreamExt;
use rand::Rng;
use tracing::info;
use utils::lsn::Lsn;
use std::fmt::Write;
use std::ops::Range;
use std::sync::Arc;
use std::sync::Mutex;
use crate::helpers::{merge_delta_keys, overlaps_with};
use crate::interface;
use crate::interface::CompactionLayer;
//
// Implementation of the CompactionJobExecutor interface
//
pub struct MockTimeline {
// Parameters for the compaction algorithm
pub target_file_size: u64,
tiers_per_level: u64,
num_l0_flushes: u64,
last_compact_at_flush: u64,
last_flush_lsn: Lsn,
// In-memory layer
records: Vec<MockRecord>,
total_len: u64,
start_lsn: Lsn,
end_lsn: Lsn,
// Current keyspace at `end_lsn`. This is updated on every ingested record.
keyspace: KeySpace,
// historic keyspaces
old_keyspaces: Vec<(Lsn, KeySpace)>,
// "on-disk" layers
pub live_layers: Vec<MockLayer>,
num_deleted_layers: u64,
// Statistics
wal_ingested: u64,
bytes_written: u64,
bytes_deleted: u64,
layers_created: u64,
layers_deleted: u64,
// All the events - creation and deletion of files - are collected
// in 'history'. It is used to draw the SVG animation at the end.
time: u64,
history: Vec<draw::LayerTraceEvent>,
}
type KeySpace = interface::CompactionKeySpace<Key>;
pub struct MockRequestContext {}
impl interface::CompactionRequestContext for MockRequestContext {}
pub type Key = u64;
impl interface::CompactionKey for Key {
const MIN: Self = u64::MIN;
const MAX: Self = u64::MAX;
fn key_range_size(key_range: &Range<Self>) -> u32 {
std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
}
fn next(&self) -> Self {
self + 1
}
fn skip_some(&self) -> Self {
// skip ahead by a fixed amount (the trait leaves the step size to the implementation)
self + 100
}
}
#[derive(Clone)]
pub struct MockRecord {
lsn: Lsn,
key: Key,
len: u64,
}
impl interface::CompactionDeltaEntry<'_, Key> for MockRecord {
fn key(&self) -> Key {
self.key
}
fn lsn(&self) -> Lsn {
self.lsn
}
fn size(&self) -> u64 {
self.len
}
}
pub struct MockDeltaLayer {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub file_size: u64,
pub deleted: Mutex<bool>,
pub records: Vec<MockRecord>,
}
impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
fn key_range(&self) -> &Range<Key> {
&self.key_range
}
fn lsn_range(&self) -> &Range<Lsn> {
&self.lsn_range
}
fn file_size(&self) -> u64 {
self.file_size
}
fn short_id(&self) -> String {
format!(
"{:016X}-{:016X}__{:08X}-{:08X}",
self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0
)
}
fn is_delta(&self) -> bool {
true
}
}
#[async_trait]
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
type DeltaEntry<'a> = MockRecord;
async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result<Vec<MockRecord>> {
Ok(self.records.clone())
}
}
pub struct MockImageLayer {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub file_size: u64,
pub deleted: Mutex<bool>,
}
impl interface::CompactionImageLayer<MockTimeline> for Arc<MockImageLayer> {}
impl interface::CompactionLayer<Key> for Arc<MockImageLayer> {
fn key_range(&self) -> &Range<Key> {
&self.key_range
}
fn lsn_range(&self) -> &Range<Lsn> {
&self.lsn_range
}
fn file_size(&self) -> u64 {
self.file_size
}
fn short_id(&self) -> String {
format!(
"{:016X}-{:016X}__{:08X}",
self.key_range.start, self.key_range.end, self.lsn_range.start.0,
)
}
fn is_delta(&self) -> bool {
false
}
}
impl MockTimeline {
pub fn new() -> Self {
MockTimeline {
target_file_size: 256 * 1024 * 1024,
tiers_per_level: 4,
num_l0_flushes: 0,
last_compact_at_flush: 0,
last_flush_lsn: Lsn(0),
records: Vec::new(),
total_len: 0,
start_lsn: Lsn(1000),
end_lsn: Lsn(1000),
keyspace: KeySpace::new(),
old_keyspaces: vec![],
live_layers: vec![],
num_deleted_layers: 0,
wal_ingested: 0,
bytes_written: 0,
bytes_deleted: 0,
layers_created: 0,
layers_deleted: 0,
time: 0,
history: Vec::new(),
}
}
pub async fn compact(&mut self) -> anyhow::Result<()> {
let ctx = MockRequestContext {};
crate::compact_tiered::compact_tiered(
self,
self.last_flush_lsn,
self.target_file_size,
self.tiers_per_level,
&ctx,
)
.await?;
Ok(())
}
// Ingest one record to the timeline
pub fn ingest_record(&mut self, key: Key, len: u64) {
self.records.push(MockRecord {
lsn: self.end_lsn,
key,
len,
});
self.total_len += len;
self.end_lsn += len;
if self.total_len > self.target_file_size {
self.flush_l0();
}
}
pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> {
if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level {
self.compact().await?;
self.last_compact_at_flush = self.num_l0_flushes;
}
Ok(())
}
pub fn flush_l0(&mut self) {
if self.records.is_empty() {
return;
}
let mut records = std::mem::take(&mut self.records);
records.sort_by_key(|rec| rec.key);
let lsn_range = self.start_lsn..self.end_lsn;
let new_layer = Arc::new(MockDeltaLayer {
key_range: Key::MIN..Key::MAX,
lsn_range: lsn_range.clone(),
file_size: self.total_len,
records,
deleted: Mutex::new(false),
});
info!("flushed L0 layer {}", new_layer.short_id());
self.live_layers.push(MockLayer::from(&new_layer));
// reset L0
self.start_lsn = self.end_lsn;
self.total_len = 0;
self.records = Vec::new();
self.layers_created += 1;
self.bytes_written += new_layer.file_size;
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::Flush,
file: LayerTraceFile {
filename: new_layer.short_id(),
key_range: new_layer.key_range.clone(),
lsn_range: new_layer.lsn_range.clone(),
},
});
self.num_l0_flushes += 1;
self.last_flush_lsn = self.end_lsn;
}
// Ingest `num_records` records to the timeline, with random keys
// uniformly distributed in `key_range`
pub fn ingest_uniform(
&mut self,
num_records: u64,
len: u64,
key_range: &Range<Key>,
) -> anyhow::Result<()> {
crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]);
let mut rng = rand::thread_rng();
for _ in 0..num_records {
self.ingest_record(rng.gen_range(key_range.clone()), len);
self.wal_ingested += len;
}
Ok(())
}
pub fn stats(&self) -> anyhow::Result<String> {
let mut s = String::new();
writeln!(s, "STATISTICS:")?;
writeln!(
s,
"WAL ingested: {:>10} MB",
self.wal_ingested / (1024 * 1024)
)?;
writeln!(
s,
"size created: {:>10} MB",
self.bytes_written / (1024 * 1024)
)?;
writeln!(
s,
"size deleted: {:>10} MB",
self.bytes_deleted / (1024 * 1024)
)?;
writeln!(s, "files created: {:>10}", self.layers_created)?;
writeln!(s, "files deleted: {:>10}", self.layers_deleted)?;
writeln!(
s,
"write amp: {:>10.2}",
self.bytes_written as f64 / self.wal_ingested as f64
)?;
writeln!(
s,
"storage amp: {:>10.2}",
(self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64
)?;
Ok(s)
}
pub fn draw_history<W: std::io::Write>(&self, output: W) -> anyhow::Result<()> {
draw::draw_history(&self.history, output)
}
}
impl Default for MockTimeline {
fn default() -> Self {
Self::new()
}
}
#[derive(Clone)]
pub enum MockLayer {
Delta(Arc<MockDeltaLayer>),
Image(Arc<MockImageLayer>),
}
impl interface::CompactionLayer<Key> for MockLayer {
fn key_range(&self) -> &Range<Key> {
match self {
MockLayer::Delta(this) => this.key_range(),
MockLayer::Image(this) => this.key_range(),
}
}
fn lsn_range(&self) -> &Range<Lsn> {
match self {
MockLayer::Delta(this) => this.lsn_range(),
MockLayer::Image(this) => this.lsn_range(),
}
}
fn file_size(&self) -> u64 {
match self {
MockLayer::Delta(this) => this.file_size(),
MockLayer::Image(this) => this.file_size(),
}
}
fn short_id(&self) -> String {
match self {
MockLayer::Delta(this) => this.short_id(),
MockLayer::Image(this) => this.short_id(),
}
}
fn is_delta(&self) -> bool {
match self {
MockLayer::Delta(_) => true,
MockLayer::Image(_) => false,
}
}
}
impl MockLayer {
fn is_deleted(&self) -> bool {
let guard = match self {
MockLayer::Delta(this) => this.deleted.lock().unwrap(),
MockLayer::Image(this) => this.deleted.lock().unwrap(),
};
*guard
}
fn mark_deleted(&self) {
let mut deleted_guard = match self {
MockLayer::Delta(this) => this.deleted.lock().unwrap(),
MockLayer::Image(this) => this.deleted.lock().unwrap(),
};
assert!(!*deleted_guard, "layer already deleted");
*deleted_guard = true;
}
}
impl From<&Arc<MockDeltaLayer>> for MockLayer {
fn from(l: &Arc<MockDeltaLayer>) -> Self {
MockLayer::Delta(l.clone())
}
}
impl From<&Arc<MockImageLayer>> for MockLayer {
fn from(l: &Arc<MockImageLayer>) -> Self {
MockLayer::Image(l.clone())
}
}
#[async_trait]
impl interface::CompactionJobExecutor for MockTimeline {
type Key = Key;
type Layer = MockLayer;
type DeltaLayer = Arc<MockDeltaLayer>;
type ImageLayer = Arc<MockImageLayer>;
type RequestContext = MockRequestContext;
async fn get_layers(
&mut self,
key_range: &Range<Self::Key>,
lsn_range: &Range<Lsn>,
_ctx: &Self::RequestContext,
) -> anyhow::Result<Vec<Self::Layer>> {
// Clear any deleted layers from our vec
self.live_layers.retain(|l| !l.is_deleted());
let layers: Vec<MockLayer> = self
.live_layers
.iter()
.filter(|l| {
overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range)
})
.cloned()
.collect();
Ok(layers)
}
async fn get_keyspace(
&mut self,
key_range: &Range<Self::Key>,
_lsn: Lsn,
_ctx: &Self::RequestContext,
) -> anyhow::Result<interface::CompactionKeySpace<Key>> {
// Look up the keyspace at the requested LSN.
if self.old_keyspaces.is_empty() {
Ok(crate::helpers::intersect_keyspace(
&self.keyspace,
key_range,
))
} else {
// not implemented
// The mock implementation only allows requesting the
// keyspace at the level's end LSN. That's all that the
// current implementation needs.
panic!("keyspace not available for requested lsn");
}
}
async fn downcast_delta_layer(
&self,
layer: &MockLayer,
) -> anyhow::Result<Option<Arc<MockDeltaLayer>>> {
Ok(match layer {
MockLayer::Delta(l) => Some(l.clone()),
MockLayer::Image(_) => None,
})
}
async fn create_image(
&mut self,
lsn: Lsn,
key_range: &Range<Key>,
ctx: &MockRequestContext,
) -> anyhow::Result<()> {
let keyspace = self.get_keyspace(key_range, lsn, ctx).await?;
let mut accum_size: u64 = 0;
for r in keyspace {
accum_size += r.end - r.start;
}
let new_layer = Arc::new(MockImageLayer {
key_range: key_range.clone(),
lsn_range: lsn..lsn,
file_size: accum_size * 8192,
deleted: Mutex::new(false),
});
info!(
"created image layer, size {}: {}",
new_layer.file_size,
new_layer.short_id()
);
self.live_layers.push(MockLayer::Image(new_layer.clone()));
// update stats
self.bytes_written += new_layer.file_size;
self.layers_created += 1;
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::CreateImage,
file: LayerTraceFile {
filename: new_layer.short_id(),
key_range: new_layer.key_range.clone(),
lsn_range: new_layer.lsn_range.clone(),
},
});
Ok(())
}
async fn create_delta(
&mut self,
lsn_range: &Range<Lsn>,
key_range: &Range<Key>,
input_layers: &[Arc<MockDeltaLayer>],
ctx: &MockRequestContext,
) -> anyhow::Result<()> {
let mut key_value_stream =
std::pin::pin!(merge_delta_keys::<MockTimeline>(input_layers, ctx));
let mut records: Vec<MockRecord> = Vec::new();
let mut total_len = 2;
while let Some(delta_entry) = key_value_stream.next().await {
let delta_entry: MockRecord = delta_entry?;
if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) {
total_len += delta_entry.len;
records.push(delta_entry);
}
}
let total_records = records.len();
let new_layer = Arc::new(MockDeltaLayer {
key_range: key_range.clone(),
lsn_range: lsn_range.clone(),
file_size: total_len,
records,
deleted: Mutex::new(false),
});
info!(
"created delta layer, recs {}, size {}: {}",
total_records,
total_len,
new_layer.short_id()
);
self.live_layers.push(MockLayer::Delta(new_layer.clone()));
// update stats
self.bytes_written += total_len;
self.layers_created += 1;
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::CreateDelta,
file: LayerTraceFile {
filename: new_layer.short_id(),
key_range: new_layer.key_range.clone(),
lsn_range: new_layer.lsn_range.clone(),
},
});
Ok(())
}
async fn delete_layer(
&mut self,
layer: &Self::Layer,
_ctx: &MockRequestContext,
) -> anyhow::Result<()> {
let layer = std::pin::pin!(layer);
info!("deleting layer: {}", layer.short_id());
self.num_deleted_layers += 1;
self.bytes_deleted += layer.file_size();
layer.mark_deleted();
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::Delete,
file: LayerTraceFile {
filename: layer.short_id(),
key_range: layer.key_range().clone(),
lsn_range: layer.lsn_range().clone(),
},
});
Ok(())
}
}
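// A hedged usage sketch (illustrative, not part of the original file):
// driving the mock timeline end to end using only the APIs defined above.
// The test name and parameter values are hypothetical.
#[cfg(test)]
#[tokio::test]
async fn simulator_usage_sketch() -> anyhow::Result<()> {
    let mut tl = MockTimeline::new();
    tl.target_file_size = 1_000_000; // small target so L0 flushes happen often
    for _ in 0..100 {
        // 1000 records of 100 bytes each, keys uniform in 0..100_000
        tl.ingest_uniform(1_000, 100, &(0..100_000))?;
        tl.compact_if_needed().await?;
    }
    tl.flush_l0();
    tl.compact().await?;
    println!("{}", tl.stats()?);
    Ok(())
}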

View File

@@ -1,411 +0,0 @@
use super::Key;
use anyhow::Result;
use std::cmp::Ordering;
use std::{
collections::{BTreeMap, BTreeSet, HashSet},
fmt::Write,
ops::Range,
};
use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
use utils::lsn::Lsn;
// Map values to their compressed coordinate - the index the value
// would have in a sorted and deduplicated list of all values.
struct CoordinateMap<T: Ord + Copy> {
map: BTreeMap<T, usize>,
stretch: f32,
}
impl<T: Ord + Copy> CoordinateMap<T> {
fn new(coords: Vec<T>, stretch: f32) -> Self {
let set: BTreeSet<T> = coords.into_iter().collect();
let mut map: BTreeMap<T, usize> = BTreeMap::new();
for (i, e) in set.iter().enumerate() {
map.insert(*e, i);
}
Self { map, stretch }
}
// This assumes that the map contains an exact entry for this value.
// Use _map_inexact for values in between entries.
fn map(&self, val: T) -> f32 {
*self.map.get(&val).unwrap() as f32 * self.stretch
}
// the value is still assumed to be within the min/max bounds
// (this is currently unused)
fn _map_inexact(&self, val: T) -> f32 {
let prev = *self.map.range(..=val).next_back().unwrap().1;
let next = *self.map.range(val..).next().unwrap().1;
// approximate with the midpoint between the two neighboring entries
(prev as f32 + (next - prev) as f32 / 2.0) * self.stretch
}
fn max(&self) -> f32 {
self.map.len() as f32 * self.stretch
}
}
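// Worked example (hypothetical values): CoordinateMap::new(vec![10, 50, 50, 90], 2.0)
// dedups to {10, 50, 90}, so map(10) == 0.0, map(50) == 2.0, map(90) == 4.0,
// and max() == 6.0.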
#[derive(PartialEq, Hash, Eq)]
pub enum LayerTraceOp {
Flush,
CreateDelta,
CreateImage,
Delete,
}
impl std::fmt::Display for LayerTraceOp {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
let op_str = match self {
LayerTraceOp::Flush => "flush",
LayerTraceOp::CreateDelta => "create_delta",
LayerTraceOp::CreateImage => "create_image",
LayerTraceOp::Delete => "delete",
};
f.write_str(op_str)
}
}
#[derive(PartialEq, Hash, Eq, Clone)]
pub struct LayerTraceFile {
pub filename: String,
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
}
impl LayerTraceFile {
fn is_image(&self) -> bool {
self.lsn_range.end == self.lsn_range.start
}
}
pub struct LayerTraceEvent {
pub time_rel: u64,
pub op: LayerTraceOp,
pub file: LayerTraceFile,
}
pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output: W) -> Result<()> {
let mut files: Vec<LayerTraceFile> = Vec::new();
for event in history {
files.push(event.file.clone());
}
let last_time_rel = history.last().unwrap().time_rel;
// Collect all coordinates
let mut keys: Vec<Key> = vec![];
let mut lsns: Vec<Lsn> = vec![];
for f in files.iter() {
keys.push(f.key_range.start);
keys.push(f.key_range.end);
lsns.push(f.lsn_range.start);
lsns.push(f.lsn_range.end);
}
// Analyze
let key_map = CoordinateMap::new(keys, 2.0);
// Stretch out vertically for better visibility
let lsn_map = CoordinateMap::new(lsns, 3.0);
let mut svg = String::new();
// Draw
writeln!(
svg,
"{}",
BeginSvg {
w: key_map.max(),
h: lsn_map.max(),
}
)?;
let lsn_max = lsn_map.max();
// Sort the files by LSN, but with image layers after all delta layers.
// The SVG is painted in the order the elements appear, and we want to draw
// image layers on top of the delta layers if they overlap.
//
// (This could also be implemented via z coordinates: image layers get one z
// coord, delta layers get another z coord.)
let mut files_sorted: Vec<LayerTraceFile> = files.into_iter().collect();
files_sorted.sort_by(|a, b| {
if a.is_image() && !b.is_image() {
Ordering::Greater
} else if !a.is_image() && b.is_image() {
Ordering::Less
} else {
a.lsn_range.end.cmp(&b.lsn_range.end)
}
});
writeln!(svg, "<!-- layers -->")?;
let mut files_seen = HashSet::new();
for f in files_sorted {
if files_seen.contains(&f) {
continue;
}
let key_start = key_map.map(f.key_range.start);
let key_end = key_map.map(f.key_range.end);
let key_diff = key_end - key_start;
if key_start >= key_end {
panic!("Invalid key range {}-{}", key_start, key_end);
}
let lsn_start = lsn_map.map(f.lsn_range.start);
let lsn_end = lsn_map.map(f.lsn_range.end);
// Fill in and thicken rectangle if it's an
// image layer so that we can see it.
let mut style = Style::default();
style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);
let y_start = lsn_max - lsn_start;
let y_end = lsn_max - lsn_end;
let x_margin = 0.25;
let y_margin = 0.5;
match f.lsn_range.start.cmp(&f.lsn_range.end) {
Ordering::Less => {
write!(
svg,
r#" <rect id="layer_{}" x="{}" y="{}" width="{}" height="{}" ry="{}" style="{}">"#,
f.filename,
key_start + x_margin,
y_end + y_margin,
key_diff - x_margin * 2.0,
y_start - y_end - y_margin * 2.0,
1.0, // border_radius,
style,
)?;
write!(svg, "<title>{}</title>", f.filename)?;
writeln!(svg, "</rect>")?;
}
Ordering::Equal => {
//lsn_diff = 0.3;
//lsn_offset = -lsn_diff / 2.0;
//margin = 0.05;
style.fill = Fill::Color(rgb(0x80, 0, 0x80));
style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0);
write!(
svg,
r#" <line id="layer_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}">"#,
f.filename,
key_start + x_margin,
y_end,
key_end - x_margin,
y_end,
style,
)?;
write!(
svg,
"<title>{}<br>{} - {}</title>",
f.filename, lsn_end, y_end
)?;
writeln!(svg, "</line>")?;
}
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
}
files_seen.insert(f);
}
let mut record_style = Style::default();
record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
record_style.stroke = Stroke::None;
writeln!(svg, "{}", EndSvg)?;
let mut layer_events_str = String::new();
let mut first = true;
for e in history {
if !first {
writeln!(layer_events_str, ",")?;
}
write!(
layer_events_str,
r#" {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#,
e.time_rel, e.file.filename, e.op
)?;
first = false;
}
writeln!(layer_events_str)?;
writeln!(
output,
r#"<!DOCTYPE html>
<html>
<head>
<style>
/* Keep the slider pinned at top */
.topbar {{
display: block;
overflow: hidden;
background-color: lightgrey;
position: fixed;
top: 0;
width: 100%;
/* width: 500px; */
}}
.slidercontainer {{
float: left;
width: 50%;
margin-right: 200px;
}}
.slider {{
float: left;
width: 100%;
}}
.legend {{
width: 200px;
float: right;
}}
/* Main content */
.main {{
margin-top: 50px; /* Add a top margin to avoid content overlay */
}}
</style>
</head>
<body onload="init()">
<script type="text/javascript">
var layer_events = [{layer_events_str}]
let ticker;
function init() {{
for (let i = 0; i < layer_events.length; i++) {{
var layer = document.getElementById("layer_" + layer_events[i].filename);
layer.style.visibility = "hidden";
}}
last_layer_event = -1;
moveSlider(last_slider_pos)
}}
function startAnimation() {{
ticker = setInterval(animateStep, 100);
}}
function stopAnimation() {{
clearInterval(ticker);
}}
function animateStep() {{
if (last_layer_event < layer_events.length - 1) {{
var slider = document.getElementById("time-slider");
let prevPos = slider.value
let nextEvent = last_layer_event + 1
while (nextEvent <= layer_events.length - 1) {{
if (layer_events[nextEvent].time_rel > prevPos) {{
break;
}}
nextEvent += 1;
}}
let nextPos = layer_events[nextEvent].time_rel
slider.value = nextPos
moveSlider(nextPos)
}}
}}
function redoLayerEvent(n) {{
var layer = document.getElementById("layer_" + layer_events[n].filename);
switch (layer_events[n].op) {{
case "flush":
layer.style.visibility = "visible";
break;
case "create_delta":
layer.style.visibility = "visible";
break;
case "create_image":
layer.style.visibility = "visible";
break;
case "delete":
layer.style.visibility = "hidden";
break;
}}
}}
function undoLayerEvent(n) {{
var layer = document.getElementById("layer_" + layer_events[n].filename);
switch (layer_events[n].op) {{
case "flush":
layer.style.visibility = "hidden";
break;
case "create_delta":
layer.style.visibility = "hidden";
break;
case "create_image":
layer.style.visibility = "hidden";
break;
case "delete":
layer.style.visibility = "visible";
break;
}}
}}
var last_slider_pos = 0
var last_layer_event = 0
var moveSlider = function(new_pos) {{
if (new_pos > last_slider_pos) {{
while (last_layer_event < layer_events.length - 1) {{
if (layer_events[last_layer_event + 1].time_rel > new_pos) {{
break;
}}
last_layer_event += 1;
redoLayerEvent(last_layer_event)
}}
}}
if (new_pos < last_slider_pos) {{
while (last_layer_event >= 0) {{
if (layer_events[last_layer_event].time_rel <= new_pos) {{
break;
}}
undoLayerEvent(last_layer_event)
last_layer_event -= 1;
}}
}}
last_slider_pos = new_pos;
document.getElementById("debug_pos").textContent=new_pos;
if (last_layer_event >= 0) {{
document.getElementById("debug_layer_event").textContent=last_layer_event + " " + layer_events[last_layer_event].time_rel + " " + layer_events[last_layer_event].op;
}} else {{
document.getElementById("debug_layer_event").textContent="begin";
}}
}}
</script>
<div class="topbar">
<div class="slidercontainer">
<label for="time-slider">TIME</label>:
<input id="time-slider" class="slider" type="range" min="0" max="{last_time_rel}" value="0" oninput="moveSlider(this.value)"><br>
pos: <span id="debug_pos"></span><br>
event: <span id="debug_layer_event"></span><br>
gc: <span id="debug_gc_event"></span><br>
</div>
<button onclick="startAnimation()">Play</button>
<button onclick="stopAnimation()">Stop</button>
<svg class="legend">
<rect x=5 y=0 width=20 height=20 style="fill:rgb(128,128,128);stroke:rgb(0,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
<line x1=5 y1=30 x2=25 y2=30 style="fill:rgb(128,0,128);stroke:rgb(128,0,128);stroke-width:3;fill-opacity:1;stroke-opacity:1;"/>
<line x1=0 y1=40 x2=30 y2=40 style="fill:none;stroke:rgb(255,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
</svg>
</div>
<div class="main">
{svg}
</div>
</body>
</html>
"#
)?;
Ok(())
}

View File

@@ -1,35 +0,0 @@
use pageserver_compaction::interface::CompactionLayer;
use pageserver_compaction::simulator::MockTimeline;
/// Test the extreme case that there are so many updates for a single key that
/// even if we produce an extremely narrow delta layer, spanning just that one
/// key, we still have too many records to fit in the target file size. We need to
/// split in the LSN dimension too in that case.
///
/// TODO: The code to avoid this problem has not been implemented yet! So the
/// assertion currently fails, but we need to make it not fail.
#[ignore]
#[tokio::test]
async fn test_many_updates_for_single_key() {
let mut executor = MockTimeline::new();
executor.target_file_size = 10_000_000; // 10 MB
// Ingest 100 MB of updates to a single key.
for _ in 1..1000 {
executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
executor.compact().await.unwrap();
}
// Check that all the layers are smaller than the target size (with some slop)
for l in executor.live_layers.iter() {
println!("layer {}: {}", l.short_id(), l.file_size());
}
for l in executor.live_layers.iter() {
assert!(l.file_size() < executor.target_file_size * 2);
// sanity check that none of the delta layers are stupidly small either
if l.is_delta() {
assert!(l.file_size() > executor.target_file_size / 2);
}
}
}

View File

@@ -12,7 +12,7 @@ use std::collections::BinaryHeap;
use std::ops::Range;
use std::{fs, str};
use pageserver::page_cache::{self, PAGE_SZ};
use pageserver::page_cache::PAGE_SZ;
use pageserver::repository::{Key, KEY_SIZE};
use pageserver::tenant::block_io::FileBlockReader;
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
@@ -100,15 +100,13 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH
async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
let file = VirtualFile::open(path).await?;
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader.read_blk(0, ctx).await?;
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0, ctx).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,
actual_summary.index_root_blk,
block_reader,
file,
);
// min-heap (reserve space for one more element added before eviction)
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);

View File

@@ -61,15 +61,13 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
page_cache::init(100);
let file = VirtualFile::open(path).await?;
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader.read_blk(0, ctx).await?;
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0, ctx).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,
actual_summary.index_root_blk,
&block_reader,
&file,
);
// TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
let mut all = vec![];
@@ -85,7 +83,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
ctx,
)
.await?;
let cursor = BlockCursor::new_fileblockreader(&block_reader);
let cursor = BlockCursor::new_fileblockreader(&file);
for (k, v) in all {
let value = cursor.read_blob(v.pos(), ctx).await?;
println!("key:{} value_len:{}", k, value.len());

View File

@@ -8,7 +8,7 @@ use utils::lsn::Lsn;
use rand::prelude::*;
use tokio::sync::Barrier;
use tokio::task::JoinSet;
use tracing::{info, instrument};
use tracing::{debug, info, instrument};
use std::collections::HashMap;
use std::num::NonZeroUsize;
@@ -25,8 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats};
pub(crate) struct Args {
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
page_service_connstring: String,
#[clap(long, default_value = "localhost:64000")]
page_service_host_port: String,
#[clap(long)]
pageserver_jwt: Option<String>,
#[clap(long, default_value = "1")]
@@ -230,9 +230,12 @@ async fn client(
) {
start_work_barrier.wait().await;
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
.await
.unwrap();
let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
&args.page_service_host_port,
args.pageserver_jwt.as_deref(),
))
.await
.unwrap();
while let Some(Work { lsn, gzip }) = work.recv().await {
let start = Instant::now();
@@ -260,7 +263,7 @@ async fn client(
}
})
.await;
info!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {

View File

@@ -3,6 +3,7 @@ use utils::logging;
/// Re-usable pieces of code that aren't CLI-specific.
mod util {
pub(crate) mod connstring;
pub(crate) mod request_stats;
#[macro_use]
pub(crate) mod tokio_thread_local_stats;

View File

@@ -0,0 +1,8 @@
pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
let colon_and_jwt = if let Some(jwt) = jwt {
format!(":{jwt}") // TODO: urlescape
} else {
String::new()
};
format!("postgres://postgres{colon_and_jwt}@{host_port}")
}
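// Expected output (illustrative examples, not from the diff):
//   connstring("localhost:64000", None)          == "postgres://postgres@localhost:64000"
//   connstring("localhost:64000", Some("token")) == "postgres://postgres:token@localhost:64000"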

View File

@@ -14,7 +14,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
}
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
(Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
(Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Pageserver auth",
claims.scope

View File

@@ -143,7 +143,6 @@ where
ar: &'a mut Builder<&'b mut W>,
buf: Vec<u8>,
current_segment: Option<(SlruKind, u32)>,
total_blocks: usize,
}
impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
@@ -155,7 +154,6 @@ where
ar,
buf: Vec::new(),
current_segment: None,
total_blocks: 0,
}
}
@@ -201,8 +199,7 @@ where
let header = new_tar_header(&segname, self.buf.len() as u64)?;
self.ar.append(&header, self.buf.as_slice()).await?;
self.total_blocks += nblocks;
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
self.buf.clear();
@@ -210,15 +207,11 @@ where
}
async fn finish(mut self) -> anyhow::Result<()> {
let res = if self.current_segment.is_none() || self.buf.is_empty() {
Ok(())
} else {
self.flush().await
};
if self.current_segment.is_none() || self.buf.is_empty() {
return Ok(());
}
info!("Collected {} SLRU blocks", self.total_blocks);
res
self.flush().await
}
}

View File

@@ -20,6 +20,7 @@ use std::num::NonZeroUsize;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use toml_edit;
use toml_edit::{Document, Item};
use camino::{Utf8Path, Utf8PathBuf};
@@ -33,7 +34,6 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::timeline::GetVectoredImpl;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
};
@@ -87,10 +87,6 @@ pub mod defaults {
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
///
/// Default built-in configuration file.
///
@@ -130,10 +126,6 @@ pub mod defaults {
#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -211,9 +203,9 @@ pub struct PageServerConf {
pub log_format: LogFormat,
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach.
///
/// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
/// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes
/// loading such tenants, vs. other work in the system.
pub concurrent_tenant_warmup: ConfigurableSemaphore,
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
@@ -270,10 +262,6 @@ pub struct PageServerConf {
pub virtual_file_io_engine: virtual_file::IoEngineKind,
pub get_vectored_impl: GetVectoredImpl,
pub max_vectored_read_bytes: MaxVectoredReadBytes,
pub validate_vectored_get: bool,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -362,10 +350,6 @@ struct PageServerConfigBuilder {
virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
get_vectored_impl: BuilderValue<GetVectoredImpl>,
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
validate_vectored_get: BuilderValue<bool>,
}
impl Default for PageServerConfigBuilder {
@@ -445,10 +429,6 @@ impl Default for PageServerConfigBuilder {
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
)),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
}
}
}
@@ -613,14 +593,6 @@ impl PageServerConfigBuilder {
self.get_vectored_impl = BuilderValue::Set(value);
}
pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
self.max_vectored_read_bytes = BuilderValue::Set(value);
}
pub fn get_validate_vectored_get(&mut self, value: bool) {
self.validate_vectored_get = BuilderValue::Set(value);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_warmup = self
.concurrent_tenant_warmup
@@ -734,12 +706,6 @@ impl PageServerConfigBuilder {
get_vectored_impl: self
.get_vectored_impl
.ok_or(anyhow!("missing get_vectored_impl"))?,
max_vectored_read_bytes: self
.max_vectored_read_bytes
.ok_or(anyhow!("missing max_vectored_read_bytes"))?,
validate_vectored_get: self
.validate_vectored_get
.ok_or(anyhow!("missing validate_vectored_get"))?,
})
}
}
@@ -986,15 +952,6 @@ impl PageServerConf {
"get_vectored_impl" => {
builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
}
"max_vectored_read_bytes" => {
let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
builder.get_max_vectored_read_bytes(
MaxVectoredReadBytes(
NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
}
"validate_vectored_get" => {
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1070,11 +1027,6 @@ impl PageServerConf {
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant"),
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
}
}
}
@@ -1202,7 +1154,10 @@ impl ConfigurableSemaphore {
#[cfg(test)]
mod tests {
use std::{fs, num::NonZeroU32};
use std::{
fs,
num::{NonZeroU32, NonZeroUsize},
};
use camino_tempfile::{tempdir, Utf8TempDir};
use pageserver_api::models::EvictionPolicy;
@@ -1306,11 +1261,6 @@ background_task_maximum_delay = '334 s'
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
},
"Correct defaults should be used when no config values are provided"
);
@@ -1376,11 +1326,6 @@ background_task_maximum_delay = '334 s'
ingest_batch_size: 100,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -17,7 +17,7 @@ use tracing::*;
use utils::id::NodeId;
mod metrics;
use crate::consumption_metrics::metrics::MetricsKey;
use metrics::MetricsKey;
mod disk_cache;
mod upload;

View File

@@ -1,5 +1,7 @@
use super::*;
use std::collections::HashMap;
use std::time::SystemTime;
use utils::lsn::Lsn;
#[test]
fn startup_collected_timeline_metrics_before_advancing() {

View File

@@ -2,10 +2,10 @@ use std::collections::HashMap;
use futures::Future;
use pageserver_api::{
shard::TenantShardId,
upcall_api::{
control_api::{
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
},
shard::TenantShardId,
};
use serde::{de::DeserializeOwned, Serialize};
use tokio_util::sync::CancellationToken;

View File

@@ -20,9 +20,10 @@ use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize;
use serde::Serialize;
use thiserror::Error;
use tokio;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use tracing::{debug, error};
use tracing::{self, debug, error};
use utils::crashsafe::path_with_suffix_extension;
use utils::generation::Generation;
use utils::id::TimelineId;
@@ -725,7 +726,7 @@ mod test {
use camino::Utf8Path;
use hex_literal::hex;
use pageserver_api::shard::ShardIndex;
use std::io::ErrorKind;
use std::{io::ErrorKind, time::Duration};
use tracing::info;
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -734,7 +735,10 @@ mod test {
use crate::{
control_plane_client::RetryForeverError,
repository::Key,
tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
tenant::{
harness::TenantHarness, remote_timeline_client::remote_timeline_path,
storage_layer::DeltaFileName,
},
};
use super::*;
@@ -1157,8 +1161,13 @@ mod test {
pub(crate) mod mock {
use tracing::info;
use crate::tenant::remote_timeline_client::remote_layer_path;
use super::*;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc,
};
pub struct ConsumerState {
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,

View File

@@ -58,7 +58,6 @@ use utils::{completion, id::TimelineId};
use crate::{
config::PageServerConf,
metrics::disk_usage_based_eviction::METRICS,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
self,
@@ -66,6 +65,7 @@ use crate::{
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
Timeline,
},
};
@@ -409,23 +409,13 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
"running disk usage based eviction due to pressure"
);
let (candidates, collection_time) = {
let started_at = std::time::Instant::now();
let candidates =
match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
EvictionCandidates::Cancelled => {
return Ok(IterationOutcome::Cancelled);
}
EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()),
}
};
METRICS.layers_collected.inc_by(candidates.len() as u64);
tracing::info!(
elapsed_ms = collection_time.as_millis(),
total_layers = candidates.len(),
"collection completed"
);
EvictionCandidates::Finished(partitioned) => partitioned,
};
// Debug-log the list of candidates
let now = SystemTime::now();
@@ -456,10 +446,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
let (evicted_amount, usage_planned) =
select_victims(&candidates, usage_pre).into_amount_and_planned();
let selection = select_victims(&candidates, usage_pre);
METRICS.layers_selected.inc_by(evicted_amount as u64);
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
// phase2: evict layers
@@ -488,15 +477,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
if let Some(next) = next {
match next {
Ok(Ok(file_size)) => {
METRICS.layers_evicted.inc();
usage_assumed.add_available_bytes(file_size);
}
Ok(Err((
file_size,
EvictionError::NotFound
| EvictionError::Downloaded
| EvictionError::Timeout,
))) => {
Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
@@ -512,10 +495,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// calling again when consumed_all is fine as evicted is fused.
let Some((_partition, candidate)) = evicted.next() else {
if !consumed_all {
tracing::info!("all evictions started, waiting");
consumed_all = true;
}
consumed_all = true;
continue;
};
@@ -523,15 +503,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
EvictionLayer::Attached(layer) => {
let file_size = layer.layer_desc().file_size;
js.spawn(async move {
// have a low eviction waiting timeout because our LRU calculations go stale fast;
// also individual layer evictions could hang because of bugs and we do not want to
// pause disk_usage_based_eviction for such.
let timeout = std::time::Duration::from_secs(5);
match layer.evict_and_wait(timeout).await {
Ok(()) => Ok(file_size),
Err(e) => Err((file_size, e)),
}
layer
.evict_and_wait()
.await
.map(|()| file_size)
.map_err(|e| (file_size, e))
});
}
EvictionLayer::Secondary(layer) => {
@@ -553,30 +529,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
(usage_assumed, evictions_failed)
};
let started_at = std::time::Instant::now();
let evict_layers = async move {
let mut evict_layers = std::pin::pin!(evict_layers);
let maximum_expected = std::time::Duration::from_secs(10);
let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await;
let tuple = if let Ok(tuple) = res {
tuple
} else {
let elapsed = started_at.elapsed();
tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing");
evict_layers.await
};
let elapsed = started_at.elapsed();
tracing::info!(elapsed_ms = elapsed.as_millis(), "completed");
tuple
};
let evict_layers =
evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount));
let (usage_assumed, evictions_failed) = tokio::select! {
tuple = evict_layers => { tuple },
_ = cancel.cancelled() => {
@@ -811,8 +763,6 @@ async fn collect_eviction_candidates(
eviction_order: EvictionOrder,
cancel: &CancellationToken,
) -> anyhow::Result<EvictionCandidates> {
const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
// get a snapshot of the list of tenants
let tenants = tenant::mgr::list_tenants()
.await
@@ -841,8 +791,6 @@ async fn collect_eviction_candidates(
continue;
}
let started_at = std::time::Instant::now();
// collect layers from all timelines in this tenant
//
// If one of the timelines becomes `!is_active()` during the iteration,
@@ -857,7 +805,6 @@ async fn collect_eviction_candidates(
}
let info = tl.get_local_layers_for_disk_usage_eviction().await;
debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
tenant_candidates.extend(info.resident_layers.into_iter());
max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
@@ -923,25 +870,7 @@ async fn collect_eviction_candidates(
(partition, candidate)
});
METRICS
.tenant_layer_count
.observe(tenant_candidates.len() as f64);
candidates.extend(tenant_candidates);
let elapsed = started_at.elapsed();
METRICS
.tenant_collection_time
.observe(elapsed.as_secs_f64());
if elapsed > LOG_DURATION_THRESHOLD {
tracing::info!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
elapsed_ms = elapsed.as_millis(),
"collection took longer than threshold"
);
}
}
// Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -956,11 +885,11 @@ async fn collect_eviction_candidates(
},
);
for tenant in secondary_tenants {
for secondary_tenant in secondary_tenants {
// for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
// to prevent repeated disk usage based evictions from completely draining less often
// updating secondaries.
let (mut layer_info, total_layers) = tenant.get_layers_for_eviction();
let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
debug_assert!(
total_layers >= layer_info.resident_layers.len(),
@@ -968,8 +897,6 @@ async fn collect_eviction_candidates(
layer_info.resident_layers.len()
);
let started_at = std::time::Instant::now();
layer_info
.resident_layers
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
@@ -991,27 +918,9 @@ async fn collect_eviction_candidates(
)
});
METRICS
.tenant_layer_count
.observe(tenant_candidates.len() as f64);
candidates.extend(tenant_candidates);
tokio::task::yield_now().await;
let elapsed = started_at.elapsed();
METRICS
.tenant_collection_time
.observe(elapsed.as_secs_f64());
if elapsed > LOG_DURATION_THRESHOLD {
tracing::info!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
elapsed_ms = elapsed.as_millis(),
"collection took longer than threshold"
);
}
}
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
@@ -1088,6 +997,30 @@ impl<U: Usage> VictimSelection<U> {
}
}
struct TimelineKey(Arc<Timeline>);
impl PartialEq for TimelineKey {
fn eq(&self, other: &Self) -> bool {
Arc::ptr_eq(&self.0, &other.0)
}
}
impl Eq for TimelineKey {}
impl std::hash::Hash for TimelineKey {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
Arc::as_ptr(&self.0).hash(state);
}
}
impl std::ops::Deref for TimelineKey {
type Target = Timeline;
fn deref(&self) -> &Self::Target {
self.0.as_ref()
}
}
/// A totally ordered f32 subset we can use with sorting functions.
pub(crate) mod finite_f32 {

View File

@@ -579,12 +579,6 @@ paths:
required: false
schema:
type: integer
- name: lazy
in: query
required: false
schema:
type: boolean
description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default.
put:
description: |
Configures a _tenant location_, that is how a particular pageserver handles

View File

@@ -661,14 +661,9 @@ async fn timeline_detail_handler(
// Logical size calculation needs downloading.
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let state = get_state(&request);
let timeline_info = async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id, false)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
let timeline = tenant
.get_timeline(timeline_id, false)
@@ -701,7 +696,6 @@ async fn get_lsn_by_timestamp_handler(
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
if !tenant_shard_id.is_zero() {
// Requires SLRU contents, which are only stored on shard zero
@@ -718,10 +712,7 @@ async fn get_lsn_by_timestamp_handler(
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let result = timeline
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
.await?;
@@ -752,7 +743,6 @@ async fn get_timestamp_of_lsn_handler(
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
if !tenant_shard_id.is_zero() {
// Requires SLRU contents, which are only stored on shard zero
@@ -769,9 +759,7 @@ async fn get_timestamp_of_lsn_handler(
.map_err(ApiError::BadRequest)?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
match result {
@@ -816,7 +804,13 @@ async fn tenant_attach_handler(
let tenant = state
.tenant_manager
.upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx)
.upsert_location(
tenant_shard_id,
location_conf,
None,
SpawnMode::Normal,
&ctx,
)
.await?;
let Some(tenant) = tenant else {
@@ -1165,13 +1159,10 @@ async fn layer_map_info_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let reset: LayerAccessStatsReset =
parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);
let state = get_state(&request);
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let layer_map_info = timeline.layer_map_info(reset).await;
json_response(StatusCode::OK, layer_map_info)
@@ -1185,11 +1176,8 @@ async fn layer_download_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let layer_file_name = get_request_param(&request, "layer_file_name")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let downloaded = timeline
.download_layer(layer_file_name)
.await
@@ -1213,11 +1201,8 @@ async fn evict_timeline_layer_handler(
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let layer_file_name = get_request_param(&request, "layer_file_name")?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let evicted = timeline
.evict_layer(layer_file_name)
.await
@@ -1412,7 +1397,6 @@ async fn put_tenant_location_config_handler(
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false);
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
@@ -1443,17 +1427,15 @@ async fn put_tenant_location_config_handler(
let location_conf =
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
// lazy==true queues up for activation or jumps the queue like normal when a compute connects,
// similar to the ordering at startup.
let spawn_mode = if lazy {
tenant::SpawnMode::Lazy
} else {
tenant::SpawnMode::Eager
};
let attached = state
.tenant_manager
.upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
.upsert_location(
tenant_shard_id,
location_conf,
flush,
tenant::SpawnMode::Normal,
&ctx,
)
.await?
.is_some();
@@ -1630,19 +1612,13 @@ async fn timeline_compact_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let mut flags = EnumSet::empty();
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
flags |= CompactFlags::ForceRepartition;
}
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
flags |= CompactFlags::ForceImageLayerCreation;
}
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
timeline
.compact(&cancel, flags, &ctx)
.await
@@ -1662,19 +1638,13 @@ async fn timeline_checkpoint_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let mut flags = EnumSet::empty();
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
flags |= CompactFlags::ForceRepartition;
}
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
flags |= CompactFlags::ForceImageLayerCreation;
}
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
timeline
.freeze_and_flush()
.await
@@ -1699,11 +1669,7 @@ async fn timeline_download_remote_layers_handler_post(
let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
match timeline.spawn_download_all_remote_layers(body).await {
Ok(st) => json_response(StatusCode::ACCEPTED, st),
Err(st) => json_response(StatusCode::CONFLICT, st),
@@ -1717,11 +1683,8 @@ async fn timeline_download_remote_layers_handler_get(
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let info = timeline
.get_download_all_remote_layers_task_info()
.context("task never started since last pageserver process start")
@@ -1770,7 +1733,6 @@ async fn getpage_at_lsn_handler(
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
struct Key(crate::repository::Key);
@@ -1789,7 +1751,7 @@ async fn getpage_at_lsn_handler(
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let page = timeline.get(key.0, lsn, &ctx).await?;
@@ -1812,13 +1774,12 @@ async fn timeline_collect_keyspace(
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
let keys = timeline
.collect_keyspace(at_lsn, &ctx)
@@ -1834,14 +1795,10 @@ async fn timeline_collect_keyspace(
}
async fn active_timeline_of_active_tenant(
tenant_manager: &TenantManager,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<Arc<Timeline>, ApiError> {
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))
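For reference, a minimal self-contained sketch of the shape this helper takes after the change: resolve the tenant shard, then fetch the timeline, mapping a lookup failure to the API's not-found error. All types below are toy stand-ins, not the pageserver's.

use std::sync::Arc;

#[derive(Debug)]
enum ApiError {
    NotFound(String),
}

struct Tenant;
struct Timeline;

impl Tenant {
    fn get_timeline(&self, id: u32, active_only: bool) -> Result<Arc<Timeline>, String> {
        let _ = active_only;
        if id == 0 {
            Err("timeline 0 not found".into())
        } else {
            Ok(Arc::new(Timeline))
        }
    }
}

fn get_tenant(_shard: u32, _active_only: bool) -> Result<Tenant, ApiError> {
    Ok(Tenant)
}

// Same two-step shape as the handler helper: tenant lookup first, then timeline,
// with the timeline lookup error downgraded to a not-found API error.
fn active_timeline_of_active_tenant(shard: u32, timeline_id: u32) -> Result<Arc<Timeline>, ApiError> {
    let tenant = get_tenant(shard, true)?;
    tenant.get_timeline(timeline_id, true).map_err(ApiError::NotFound)
}

fn main() {
    assert!(active_timeline_of_active_tenant(0, 7).is_ok());
    assert!(matches!(active_timeline_of_active_tenant(0, 0), Err(ApiError::NotFound(_))));
}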

View File

@@ -642,6 +642,26 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|
.expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
});
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
// or in testing they estimate how much we would upload if we did.
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_created_persistent_files_total",
"Number of files created that are meant to be uploaded to cloud storage",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_written_persistent_bytes_total",
"Total bytes written that are meant to be uploaded to cloud storage",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_eviction_iteration_duration_seconds_global",
@@ -1782,6 +1802,8 @@ pub(crate) struct TimelineMetrics {
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
pub num_persistent_files_created: IntCounter,
pub persistent_bytes_written: IntCounter,
pub evictions: IntCounter,
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
}
@@ -1863,6 +1885,12 @@ impl TimelineMetrics {
};
let directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>> =
Lazy::new(Box::new(directory_entries_count_gauge_closure));
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let evictions = EVICTIONS
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -1884,6 +1912,8 @@ impl TimelineMetrics {
resident_physical_size_gauge,
current_logical_size_gauge,
directory_entries_count_gauge,
num_persistent_files_created,
persistent_bytes_written,
evictions,
evictions_with_low_residence_duration: std::sync::RwLock::new(
evictions_with_low_residence_duration,
@@ -1893,6 +1923,8 @@ impl TimelineMetrics {
pub(crate) fn record_new_file_metrics(&self, sz: u64) {
self.resident_physical_size_add(sz);
self.num_persistent_files_created.inc_by(1);
self.persistent_bytes_written.inc_by(sz);
}
pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
@@ -1915,16 +1947,20 @@ impl Drop for TimelineMetrics {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ =
RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
}
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
}
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ =
NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
@@ -2473,64 +2509,6 @@ pub(crate) mod tenant_throttling {
}
}
pub(crate) mod disk_usage_based_eviction {
use super::*;
pub(crate) struct Metrics {
pub(crate) tenant_collection_time: Histogram,
pub(crate) tenant_layer_count: Histogram,
pub(crate) layers_collected: IntCounter,
pub(crate) layers_selected: IntCounter,
pub(crate) layers_evicted: IntCounter,
}
impl Default for Metrics {
fn default() -> Self {
let tenant_collection_time = register_histogram!(
"pageserver_disk_usage_based_eviction_tenant_collection_seconds",
"Time spent collecting layers from a tenant -- not normalized by collected layer amount",
vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
)
.unwrap();
let tenant_layer_count = register_histogram!(
"pageserver_disk_usage_based_eviction_tenant_collected_layers",
"Amount of layers gathered from a tenant",
vec![5.0, 50.0, 500.0, 5000.0, 50000.0]
)
.unwrap();
let layers_collected = register_int_counter!(
"pageserver_disk_usage_based_eviction_collected_layers_total",
"Amount of layers collected"
)
.unwrap();
let layers_selected = register_int_counter!(
"pageserver_disk_usage_based_eviction_select_layers_total",
"Amount of layers selected"
)
.unwrap();
let layers_evicted = register_int_counter!(
"pageserver_disk_usage_based_eviction_evicted_layers_total",
"Amount of layers successfully evicted"
)
.unwrap();
Self {
tenant_collection_time,
tenant_layer_count,
layers_collected,
layers_selected,
layers_evicted,
}
}
}
pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
}
pub fn preinitialize_metrics() {
// Python tests need these, and we alert on some of them.
//
@@ -2565,7 +2543,6 @@ pub fn preinitialize_metrics() {
Lazy::force(&TENANT_MANAGER);
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
Lazy::force(&disk_usage_based_eviction::METRICS);
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
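For reference, a minimal sketch of the per-timeline counter lifecycle shown in this file, using the prometheus and once_cell crates directly: register a labeled counter vector once, resolve the per-timeline child at construction, increment it on writes, and remove the label set on Drop so stale series stop being exported. Names are illustrative, not the pageserver's.

use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounter, IntCounterVec};

static BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "example_written_persistent_bytes_total",
        "Total bytes written that are meant to be uploaded to cloud storage",
        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

struct TimelineMetrics {
    persistent_bytes_written: IntCounter,
    labels: [String; 3],
}

impl TimelineMetrics {
    fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self {
        // Resolve the labeled child once; increments are then lock-free.
        let persistent_bytes_written = BYTES_WRITTEN
            .get_metric_with_label_values(&[tenant_id, shard_id, timeline_id])
            .unwrap();
        let labels = [tenant_id.into(), shard_id.into(), timeline_id.into()];
        Self { persistent_bytes_written, labels }
    }

    fn record_new_file_metrics(&self, sz: u64) {
        self.persistent_bytes_written.inc_by(sz);
    }
}

impl Drop for TimelineMetrics {
    // Remove the label set when the timeline goes away so stale series
    // are not exported forever.
    fn drop(&mut self) {
        let [t, s, tl] = &self.labels;
        let _ = BYTES_WRITTEN.remove_label_values(&[t.as_str(), s.as_str(), tl.as_str()]);
    }
}

fn main() {
    let m = TimelineMetrics::new("t1", "0001", "tl1");
    m.record_new_file_metrics(4096);
}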

View File

@@ -73,6 +73,7 @@
use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
sync::{
atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
Arc, Weak,
@@ -261,9 +262,7 @@ pub struct PageCache {
size_metrics: &'static PageCacheSizeMetrics,
}
struct PinnedSlotsPermit {
_permit: tokio::sync::OwnedSemaphorePermit,
}
struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
///
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
@@ -559,9 +558,9 @@ impl PageCache {
)
.await
{
Ok(res) => Ok(PinnedSlotsPermit {
_permit: res.expect("this semaphore is never closed"),
}),
Ok(res) => Ok(PinnedSlotsPermit(
res.expect("this semaphore is never closed"),
)),
Err(_timeout) => {
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
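A small sketch of the permit-newtype pattern above, assuming tokio's Semaphore: wrap the owned permit in a tuple struct so releasing the slot is tied to the guard's drop, and bound the wait with a timeout. Names are illustrative.

use std::{sync::Arc, time::Duration};
use tokio::sync::{OwnedSemaphorePermit, Semaphore};

// Holding a PinnedSlotPermit is what entitles the caller to a page-cache slot;
// dropping it releases the slot back to the semaphore.
struct PinnedSlotPermit(OwnedSemaphorePermit);

async fn try_pin(slots: &Arc<Semaphore>) -> Result<PinnedSlotPermit, &'static str> {
    match tokio::time::timeout(Duration::from_secs(10), Arc::clone(slots).acquire_owned()).await {
        Ok(permit) => Ok(PinnedSlotPermit(permit.expect("this semaphore is never closed"))),
        Err(_timeout) => Err("timed out waiting for a pinned slot"),
    }
}

#[tokio::main]
async fn main() {
    let slots = Arc::new(Semaphore::new(2));
    let _pinned = try_pin(&slots).await.unwrap();
    // The slot is released here when `_pinned` is dropped.
}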

View File

@@ -27,7 +27,7 @@ use pageserver_api::models::{
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::ShardNumber;
use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
use pq_proto::framed::ConnectionError;
use pq_proto::FeStartupPacket;
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
@@ -44,6 +44,7 @@ use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::field;
use tracing::*;
use utils::id::ConnectionId;
use utils::sync::gate::GateGuard;
@@ -1114,10 +1115,7 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = match self.get_cached_timeline_for_page(req) {
Ok(tl) => {
set_tracing_field_shard_id(tl);
tl
}
Ok(tl) => tl,
Err(key) => {
match self
.load_timeline_for_page(tenant_id, timeline_id, key)
@@ -1142,6 +1140,9 @@ impl PageServerHandler {
}
};
// load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
set_tracing_field_shard_id(timeline);
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
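A sketch of the single record-the-field call site this hunk converges on, assuming a tracing span declared with an empty shard_id field: both the cached and the slow lookup path fall through to one record() call. The span name and field are illustrative.

use tracing::{field, info_span};

// Fill in a span field that was declared Empty at span creation time.
fn set_tracing_field_shard_id(shard_id: &str) {
    tracing::Span::current().record("shard_id", shard_id);
}

fn main() {
    // Without a subscriber installed the record call is a no-op, but the flow is the same.
    let span = info_span!("handle_get_page_at_lsn_request", shard_id = field::Empty);
    let _entered = span.enter();
    set_tracing_field_shard_id("0001"); // both lookup paths converge on this one call
}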

View File

@@ -15,6 +15,7 @@ use crate::walrecord::NeonWalRecord;
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use pageserver_api::key::{
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -35,8 +36,6 @@ use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::{bin_ser::BeSer, lsn::Lsn};
const MAX_AUX_FILE_DELTAS: usize = 1024;
#[derive(Debug)]
pub enum LsnForTimestamp {
/// Found commits both before and after the given timestamp
@@ -158,6 +157,7 @@ impl Timeline {
pending_updates: HashMap::new(),
pending_deletions: Vec::new(),
pending_nblocks: 0,
pending_aux_files: None,
pending_directory_entries: Vec::new(),
lsn,
}
@@ -873,6 +873,11 @@ pub struct DatadirModification<'a> {
pending_deletions: Vec<(Range<Key>, Lsn)>,
pending_nblocks: i64,
// If we already wrote any aux file changes in this modification, stash the latest dir. If set,
// [`Self::put_file`] may assume that it is safe to emit a delta rather than checking
// if AUX_FILES_KEY is already set.
pending_aux_files: Option<AuxFilesDirectory>,
/// For special "directory" keys that store key-value maps, track the size of the map
/// if it was updated in this modification.
pending_directory_entries: Vec<(DirectoryKind, usize)>,
@@ -1396,28 +1401,19 @@ impl<'a> DatadirModification<'a> {
Some(Bytes::copy_from_slice(content))
};
let n_files;
let mut aux_files = self.tline.aux_files.lock().await;
if let Some(mut dir) = aux_files.dir.take() {
let dir = if let Some(mut dir) = self.pending_aux_files.take() {
// We already updated aux files in `self`: emit a delta and update our latest value
dir.upsert(file_path.clone(), content.clone());
n_files = dir.files.len();
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
self.put(
AUX_FILES_KEY,
Value::Image(Bytes::from(
AuxFilesDirectory::ser(&dir).context("serialize")?,
)),
);
aux_files.n_deltas = 0;
} else {
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
);
aux_files.n_deltas += 1;
}
aux_files.dir = Some(dir);
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile {
file_path: file_path.clone(),
content: content.clone(),
}),
);
dir.upsert(file_path, content);
dir
} else {
// Check if the AUX_FILES_KEY is initialized
match self.get(AUX_FILES_KEY, ctx).await {
@@ -1432,8 +1428,7 @@ impl<'a> DatadirModification<'a> {
}),
);
dir.upsert(file_path, content);
n_files = dir.files.len();
aux_files.dir = Some(dir);
dir
}
Err(
e @ (PageReconstructError::AncestorStopping(_)
@@ -1460,14 +1455,14 @@ impl<'a> DatadirModification<'a> {
AuxFilesDirectory::ser(&dir).context("serialize")?,
)),
);
n_files = 1;
aux_files.dir = Some(dir);
dir
}
}
}
};
self.pending_directory_entries
.push((DirectoryKind::AuxFiles, n_files));
.push((DirectoryKind::AuxFiles, dir.files.len()));
self.pending_aux_files = Some(dir);
Ok(())
}
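A toy sketch of the delta-versus-image cadence in the branch above: emit cheap deltas for most updates, but write a full image every MAX_AUX_FILE_DELTAS updates so reads never replay an unbounded chain. Types are stand-ins.

const MAX_AUX_FILE_DELTAS: usize = 1024;

enum Value {
    Image(Vec<u8>),
    Delta(Vec<u8>),
}

struct AuxFileState {
    n_deltas: usize,
}

impl AuxFileState {
    // Emit a cheap delta most of the time, but a full image every
    // MAX_AUX_FILE_DELTAS updates, which caps how many deltas a read
    // has to replay on top of the last image.
    fn next_write(&mut self, full_image: Vec<u8>, delta: Vec<u8>) -> Value {
        if self.n_deltas == MAX_AUX_FILE_DELTAS {
            self.n_deltas = 0;
            Value::Image(full_image)
        } else {
            self.n_deltas += 1;
            Value::Delta(delta)
        }
    }
}

fn main() {
    let mut state = AuxFileState { n_deltas: MAX_AUX_FILE_DELTAS };
    // The chain is at its cap, so the next write must be a full image.
    assert!(matches!(state.next_write(vec![0; 8], vec![1]), Value::Image(_)));
}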
@@ -1498,7 +1493,7 @@ impl<'a> DatadirModification<'a> {
return Ok(());
}
let writer = self.tline.writer().await;
let mut writer = self.tline.writer().await;
// Flush relation and SLRU data blocks, keep metadata.
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
@@ -1537,13 +1532,23 @@ impl<'a> DatadirModification<'a> {
/// All the modifications in this atomic update are stamped by the specified LSN.
///
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
let writer = self.tline.writer().await;
let mut writer = self.tline.writer().await;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
if !self.pending_updates.is_empty() {
writer.put_batch(&self.pending_updates, ctx).await?;
let prev_pending_updates = std::mem::take(&mut self.pending_updates);
// The put_batch call below expects the inputs to be sorted by Lsn,
// so we do that first.
let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = prev_pending_updates
.into_iter()
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
.kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
.collect();
writer.put_batch(lsn_ordered_batch, ctx).await?;
self.pending_updates.clear();
}
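A self-contained sketch of the k-way merge step above, using itertools: each key carries an Lsn-sorted Vec of values, and kmerge_by merges those runs into one batch that is globally ordered by Lsn, the precondition put_batch relies on. Toy types stand in for Key, Lsn, and Value.

use itertools::Itertools;
use std::collections::HashMap;

type Key = &'static str;
type Lsn = u64; // the real Lsn is a newtype, hence the `.1 .0` in the diff above
type Value = &'static str;

fn main() {
    let mut pending: HashMap<Key, Vec<(Lsn, Value)>> = HashMap::new();
    pending.insert("a", vec![(1, "a@1"), (4, "a@4")]);
    pending.insert("b", vec![(2, "b@2"), (3, "b@3")]);

    // Each per-key Vec is already sorted by Lsn, so a k-way merge of those runs
    // yields one batch sorted by Lsn overall.
    let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = pending
        .into_iter()
        .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
        .kmerge_by(|lhs, rhs| lhs.1 < rhs.1)
        .collect();

    assert!(lsn_ordered_batch.windows(2).all(|w| w[0].1 <= w[1].1));
}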

View File

@@ -37,6 +37,7 @@ impl Value {
mod test {
use super::*;
use bytes::Bytes;
use utils::bin_ser::BeSer;
macro_rules! roundtrip {

View File

@@ -109,6 +109,7 @@ pub use pageserver_api::models::TenantState;
use tokio::sync::Semaphore;
static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
use toml_edit;
use utils::{
crashsafe,
generation::Generation,
@@ -144,7 +145,6 @@ macro_rules! pausable_failpoint {
pub mod blob_io;
pub mod block_io;
pub mod vectored_blob_io;
pub mod disk_btree;
pub(crate) mod ephemeral_file;
@@ -226,11 +226,7 @@ pub(crate) struct TenantPreload {
/// When we spawn a tenant, there is a special mode for tenant creation that
/// avoids trying to read anything from remote storage.
pub(crate) enum SpawnMode {
/// Activate as soon as possible
Eager,
/// Lazy activation in the background, with the option to skip the queue if the need comes up
Lazy,
/// Tenant has been created during the lifetime of this process
Normal,
Create,
}
@@ -703,37 +699,41 @@ impl Tenant {
.and_then(|x| x.initial_tenant_load_remote.take());
enum AttachType<'a> {
/// We are attaching this tenant lazily in the background.
Warmup {
_permit: tokio::sync::SemaphorePermit<'a>,
during_startup: bool
},
/// We are attaching this tenant as soon as we can, because for example an
/// endpoint tried to access it.
// During pageserver startup, we are attaching this tenant lazily in the background
Warmup(tokio::sync::SemaphorePermit<'a>),
// During pageserver startup, we are attaching this tenant as soon as we can,
// because a client tried to access it.
OnDemand,
/// During normal operations after startup, we are attaching a tenant, and
/// eager attach was requested.
// During normal operations after startup, we are attaching a tenant.
Normal,
}
let attach_type = if matches!(mode, SpawnMode::Lazy) {
// Before doing any I/O, wait for at least one of:
// - A client attempting to access this tenant (on-demand loading)
// - A permit becoming available in the warmup semaphore (background warmup)
// Before doing any I/O, wait for either of:
// - A client to attempt to access this tenant (on-demand loading)
// - A permit to become available in the warmup semaphore (background warmup)
//
// Some-ness of init_order is how we know if we're attaching during startup or later
// in the process lifetime.
let attach_type = if init_order.is_some() {
tokio::select!(
permit = tenant_clone.activate_now_sem.acquire() => {
let _ = permit.expect("activate_now_sem is never closed");
_ = tenant_clone.activate_now_sem.acquire() => {
tracing::info!("Activating tenant (on-demand)");
AttachType::OnDemand
},
permit = conf.concurrent_tenant_warmup.inner().acquire() => {
let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed");
tracing::info!("Activating tenant (warmup)");
AttachType::Warmup {
_permit,
during_startup: init_order.is_some()
permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
match permit_result {
Ok(p) => {
tracing::info!("Activating tenant (warmup)");
AttachType::Warmup(p)
}
Err(_) => {
// This is unexpected: the warmup semaphore should stay alive
// for the lifetime of init_order. Log a warning and proceed.
tracing::warn!("warmup_limit semaphore unexpectedly closed");
AttachType::Normal
}
}
}
_ = tenant_clone.cancel.cancelled() => {
// This is safe, but should be pretty rare: it is interesting if a tenant
@@ -748,8 +748,6 @@ impl Tenant {
},
)
} else {
// SpawnMode::{Create,Eager} always cause jumping ahead of the
// concurrent_tenant_warmup queue
AttachType::Normal
};
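A self-contained sketch of the startup race above, using tokio semaphores: the attach task waits for whichever comes first, an explicit activate-now signal or a slot in the warmup semaphore. Names are illustrative, not the pageserver's.

use std::sync::Arc;
use tokio::sync::Semaphore;

enum AttachType<'a> {
    Warmup(tokio::sync::SemaphorePermit<'a>),
    OnDemand,
}

async fn wait_turn<'a>(
    activate_now: &'a Semaphore,
    warmup_limit: &'a Semaphore,
) -> AttachType<'a> {
    tokio::select! {
        _ = activate_now.acquire() => {
            // A client asked for this tenant: skip the warmup queue.
            AttachType::OnDemand
        }
        permit = warmup_limit.acquire() => {
            // Our turn in the background warmup queue; hold the permit while attaching.
            AttachType::Warmup(permit.expect("warmup semaphore is never closed"))
        }
    }
}

#[tokio::main]
async fn main() {
    let activate_now = Arc::new(Semaphore::new(0)); // no on-demand signal yet
    let warmup_limit = Arc::new(Semaphore::new(8)); // at most 8 concurrent warmups
    match wait_turn(&activate_now, &warmup_limit).await {
        AttachType::OnDemand => println!("activating on demand"),
        AttachType::Warmup(_permit) => println!("activating via warmup queue"),
    }
}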
@@ -757,7 +755,7 @@ impl Tenant {
(SpawnMode::Create, _) => {
None
},
(SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
(SpawnMode::Normal, Some(remote_storage)) => {
let _preload_timer = TENANT.preload.start_timer();
let res = tenant_clone
.preload(remote_storage, task_mgr::shutdown_token())
@@ -770,7 +768,7 @@ impl Tenant {
}
}
}
(_, None) => {
(SpawnMode::Normal, None) => {
let _preload_timer = TENANT.preload.start_timer();
None
}
@@ -829,7 +827,7 @@ impl Tenant {
let attached = {
let _attach_timer = match mode {
SpawnMode::Create => None,
SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()),
SpawnMode::Normal => Some(TENANT.attach.start_timer()),
};
tenant_clone.attach(preload, mode, &ctx).await
};
@@ -851,7 +849,7 @@ impl Tenant {
// It also prevents the warmup process from competing with the concurrency limit on
// logical size calculations: if logical size calculation semaphore is saturated,
// then warmup will wait for that before proceeding to the next tenant.
if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) {
if let AttachType::Warmup(_permit) = attach_type {
let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
tracing::info!("Waiting for initial logical sizes while warming up...");
while futs.next().await.is_some() {}
@@ -924,7 +922,7 @@ impl Tenant {
deleting: false,
timelines: HashMap::new(),
},
(None, _) => {
(None, SpawnMode::Normal) => {
anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
}
};
@@ -2383,7 +2381,7 @@ impl Tenant {
self.tenant_shard_id,
self.generation,
self.shard_identity,
self.walredo_mgr.clone(),
self.walredo_mgr.as_ref().map(Arc::clone),
resources,
pg_version,
state,
@@ -2575,24 +2573,19 @@ impl Tenant {
legacy_config_path: &Utf8Path,
location_conf: &LocationConf,
) -> anyhow::Result<()> {
// Forward compat: write out an old-style configuration that old versions can read, in case we roll back
Self::persist_tenant_config_legacy(
tenant_shard_id,
legacy_config_path,
&location_conf.tenant_conf,
)
.await?;
if let LocationMode::Attached(attach_conf) = &location_conf.mode {
// The modern-style LocationConf config file requires a generation to be set. In case someone
// is running a pageserver without the infrastructure to set generations, write out the legacy-style
// config file that only contains TenantConf.
//
// This will eventually be removed in https://github.com/neondatabase/neon/issues/5388
// Once we use LocationMode, generations are mandatory. If we aren't using generations,
// then drop out after writing legacy-style config.
if attach_conf.generation.is_none() {
tracing::info!(
"Running without generations, writing legacy-style tenant config file"
);
Self::persist_tenant_config_legacy(
tenant_shard_id,
legacy_config_path,
&location_conf.tenant_conf,
)
.await?;
tracing::debug!("Running without generations, not writing new-style LocationConf");
return Ok(());
}
}
@@ -3463,8 +3456,9 @@ impl Tenant {
// Run each timeline's flush in a task holding the timeline's gate: this
// means that if this function's future is cancelled, the Timeline shutdown
// will still wait for any I/O in here to complete.
let Ok(gate) = timeline.gate.enter() else {
continue;
let gate = match timeline.gate.enter() {
Ok(g) => g,
Err(_) => continue,
};
let jh = tokio::task::spawn(async move { flush_timeline(gate, timeline).await });
results.push(jh);
@@ -3592,18 +3586,25 @@ pub async fn dump_layerfile_from_path(
#[cfg(test)]
pub(crate) mod harness {
use bytes::{Bytes, BytesMut};
use camino::Utf8PathBuf;
use once_cell::sync::OnceCell;
use pageserver_api::models::ShardParameters;
use pageserver_api::shard::ShardIndex;
use std::fs;
use std::sync::Arc;
use utils::logging;
use utils::lsn::Lsn;
use crate::deletion_queue::mock::MockDeletionQueue;
use crate::walredo::apply_neon;
use crate::{repository::Key, walrecord::NeonWalRecord};
use crate::{
config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
};
use super::*;
use crate::tenant::config::{TenantConf, TenantConfOpt};
use hex_literal::hex;
use utils::id::TenantId;
use utils::id::{TenantId, TimelineId};
pub const TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("11223344556677881122334455667788"));
@@ -3627,7 +3628,6 @@ pub(crate) mod harness {
compaction_target_size: Some(tenant_conf.compaction_target_size),
compaction_period: Some(tenant_conf.compaction_period),
compaction_threshold: Some(tenant_conf.compaction_threshold),
compaction_algorithm: Some(tenant_conf.compaction_algorithm),
gc_horizon: Some(tenant_conf.gc_horizon),
gc_period: Some(tenant_conf.gc_period),
image_creation_threshold: Some(tenant_conf.image_creation_threshold),
@@ -3763,7 +3763,7 @@ pub(crate) mod harness {
let preload = tenant
.preload(&self.remote_storage, CancellationToken::new())
.await?;
tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?;
tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
tenant.state.send_replace(TenantState::Active);
for timeline in tenant.timelines.lock().unwrap().values() {
@@ -3832,8 +3832,10 @@ mod tests {
use crate::DEFAULT_PG_VERSION;
use bytes::BytesMut;
use hex_literal::hex;
use once_cell::sync::Lazy;
use pageserver_api::keyspace::KeySpace;
use rand::{thread_rng, Rng};
use tokio_util::sync::CancellationToken;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -3845,7 +3847,7 @@ mod tests {
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
let writer = tline.writer().await;
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
@@ -3857,7 +3859,7 @@ mod tests {
writer.finish_write(Lsn(0x10));
drop(writer);
let writer = tline.writer().await;
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
@@ -3923,7 +3925,7 @@ mod tests {
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
let writer = tline.writer().await;
let mut writer = tline.writer().await;
#[allow(non_snake_case)]
let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
@@ -3957,7 +3959,7 @@ mod tests {
let newtline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
let new_writer = newtline.writer().await;
let mut new_writer = newtline.writer().await;
new_writer
.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
.await?;
@@ -3989,7 +3991,7 @@ mod tests {
) -> anyhow::Result<()> {
let mut lsn = start_lsn;
{
let writer = tline.writer().await;
let mut writer = tline.writer().await;
// Create a relation on the timeline
writer
.put(
@@ -4014,7 +4016,7 @@ mod tests {
}
tline.freeze_and_flush().await?;
{
let writer = tline.writer().await;
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
@@ -4377,7 +4379,7 @@ mod tests {
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
let writer = tline.writer().await;
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
@@ -4394,7 +4396,7 @@ mod tests {
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
let writer = tline.writer().await;
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
@@ -4411,7 +4413,7 @@ mod tests {
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
let writer = tline.writer().await;
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
@@ -4428,7 +4430,7 @@ mod tests {
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
let writer = tline.writer().await;
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
@@ -4485,7 +4487,7 @@ mod tests {
for _ in 0..repeat {
for _ in 0..key_count {
test_key.field6 = blknum;
let writer = timeline.writer().await;
let mut writer = timeline.writer().await;
writer
.put(
test_key,
@@ -4656,7 +4658,7 @@ mod tests {
for blknum in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
test_key.field6 = blknum as u32;
let writer = tline.writer().await;
let mut writer = tline.writer().await;
writer
.put(
test_key,
@@ -4677,7 +4679,7 @@ mod tests {
lsn = Lsn(lsn.0 + 0x10);
let blknum = thread_rng().gen_range(0..NUM_KEYS);
test_key.field6 = blknum as u32;
let writer = tline.writer().await;
let mut writer = tline.writer().await;
writer
.put(
test_key,
@@ -4745,7 +4747,7 @@ mod tests {
for blknum in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
test_key.field6 = blknum as u32;
let writer = tline.writer().await;
let mut writer = tline.writer().await;
writer
.put(
test_key,
@@ -4774,7 +4776,7 @@ mod tests {
lsn = Lsn(lsn.0 + 0x10);
let blknum = thread_rng().gen_range(0..NUM_KEYS);
test_key.field6 = blknum as u32;
let writer = tline.writer().await;
let mut writer = tline.writer().await;
writer
.put(
test_key,
@@ -4851,7 +4853,7 @@ mod tests {
lsn = Lsn(lsn.0 + 0x10);
let blknum = thread_rng().gen_range(0..NUM_KEYS);
test_key.field6 = blknum as u32;
let writer = tline.writer().await;
let mut writer = tline.writer().await;
writer
.put(
test_key,

View File

@@ -5,7 +5,7 @@
use super::ephemeral_file::EphemeralFile;
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::context::RequestContext;
use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
use crate::virtual_file::VirtualFile;
use bytes::Bytes;
use std::ops::Deref;
@@ -78,7 +78,7 @@ impl<'a> Deref for BlockLease<'a> {
///
/// Unlike a trait, this also allows the read function to be async.
pub(crate) enum BlockReaderRef<'a> {
FileBlockReader(&'a FileBlockReader<'a>),
FileBlockReader(&'a FileBlockReader),
EphemeralFile(&'a EphemeralFile),
Adapter(Adapter<&'a DeltaLayerInner>),
#[cfg(test)]
@@ -160,15 +160,17 @@ impl<'a> BlockCursor<'a> {
///
/// The file is assumed to be immutable. This doesn't provide any functions
/// for modifying the file, nor for invalidating the cache if it is modified.
pub struct FileBlockReader<'a> {
pub file: &'a VirtualFile,
pub struct FileBlockReader {
pub file: VirtualFile,
/// Unique ID of this file, used as key in the page cache.
file_id: page_cache::FileId,
}
impl<'a> FileBlockReader<'a> {
pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
impl FileBlockReader {
pub fn new(file: VirtualFile) -> Self {
let file_id = page_cache::next_file_id();
FileBlockReader { file_id, file }
}
@@ -188,11 +190,11 @@ impl<'a> FileBlockReader<'a> {
/// Returns a "lease" object that can be used to
/// access to the contents of the page. (For the page cache, the
/// lease object represents a lock on the buffer.)
pub async fn read_blk<'b>(
pub async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease<'b>, std::io::Error> {
) -> Result<BlockLease, std::io::Error> {
let cache = page_cache::get();
match cache
.read_immutable_buf(self.file_id, blknum, ctx)
@@ -213,7 +215,7 @@ impl<'a> FileBlockReader<'a> {
}
}
impl BlockReader for FileBlockReader<'_> {
impl BlockReader for FileBlockReader {
fn block_cursor(&self) -> BlockCursor<'_> {
BlockCursor::new(BlockReaderRef::FileBlockReader(self))
}
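A sketch of the owned-reader shape this hunk moves toward: the reader owns its file and mints its own cache key at construction, so callers no longer thread lifetimes or FileIds through. std::fs::File and a plain atomic counter stand in for VirtualFile and the page-cache ID allocator.

use std::fs::File;
use std::sync::atomic::{AtomicU64, Ordering};

static NEXT_FILE_ID: AtomicU64 = AtomicU64::new(1);

// Mint a process-unique id at construction, mirroring a next_file_id() allocator.
fn next_file_id() -> u64 {
    NEXT_FILE_ID.fetch_add(1, Ordering::Relaxed)
}

pub struct FileBlockReader {
    pub file: File,
    // Unique ID of this file, used as key in the page cache.
    file_id: u64,
}

impl FileBlockReader {
    pub fn new(file: File) -> Self {
        let file_id = next_file_id();
        FileBlockReader { file, file_id }
    }
}

fn main() -> std::io::Result<()> {
    let path = std::env::temp_dir().join("fbr-example");
    let reader = FileBlockReader::new(File::create(&path)?);
    println!("cache key for this reader: {}", reader.file_id);
    Ok(())
}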

View File

@@ -9,7 +9,6 @@
//! may lead to a data loss.
//!
use anyhow::bail;
use pageserver_api::models::CompactionAlgorithm;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, ThrottleConfig};
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
@@ -21,7 +20,6 @@ use std::time::Duration;
use utils::generation::Generation;
pub mod defaults {
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
// would be more appropriate. But a low value forces the code to be exercised more,
// which is good for now to trigger bugs.
@@ -29,17 +27,12 @@ pub mod defaults {
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
// FIXME: the configs below are only used by the legacy algorithm. The new algorithm
// has different parameters.
// Target file size, when creating image and delta layers.
// This parameter determines L1 layer file size.
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
super::CompactionAlgorithm::Legacy;
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
@@ -52,10 +45,7 @@ pub mod defaults {
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
// The default limit on WAL lag should be set to avoid causing disconnects under high throughput
// scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
// throughputs up to 1GiB/s per timeline.
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
@@ -315,7 +305,6 @@ pub struct TenantConf {
pub compaction_period: Duration,
// Level0 delta layer threshold for compaction.
pub compaction_threshold: usize,
pub compaction_algorithm: CompactionAlgorithm,
// Determines how much history is retained, to allow
// branching and read replicas at an older point in time.
// The unit is #of bytes of WAL.
@@ -388,10 +377,6 @@ pub struct TenantConfOpt {
#[serde(default)]
pub compaction_threshold: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub compaction_algorithm: Option<CompactionAlgorithm>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub gc_horizon: Option<u64>,
@@ -472,9 +457,6 @@ impl TenantConfOpt {
compaction_threshold: self
.compaction_threshold
.unwrap_or(global_conf.compaction_threshold),
compaction_algorithm: self
.compaction_algorithm
.unwrap_or(global_conf.compaction_algorithm),
gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
image_creation_threshold: self
@@ -521,7 +503,6 @@ impl Default for TenantConf {
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
.expect("cannot parse default compaction period"),
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
gc_horizon: DEFAULT_GC_HORIZON,
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
.expect("cannot parse default gc period"),
@@ -599,7 +580,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
Self {
checkpoint_distance: value.checkpoint_distance,
checkpoint_timeout: value.checkpoint_timeout.map(humantime),
compaction_algorithm: value.compaction_algorithm,
compaction_target_size: value.compaction_target_size,
compaction_period: value.compaction_period.map(humantime),
compaction_threshold: value.compaction_threshold,

View File

@@ -420,7 +420,7 @@ impl DeleteTenantFlow {
.expect("cant be stopping or broken");
tenant
.attach(preload, super::SpawnMode::Eager, ctx)
.attach(preload, super::SpawnMode::Normal, ctx)
.await
.context("attach")?;

View File

@@ -21,6 +21,7 @@
use byteorder::{ReadBytesExt, BE};
use bytes::{BufMut, Bytes, BytesMut};
use either::Either;
use hex;
use std::{cmp::Ordering, io, result};
use thiserror::Error;
use tracing::error;
@@ -699,6 +700,8 @@ impl<const L: usize> BuildNode<L> {
#[cfg(test)]
pub(crate) mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
use rand::Rng;
use std::collections::BTreeMap;

View File

@@ -300,7 +300,7 @@ mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::BlockReaderRef;
use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
use rand::{thread_rng, RngCore};
use std::fs;
use std::str::FromStr;

View File

@@ -595,7 +595,7 @@ pub async fn init_tenant_mgr(
shard_identity,
Some(init_order.clone()),
&TENANTS,
SpawnMode::Lazy,
SpawnMode::Normal,
&ctx,
) {
Ok(tenant) => {
@@ -1106,9 +1106,9 @@ impl TenantManager {
// Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
// the caller thinks they're creating but the tenant already existed. We must switch to
// Eager mode so that when starting this Tenant we properly probe remote storage for timelines,
// Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
// rather than assuming it to be empty.
spawn_mode = SpawnMode::Eager;
spawn_mode = SpawnMode::Normal;
}
Some(TenantSlot::Secondary(state)) => {
info!("Shutting down secondary tenant");
@@ -1300,7 +1300,7 @@ impl TenantManager {
shard_identity,
None,
self.tenants,
SpawnMode::Eager,
SpawnMode::Normal,
ctx,
)?;
@@ -1521,7 +1521,7 @@ impl TenantManager {
*child_shard,
child_location_conf,
None,
SpawnMode::Eager,
SpawnMode::Normal,
ctx,
)
.await?;
@@ -2064,7 +2064,7 @@ pub(crate) async fn load_tenant(
shard_identity,
None,
&TENANTS,
SpawnMode::Eager,
SpawnMode::Normal,
ctx,
)
.with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
@@ -2648,7 +2648,7 @@ pub(crate) async fn immediate_gc(
let tenant = guard
.get(&tenant_shard_id)
.cloned()
.map(Arc::clone)
.with_context(|| format!("tenant {tenant_shard_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?;
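As a side note on the .cloned() versus .map(Arc::clone) hunk above: both produce an owned Arc from an Option<&Arc<T>>; the second spells out that only the reference count is bumped. A tiny demonstration:

use std::sync::Arc;

fn main() {
    let v: Arc<Vec<u8>> = Arc::new(vec![1, 2, 3]);
    let opt: Option<&Arc<Vec<u8>>> = Some(&v);

    let a: Option<Arc<Vec<u8>>> = opt.cloned();        // clones the Arc handle
    let b: Option<Arc<Vec<u8>>> = opt.map(Arc::clone); // identical, but explicit

    assert_eq!(Arc::strong_count(&v), 3); // the original plus the two clones
    assert_eq!(a.unwrap(), b.unwrap());
}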

View File

@@ -1791,12 +1791,14 @@ mod tests {
context::RequestContext,
tenant::{
harness::{TenantHarness, TIMELINE_ID},
Tenant, Timeline,
storage_layer::Layer,
Generation, Tenant, Timeline,
},
DEFAULT_PG_VERSION,
};
use std::collections::HashSet;
use utils::lsn::Lsn;
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
format!("contents for {name}").into()

View File

@@ -161,7 +161,7 @@ pub async fn download_layer_file<'a>(
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
pub fn is_temp_download_file(path: &Utf8Path) -> bool {
let extension = path.extension();
match extension {
Some(TEMP_DOWNLOAD_EXTENSION) => true,

View File

@@ -32,7 +32,7 @@ use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
use tracing::instrument;
use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate};
use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate};
enum DownloadCommand {
Download(TenantShardId),
@@ -121,10 +121,6 @@ impl SecondaryTenant {
})
}
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
self.tenant_shard_id
}
pub(crate) async fn shutdown(&self) {
self.cancel.cancel();
@@ -168,17 +164,16 @@ impl SecondaryTenant {
self.detail.lock().unwrap().get_layers_for_eviction(self)
}
/// Cancellation safe, but on cancellation the eviction will still go through
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
pub(crate) async fn evict_layer(
self: &Arc<Self>,
&self,
conf: &PageServerConf,
timeline_id: TimelineId,
name: LayerFileName,
) {
debug_assert_current_span_has_tenant_id();
let guard = match self.gate.enter() {
let _guard = match self.gate.enter() {
Ok(g) => g,
Err(_) => {
tracing::debug!("Dropping layer evictions, secondary tenant shutting down",);
@@ -192,57 +187,35 @@ impl SecondaryTenant {
.timeline_path(&self.tenant_shard_id, &timeline_id)
.join(name.file_name());
let this = self.clone();
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
tokio::fs::remove_file(path)
.await
.or_else(fs_ext::ignore_not_found)
.fatal_err("Deleting layer during eviction");
// spawn it to be cancellation safe
tokio::task::spawn_blocking(move || {
let _guard = guard;
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
let deleted = std::fs::remove_file(path);
let not_found = deleted
.as_ref()
.is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
let deleted = if not_found {
false
} else {
deleted
.map(|()| true)
.fatal_err("Deleting layer during eviction")
};
if !deleted {
// skip updating accounting and putting perhaps later timestamp
return;
}
// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
// - If downloader is racing with us to remove a file (e.g. because it is
// removed from heatmap), then our mutual .remove() operations will both
// succeed.
// - If downloader is racing with us to download the object (this would require
// multiple eviction iterations to race with multiple download iterations), then
// if we remove it from the state, the worst that happens is the downloader
// downloads it again before re-inserting, or we delete the file but it remains
// in the state map (in which case it will be downloaded if this secondary
// tenant transitions to attached and tries to access it)
//
// The important assumption here is that the secondary timeline state does not
// have to 100% match what is on disk, because it's a best-effort warming
// of the cache.
let mut detail = this.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
}
})
.await
.expect("secondary eviction should not have panicked");
// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
// - If downloader is racing with us to remove a file (e.g. because it is
// removed from heatmap), then our mutual .remove() operations will both
// succeed.
// - If downloader is racing with us to download the object (this would require
// multiple eviction iterations to race with multiple download iterations), then
// if we remove it from the state, the worst that happens is the downloader
// downloads it again before re-inserting, or we delete the file but it remains
// in the state map (in which case it will be downloaded if this secondary
// tenant transitions to attached and tries to access it)
//
// The important assumption here is that the secondary timeline state does not
// have to 100% match what is on disk, because it's a best-effort warming
// of the cache.
let mut detail = self.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
}
}
}
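A minimal sketch of the ENOENT-tolerant delete used during eviction, with a hand-rolled stand-in for fs_ext::ignore_not_found: a racing downloader may already have removed the layer, so NotFound is treated as success.

use std::io;

// A racing downloader may have removed the file already; treat NotFound as success.
fn ignore_not_found(e: io::Error) -> io::Result<()> {
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
    } else {
        Err(e)
    }
}

#[tokio::main]
async fn main() -> io::Result<()> {
    tokio::fs::remove_file("layer-that-may-already-be-gone")
        .await
        .or_else(ignore_not_found)
}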

View File

@@ -16,8 +16,7 @@ use crate::{
config::SecondaryLocationConfig,
debug_assert_current_span_has_tenant_and_timeline_id,
remote_timeline_client::{
index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
},
span::debug_assert_current_span_has_tenant_id,
storage_layer::LayerFileName,
@@ -789,7 +788,7 @@ async fn init_timeline_state(
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
continue;
} else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
} else if crate::is_temporary(&file_path) {
// Temporary files are frequently left behind from restarting during downloads
tracing::info!("Cleaning up temporary file {file_path}");
if let Err(e) = tokio::fs::remove_file(&file_path)

Some files were not shown because too many files have changed in this diff.