Compare commits


4 Commits

Author               SHA1        Message                                                        Date
Konstantin Knizhnik  2831a17bfa  Limit number of AUX files deltas to reduce reconstruct time   2024-02-22 08:57:09 +02:00
Konstantin Knizhnik  55574da76d  Bump Postgres version                                          2024-02-21 21:32:14 +02:00
Konstantin Knizhnik  997093b7cd  Bu,p postgres versions                                         2024-02-21 21:30:46 +02:00
Konstantin Knizhnik  98f51df0b1  Flush logical messages with snapshots and replication origin  2024-02-21 21:30:46 +02:00
280 changed files with 4654 additions and 13444 deletions

View File

@@ -16,9 +16,9 @@ assignees: ''
## Implementation ideas
## Tasks
```[tasklist]
- [ ] Example Task
### Tasks
```

View File

@@ -39,7 +39,7 @@ runs:
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
if [ "${PR_NUMBER}" != "null" ]; then
BRANCH_OR_PR=pr-${PR_NUMBER}
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
# Shortcut for special branches
BRANCH_OR_PR=${GITHUB_REF_NAME}
else

View File

@@ -19,7 +19,7 @@ runs:
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
if [ "${PR_NUMBER}" != "null" ]; then
BRANCH_OR_PR=pr-${PR_NUMBER}
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
# Shortcut for special branches
BRANCH_OR_PR=${GITHUB_REF_NAME}
else

View File

@@ -16,14 +16,8 @@ concurrency:
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name}}
actionlint:
needs: [ check-permissions ]
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

View File

@@ -62,7 +62,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -214,7 +214,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
# Increase timeout to 8h, default timeout is 6h
@@ -362,7 +362,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -461,7 +461,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -558,7 +558,7 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:

View File

@@ -1,105 +0,0 @@
name: Build build-tools image
on:
workflow_call:
inputs:
image-tag:
description: "build-tools image tag"
required: true
type: string
outputs:
image-tag:
description: "build-tools tag"
value: ${{ inputs.image-tag }}
image:
description: "build-tools image"
value: neondatabase/build-tools:${{ inputs.image-tag }}
defaults:
run:
shell: bash -euo pipefail {0}
concurrency:
group: build-build-tools-image-${{ inputs.image-tag }}
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
check-image:
uses: ./.github/workflows/check-build-tools-image.yml
# This job uses an older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (required by newer action versions)
build-image:
needs: [ check-image ]
if: needs.check-image.outputs.found == 'false'
strategy:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
env:
IMAGE_TAG: ${{ inputs.image-tag }}
steps:
- name: Check `input.tag` is correct
env:
INPUTS_IMAGE_TAG: ${{ inputs.image-tag }}
CHECK_IMAGE_TAG : ${{ needs.check-image.outputs.image-tag }}
run: |
if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then
echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the tag of the latest build-tools image 'inputs.image-tag' (${CHECK_IMAGE_TAG})"
exit 1
fi
- uses: actions/checkout@v3
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p /tmp/.docker-custom
echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2
- uses: docker/login-action@v2
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/build-push-action@v4
with:
context: .
provenance: false
push: true
pull: true
file: Dockerfile.build-tools
cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }}
cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
- name: Remove custom docker config directory
run: |
rm -rf /tmp/.docker-custom
merge-images:
needs: [ build-image ]
runs-on: ubuntu-latest
env:
IMAGE_TAG: ${{ inputs.image-tag }}
steps:
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Create multi-arch image
run: |
docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \
neondatabase/build-tools:${IMAGE_TAG}-x64 \
neondatabase/build-tools:${IMAGE_TAG}-arm64

View File

@@ -0,0 +1,124 @@
name: Build and Push Docker Image
on:
workflow_call:
inputs:
dockerfile-path:
required: true
type: string
image-name:
required: true
type: string
outputs:
build-tools-tag:
description: "tag generated for build tools"
value: ${{ jobs.tag.outputs.build-tools-tag }}
jobs:
check-if-build-tools-dockerfile-changed:
runs-on: ubuntu-latest
outputs:
docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
steps:
- name: Check if Dockerfile.buildtools has changed
id: dockerfile
run: |
if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
echo "docker_file_changed=false" >> $GITHUB_OUTPUT
exit
fi
updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
echo "docker_file_changed=true" >> $GITHUB_OUTPUT
fi
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
tag:
runs-on: ubuntu-latest
needs: [ check-if-build-tools-dockerfile-changed ]
outputs:
build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
steps:
- name: Get buildtools tag
env:
DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
run: |
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
IMAGE_TAG=$GITHUB_RUN_ID
else
IMAGE_TAG=pinned
fi
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
shell: bash
id: buildtools-tag
kaniko:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
needs: [ tag, check-if-build-tools-dockerfile-changed ]
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
kaniko-arm:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
needs: [ tag, check-if-build-tools-dockerfile-changed ]
runs-on: [ self-hosted, dev, arm64 ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: |
/kaniko/executor \
--reproducible \
--snapshotMode=redo \
--skip-unused-stages \
--dockerfile ${{ inputs.dockerfile-path }} \
--cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
manifest:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
name: 'manifest'
runs-on: [ self-hosted, dev, x64 ]
needs:
- tag
- kaniko
- kaniko-arm
- check-if-build-tools-dockerfile-changed
steps:
- name: Create manifest
run: |
docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
- name: Push manifest
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
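As an aside (an illustrative check only, not part of the workflow): once the manifest job above has run, the pushed manifest list should reference both the `-amd64` and `-arm64` images built by the kaniko jobs. Assuming you are authenticated to the ECR registry and using a placeholder run-ID tag, the result could be inspected like this:

```bash
# Illustrative only: the tag is a placeholder GITHUB_RUN_ID value.
# Each entry in the manifest list carries a platform architecture field.
docker manifest inspect \
  369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:7914243019 \
  | grep '"architecture"'
```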

View File

@@ -5,7 +5,6 @@ on:
branches:
- main
- release
- release-proxy
pull_request:
defaults:
@@ -28,9 +27,24 @@ env:
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name}}
runs-on: ubuntu-latest
steps:
- name: Disallow PRs from forks
if: |
github.event_name == 'pull_request' &&
github.event.pull_request.head.repo.full_name != github.repository
run: |
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
else
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
fi
echo >&2 "We don't run CI for PRs from forks"
echo >&2 "${MESSAGE}"
exit 1
cancel-previous-e2e-tests:
needs: [ check-permissions ]
@@ -68,8 +82,6 @@ jobs:
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
@@ -77,25 +89,19 @@ jobs:
shell: bash
id: build-tag
check-build-tools-image:
build-buildtools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/check-build-tools-image.yml
build-build-tools-image:
needs: [ check-build-tools-image ]
uses: ./.github/workflows/build-build-tools-image.yml
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
dockerfile-path: Dockerfile.buildtools
image-name: build-tools
secrets: inherit
check-codestyle-python:
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
@@ -124,13 +130,10 @@ jobs:
run: poetry run mypy .
check-codestyle-rust:
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
@@ -194,13 +197,10 @@ jobs:
run: cargo deny check --hide-inclusion-graph
build-neon:
needs: [ check-permissions, tag, build-build-tools-image ]
needs: [ check-permissions, tag, build-buildtools-image ]
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# Raise locked memory limit for tokio-epoll-uring.
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
# io_uring will account the memory of the CQ and SQ as locked.
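As a side note (an illustrative sketch, not part of the workflow): the same shared-memory and locked-memory limits used by these jobs can be reproduced in a plain local `docker run`, which helps when debugging tokio-epoll-uring issues outside CI. The image name below is just the build-tools image referenced elsewhere in this workflow.

```bash
# Illustrative only: mirrors the container `options:` used by the CI jobs so
# that io_uring can register its SQ/CQ memory as locked.
docker run --rm --init \
  --shm-size=512mb \
  --ulimit memlock=67108864:67108864 \
  369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned \
  bash -c 'ulimit -l'   # prints the locked-memory limit visible inside the container
```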
@@ -438,13 +438,10 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
strategy:
@@ -488,13 +485,10 @@ jobs:
get-benchmarks-durations:
outputs:
json: ${{ steps.get-benchmark-durations.outputs.json }}
needs: [ check-permissions, build-build-tools-image ]
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
steps:
@@ -521,13 +515,10 @@ jobs:
echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
benchmarks:
needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -559,15 +550,12 @@ jobs:
# while coverage is currently collected for the debug ones
create-test-report:
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
@@ -608,13 +596,10 @@ jobs:
})
coverage-report:
needs: [ check-permissions, regress-tests, build-build-tools-image ]
needs: [ check-permissions, regress-tests, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
strategy:
fail-fast: false
@@ -712,146 +697,166 @@ jobs:
})
trigger-e2e-tests:
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }}
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }}
needs: [ check-permissions, promote-images, tag ]
uses: ./.github/workflows/trigger-e2e-tests.yml
secrets: inherit
neon-image:
needs: [ check-permissions, build-build-tools-image, tag ]
needs: [ check-permissions, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v1 # v3 won't work with kaniko
with:
submodules: true
fetch-depth: 0
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
- name: Configure ECR and Docker Hub login
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
echo "::add-mask::${DOCKERHUB_AUTH}"
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
cat <<-EOF > /kaniko/.docker/config.json
{
"auths": {
"https://index.docker.io/v1/": {
"auth": "${DOCKERHUB_AUTH}"
}
},
"credHelpers": {
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
}
}
EOF
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Kaniko build neon
run:
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
--build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
- uses: docker/build-push-action@v5
with:
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
provenance: false
push: true
pull: true
file: Dockerfile
cache-from: type=registry,ref=neondatabase/neon:cache
cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
tags: |
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
neondatabase/neon:${{needs.tag.outputs.build-tag}}
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
- name: Remove custom docker config directory
if: always()
compute-tools-image:
runs-on: [ self-hosted, gen3, large ]
needs: [ check-permissions, build-buildtools-image, tag ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
- name: Configure ECR and Docker Hub login
run: |
rm -rf .docker-custom
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
echo "::add-mask::${DOCKERHUB_AUTH}"
cat <<-EOF > /kaniko/.docker/config.json
{
"auths": {
"https://index.docker.io/v1/": {
"auth": "${DOCKERHUB_AUTH}"
}
},
"credHelpers": {
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
}
}
EOF
- name: Kaniko build compute tools
run:
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-tools
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
--destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
compute-node-image:
needs: [ check-permissions, build-build-tools-image, tag ]
needs: [ check-permissions, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: gcr.io/kaniko-project/executor:v1.9.2-debug
# Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
# Should be prevented by https://github.com/neondatabase/neon/issues/4281
options: --add-host=download.osgeo.org:140.211.15.30
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16 ]
defaults:
run:
shell: sh -eu {0}
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v1 # v3 won't work with kaniko
with:
submodules: true
fetch-depth: 0
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
- name: Configure ECR and Docker Hub login
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
with:
# Disable parallelism for docker buildkit.
# As we already build everything with `make -j$(nproc)`, running it at an additional level of parallelism blows up the Runner.
config-inline: |
[worker.oci]
max-parallelism = 1
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
echo "::add-mask::${DOCKERHUB_AUTH}"
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
cat <<-EOF > /kaniko/.docker/config.json
{
"auths": {
"https://index.docker.io/v1/": {
"auth": "${DOCKERHUB_AUTH}"
}
},
"credHelpers": {
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
}
}
EOF
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Kaniko build compute node with extensions
run:
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg PG_VERSION=${{ matrix.version }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-node
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--cleanup
- name: Build compute-node image
uses: docker/build-push-action@v5
with:
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
PG_VERSION=${{ matrix.version }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
provenance: false
push: true
pull: true
file: Dockerfile.compute-node
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
tags: |
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Build compute-tools image
# compute-tools are Postgres independent, so build it only once
if: ${{ matrix.version == 'v16' }}
uses: docker/build-push-action@v5
with:
target: compute-tools-image
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
provenance: false
push: true
pull: true
file: Dockerfile.compute-node
tags: |
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
@@ -895,7 +900,7 @@ jobs:
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ check-permissions, tag, neon-image, compute-node-image ]
needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
runs-on: [ self-hosted, gen3, small ]
steps:
@@ -929,8 +934,7 @@ jobs:
fi
- name: Verify docker-compose example
timeout-minutes: 20
run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
- name: Print logs and clean up
if: always()
@@ -963,7 +967,9 @@ jobs:
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
- name: Add latest tag to images
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -975,7 +981,9 @@ jobs:
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
- name: Push images to production ECR
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
@@ -999,7 +1007,9 @@ jobs:
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
- name: Push latest tags to Docker Hub
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -1089,7 +1099,7 @@ jobs:
deploy:
needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -1124,26 +1134,14 @@ jobs:
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging supports different compute tag prefixes for different regions
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
- name: Create git tag
if: github.ref_name == 'release' || github.ref_name == 'release-proxy'
if: github.ref_name == 'release'
uses: actions/github-script@v7
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
@@ -1156,7 +1154,6 @@ jobs:
sha: context.sha,
})
# TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
- name: Create GitHub release
if: github.ref_name == 'release'
uses: actions/github-script@v7
@@ -1208,11 +1205,3 @@ jobs:
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
done
pin-build-tools-image:
needs: [ build-build-tools-image, promote-images, regress-tests ]
if: github.ref_name == 'main'
uses: ./.github/workflows/pin-build-tools-image.yml
with:
from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
secrets: inherit

View File

@@ -1,58 +0,0 @@
name: Check build-tools image
on:
workflow_call:
outputs:
image-tag:
description: "build-tools image tag"
value: ${{ jobs.check-image.outputs.tag }}
found:
description: "Whether the image is found in the registry"
value: ${{ jobs.check-image.outputs.found }}
defaults:
run:
shell: bash -euo pipefail {0}
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
check-image:
runs-on: ubuntu-latest
outputs:
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
found: ${{ steps.check-image.outputs.found }}
steps:
- name: Get build-tools image tag for the current commit
id: get-build-tools-tag
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
LAST_BUILD_TOOLS_SHA=$(
gh api \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
--method GET \
--field path=Dockerfile.build-tools \
--field sha=${COMMIT_SHA} \
--field per_page=1 \
--jq ".[0].sha" \
"/repos/${GITHUB_REPOSITORY}/commits"
)
echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT
- name: Check if such tag found in the registry
id: check-image
env:
IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}
run: |
if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then
found=true
else
found=false
fi
echo "found=${found}" | tee -a $GITHUB_OUTPUT

View File

@@ -1,36 +0,0 @@
name: Check Permissions
on:
workflow_call:
inputs:
github-event-name:
required: true
type: string
defaults:
run:
shell: bash -euo pipefail {0}
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
check-permissions:
runs-on: ubuntu-latest
steps:
- name: Disallow CI runs on PRs from forks
if: |
inputs.github-event-name == 'pull_request' &&
github.event.pull_request.head.repo.full_name != github.repository
run: |
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
else
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
fi
# TODO: use actions/github-script to post this message as a PR comment
echo >&2 "We don't run CI for PRs from forks"
echo >&2 "${MESSAGE}"
exit 1

View File

@@ -1,32 +0,0 @@
# A workflow from
# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries
name: cleanup caches by a branch
on:
pull_request:
types:
- closed
jobs:
cleanup:
runs-on: ubuntu-latest
steps:
- name: Cleanup
run: |
gh extension install actions/gh-actions-cache
echo "Fetching list of cache key"
cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 )
## Setting this to not fail the workflow while deleting cache keys.
set +e
echo "Deleting caches..."
for cacheKey in $cacheKeysForPR
do
gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm
done
echo "Done"
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge

View File

@@ -20,25 +20,7 @@ env:
COPT: '-Werror'
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name}}
check-build-tools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/check-build-tools-image.yml
build-build-tools-image:
needs: [ check-build-tools-image ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
secrets: inherit
check-macos-build:
needs: [ check-permissions ]
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
@@ -134,8 +116,8 @@ jobs:
run: ./run_clippy.sh
check-linux-arm-build:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
runs-on: [ self-hosted, dev, arm64 ]
env:
@@ -148,10 +130,7 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init
steps:
@@ -258,15 +237,12 @@ jobs:
cargo nextest run --package remote_storage --test test_real_azure
check-codestyle-rust-arm:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
runs-on: [ self-hosted, dev, arm64 ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -333,17 +309,13 @@ jobs:
run: cargo deny check
gather-rust-build-stats:
needs: [ check-permissions, build-build-tools-image ]
if: |
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
env:

View File

@@ -1,72 +0,0 @@
name: 'Pin build-tools image'
on:
workflow_dispatch:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
workflow_call:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
defaults:
run:
shell: bash -euo pipefail {0}
concurrency:
group: pin-build-tools-image-${{ inputs.from-tag }}
permissions: {}
jobs:
tag-image:
runs-on: ubuntu-latest
env:
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: pinned
steps:
- name: Check if we really need to pin the image
id: check-manifests
run: |
docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json
docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json
if diff ${FROM_TAG}.json ${TO_TAG}.json; then
skip=true
else
skip=false
fi
echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
- uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
if: steps.check-manifests.outputs.skip == 'false'
run: |
docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}
- uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
if: steps.check-manifests.outputs.skip == 'false'
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}

View File

@@ -2,31 +2,12 @@ name: Create Release Branch
on:
schedule:
# It should be kept in sync with if-condition in jobs
- cron: '0 6 * * MON' # Storage release
- cron: '0 6 * * THU' # Proxy release
- cron: '0 6 * * 1'
workflow_dispatch:
inputs:
create-storage-release-branch:
type: boolean
description: 'Create Storage release PR'
required: false
create-proxy-release-branch:
type: boolean
description: 'Create Proxy release PR'
required: false
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
defaults:
run:
shell: bash -euo pipefail {0}
jobs:
create-storage-release-branch:
if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
runs-on: ubuntu-latest
create_release_branch:
runs-on: [ ubuntu-latest ]
permissions:
contents: write # for `git push`
@@ -37,67 +18,27 @@ jobs:
with:
ref: main
- name: Set environment variables
run: |
echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
- name: Get current date
id: date
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
- name: Create release branch
run: git checkout -b $RELEASE_BRANCH
run: git checkout -b releases/${{ steps.date.outputs.date }}
- name: Push new branch
run: git push origin $RELEASE_BRANCH
run: git push origin releases/${{ steps.date.outputs.date }}
- name: Create pull request into release
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
## Release ${RELEASE_DATE}
## Release ${{ steps.date.outputs.date }}
**Please merge this Pull Request using 'Create a merge commit' button**
**Please merge this PR using 'Create a merge commit'!**
EOF
gh pr create --title "Release ${RELEASE_DATE}" \
gh pr create --title "Release ${{ steps.date.outputs.date }}" \
--body-file "body.md" \
--head "${RELEASE_BRANCH}" \
--head "releases/${{ steps.date.outputs.date }}" \
--base "release"
create-proxy-release-branch:
if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
runs-on: ubuntu-latest
permissions:
contents: write # for `git push`
steps:
- name: Check out code
uses: actions/checkout@v4
with:
ref: main
- name: Set environment variables
run: |
echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
- name: Create release branch
run: git checkout -b $RELEASE_BRANCH
- name: Push new branch
run: git push origin $RELEASE_BRANCH
- name: Create pull request into release
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
## Proxy release ${RELEASE_DATE}
**Please merge this Pull Request using 'Create a merge commit' button**
EOF
gh pr create --title "Proxy release ${RELEASE_DATE}" \
--body-file "body.md" \
--head "${RELEASE_BRANCH}" \
--base "release-proxy"

View File

@@ -51,8 +51,6 @@ jobs:
echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')

View File

@@ -0,0 +1,70 @@
name: 'Update build tools image tag'
# This workflow is used to update the tag of the build-tools image in ECR.
# The most common use case is adding/moving the `pinned` tag to the `${GITHUB_RUN_ID}` image.
on:
workflow_dispatch:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
to-tag:
description: 'Destination tag'
required: true
type: string
default: 'pinned'
defaults:
run:
shell: bash -euo pipefail {0}
permissions: {}
jobs:
tag-image:
runs-on: [ self-hosted, gen3, small ]
env:
ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
steps:
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/login-action@v2
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/login-action@v2
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- uses: actions/setup-go@v5
with:
go-version: '1.21'
- name: Install crane
run: |
go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
- name: Copy images
run: |
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom

.gitignore (vendored, 1 changed line)
View File

@@ -9,7 +9,6 @@ test_output/
neon.iml
/.neon
/integration_tests/.neon
compaction-suite-results.*
# Coverage
*.profraw

View File

@@ -74,11 +74,16 @@ We're using the following approach to make it work:
For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
## How do I make build-tools image "pinned"
## How do I add the "pinned" tag to a buildtools image?
We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup; currently, adding the `pinned` tag is a manual operation.
It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow.
You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
or using GitHub CLI:
```bash
gh workflow -R neondatabase/neon run pin-build-tools-image.yml \
-f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e
```
gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-f from-tag=6254913013 \
-f to-tag=pinned
# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
```
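One way to verify the result (a sketch only, assuming `crane` is installed and you are authenticated to ECR; the tag value is the example used above): after the workflow run, the `pinned` tag should resolve to the same digest as the source tag.

```bash
# Illustrative check, not part of the documented procedure.
ECR_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
crane digest "${ECR_IMAGE}:6254913013"
crane digest "${ECR_IMAGE}:pinned"      # should print the same digest
```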

Cargo.lock (generated, 82 changed lines)
View File

@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "ahash"
version = "0.8.9"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
dependencies = [
"cfg-if",
"const-random",
@@ -1389,9 +1389,9 @@ dependencies = [
[[package]]
name = "crc32c"
version = "0.6.5"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e"
dependencies = [
"rustc_version",
]
@@ -2959,9 +2959,9 @@ dependencies = [
[[package]]
name = "mio"
version = "0.8.11"
version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
dependencies = [
"libc",
"log",
@@ -3498,7 +3498,6 @@ dependencies = [
"num_cpus",
"once_cell",
"pageserver_api",
"pageserver_compaction",
"pin-project-lite",
"postgres",
"postgres-protocol",
@@ -3553,7 +3552,6 @@ dependencies = [
"const_format",
"enum-map",
"hex",
"humantime",
"humantime-serde",
"itertools",
"postgres_ffi",
@@ -3589,53 +3587,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pageserver_compaction"
version = "0.1.0"
dependencies = [
"anyhow",
"async-compression",
"async-stream",
"async-trait",
"byteorder",
"bytes",
"chrono",
"clap",
"const_format",
"consumption_metrics",
"criterion",
"crossbeam-utils",
"either",
"fail",
"flate2",
"futures",
"git-version",
"hex",
"hex-literal",
"humantime",
"humantime-serde",
"itertools",
"metrics",
"once_cell",
"pageserver_api",
"pin-project-lite",
"rand 0.8.5",
"smallvec",
"svg_fmt",
"sync_wrapper",
"thiserror",
"tokio",
"tokio-io-timeout",
"tokio-util",
"tracing",
"tracing-error",
"tracing-subscriber",
"url",
"utils",
"walkdir",
"workspace_hack",
]
[[package]]
name = "parking"
version = "2.1.1"
@@ -4216,6 +4167,7 @@ dependencies = [
"thiserror",
"tikv-jemalloc-ctl",
"tikv-jemallocator",
"tls-listener",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
@@ -5794,10 +5746,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.36.0"
name = "tls-listener"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
checksum = "81294c017957a1a69794f506723519255879e15a870507faf45dfed288b763dd"
dependencies = [
"futures-util",
"hyper",
"pin-project-lite",
"thiserror",
"tokio",
"tokio-rustls",
]
[[package]]
name = "tokio"
version = "1.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9"
dependencies = [
"backtrace",
"bytes",

View File

@@ -5,7 +5,6 @@ members = [
"control_plane",
"control_plane/attachment_service",
"pageserver",
"pageserver/compaction",
"pageserver/ctl",
"pageserver/client",
"pageserver/pagebench",
@@ -156,6 +155,7 @@ test-context = "0.1"
thiserror = "1.0"
tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5"
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
@@ -199,7 +199,6 @@ consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }

View File

@@ -786,22 +786,6 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
#########################################################################################
#
# Layer "pg_partman"
# compile pg_partman extension
#
#########################################################################################
FROM build-deps AS pg-partman-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
#########################################################################################
#
@@ -845,7 +829,6 @@ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -891,17 +874,7 @@ ENV BUILD_TAG=$BUILD_TAG
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Final compute-tools image
#
#########################################################################################
FROM debian:bullseye-slim AS compute-tools-image
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#

Dockerfile.compute-tools (new file, 32 changed lines)
View File

@@ -0,0 +1,32 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG TAG=pinned
ARG BUILD_TAG
FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
WORKDIR /home/nonroot
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
ARG RUSTC_WRAPPER=cachepot
ENV AWS_REGION=eu-central-1
ENV CACHEPOT_S3_KEY_PREFIX=cachepot
ARG CACHEPOT_BUCKET=neon-github-dev
#ARG AWS_ACCESS_KEY_ID
#ARG AWS_SECRET_ACCESS_KEY
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
COPY . .
RUN set -e \
&& mold -run cargo build -p compute_tools --locked --release \
&& cachepot -s
# Final image that only has one binary
FROM debian:bullseye-slim
COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl
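For reference, a local build of this Dockerfile might look like the following (an illustrative sketch, not a CI step; the build args shown are the defaults declared above, and the local image tag is made up). Note that cachepot falls back to the local filesystem when the S3 cache is not configured, so the build should still succeed without AWS credentials.

```bash
# Illustrative local build; REPOSITORY/TAG select the build-tools base image,
# and the resulting image contains only the compute_ctl binary.
docker build \
  -f Dockerfile.compute-tools \
  --build-arg REPOSITORY=neondatabase \
  --build-arg TAG=pinned \
  --build-arg BUILD_TAG="$(git rev-parse --short HEAD)" \
  -t compute-tools:local \
  .
```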

View File

@@ -5,7 +5,7 @@
Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
## Quick start
Try the [Neon Free Tier](https://neon.tech/github) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
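For example, a `psql` connection typically looks like the sketch below; the connection string is a placeholder, so substitute the role, host, and database shown in your project's connection details.

```bash
# Placeholder connection string; copy the real one from the Neon console.
psql "postgresql://<role>:<password>@<endpoint-hostname>.neon.tech/<dbname>?sslmode=require"
```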
Alternatively, compile and run the project [locally](#running-local-installation).
@@ -230,12 +230,6 @@ postgres=# select * from t;
> cargo neon stop
```
More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md).
#### Handling build failures
If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again.
## Running tests
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
@@ -265,12 +259,6 @@ You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or th
> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
## Cleanup
For cleaning up the source tree from build artifacts, run `make clean` in the source directory.
For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned!
## Documentation
[docs](/docs) Contains a top-level overview of all available markdown documentation.

View File

@@ -3,10 +3,3 @@ disallowed-methods = [
# Allow this for now, to deny it later once we stop using Handle::block_on completely
# "tokio::runtime::Handle::block_on",
]
disallowed-macros = [
# use std::pin::pin
"futures::pin_mut",
# cannot disallow this, because clippy finds used from tokio macros
#"tokio::pin",
]

View File

@@ -17,8 +17,9 @@ use chrono::{DateTime, Utc};
use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use postgres::error::SqlState;
use postgres::{Client, NoTls};
use tokio;
use tokio_postgres;
use tracing::{debug, error, info, instrument, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -764,26 +765,6 @@ impl ComputeNode {
Ok((pg, logs_handle))
}
/// Do post configuration of the already started Postgres. This function spawns a background thread to
/// configure the database after applying the compute spec. Currently, it upgrades the neon extension
/// version. In the future, it may upgrade all 3rd-party extensions.
#[instrument(skip_all)]
pub fn post_apply_config(&self) -> Result<()> {
let connstr = self.connstr.clone();
thread::spawn(move || {
let func = || {
let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_neon_extension_upgrade(&mut client)
.context("handle_neon_extension_upgrade")?;
Ok::<_, anyhow::Error>(())
};
if let Err(err) = func() {
error!("error while post_apply_config: {err:#}");
}
});
Ok(())
}
/// Do initial configuration of the already started Postgres.
#[instrument(skip_all)]
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
@@ -795,34 +776,27 @@ impl ComputeNode {
// but we can create a new one and grant it all privileges.
let connstr = self.connstr.clone();
let mut client = match Client::connect(connstr.as_str(), NoTls) {
Err(e) => match e.code() {
Some(&SqlState::INVALID_PASSWORD)
| Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => {
// connect with zenith_admin if cloud_admin could not authenticate
info!(
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
e
);
let mut zenith_admin_connstr = connstr.clone();
Err(e) => {
info!(
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
e
);
let mut zenith_admin_connstr = connstr.clone();
zenith_admin_connstr
.set_username("zenith_admin")
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
zenith_admin_connstr
.set_username("zenith_admin")
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
let mut client =
Client::connect(zenith_admin_connstr.as_str(), NoTls)
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
// Disable forwarding so that users don't get a cloud_admin role
client.simple_query("SET neon.forward_ddl = false")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
// Disable forwarding so that users don't get a cloud_admin role
client.simple_query("SET neon.forward_ddl = false")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
// reconnect with connstring with expected name
Client::connect(connstr.as_str(), NoTls)?
}
_ => return Err(e.into()),
},
// reconnect with connstring with expected name
Client::connect(connstr.as_str(), NoTls)?
}
Ok(client) => client,
};
@@ -1018,21 +992,18 @@ impl ComputeNode {
let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
let config_time = Utc::now();
if pspec.spec.mode == ComputeMode::Primary {
if !pspec.spec.skip_pg_catalog_updates {
let pgdata_path = Path::new(&self.pgdata);
// temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are applying config:
// creating new extensions, roles, etc...
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
self.pg_reload_conf()?;
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
let pgdata_path = Path::new(&self.pgdata);
// temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are applying config:
// creating new extensions, roles, etc...
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
self.pg_reload_conf()?;
self.apply_config(&compute_state)?;
self.apply_config(&compute_state)?;
config::compute_ctl_temp_override_remove(pgdata_path)?;
self.pg_reload_conf()?;
}
self.post_apply_config()?;
config::compute_ctl_temp_override_remove(pgdata_path)?;
self.pg_reload_conf()?;
}
let startup_end_time = Utc::now();

View File

@@ -82,12 +82,6 @@ pub fn write_postgres_conf(
ComputeMode::Replica => {
// hot_standby is 'on' by default, but let's be explicit
writeln!(file, "hot_standby=on")?;
// Inform the replica about the primary state
// Default is 'false'
if let Some(primary_is_running) = spec.primary_is_running {
writeln!(file, "neon.primary_is_running={}", primary_is_running)?;
}
}
}

View File

@@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json
}
}
*/
use anyhow::Result;
use anyhow::{self, Result};
use anyhow::{bail, Context};
use bytes::Bytes;
use compute_api::spec::RemoteExtSpec;

View File

@@ -13,6 +13,8 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use num_cpus;
use serde_json;
use tokio::task;
use tracing::{error, info, warn};
use tracing_utils::http::OtelName;

View File

@@ -655,9 +655,6 @@ pub fn handle_grants(
// remove this code if possible. The worst thing that could happen is that
// the user won't be able to use the public schema in NEW databases created in the
// very OLD project.
//
// Also, alter default permissions so that relations created by extensions can be
// used by neon_superuser without permission issues.
let grant_query = "DO $$\n\
BEGIN\n\
IF EXISTS(\n\
@@ -676,15 +673,6 @@ pub fn handle_grants(
GRANT CREATE ON SCHEMA public TO web_access;\n\
END IF;\n\
END IF;\n\
IF EXISTS(\n\
SELECT nspname\n\
FROM pg_catalog.pg_namespace\n\
WHERE nspname = 'public'\n\
)\n\
THEN\n\
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
END IF;\n\
END\n\
$$;"
.to_string();
@@ -744,17 +732,7 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
// - extension was just installed
// - extension was already installed and is up to date
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
client.simple_query(query)?;
Ok(())
}
#[instrument(skip_all)]
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
info!("handle neon extension upgrade");
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
info!("update neon extension schema with query: {}", query);
client.simple_query(query)?;
Ok(())
@@ -799,12 +777,6 @@ BEGIN
END
$$;"#,
"GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
// Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else.
"",
"",
"",
"",
// Add new migrations below.
];
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
@@ -831,13 +803,8 @@ $$;"#,
client.simple_query(query)?;
while current_migration < migrations.len() {
let migration = &migrations[current_migration];
if migration.is_empty() {
info!("Skip migration id={}", current_migration);
} else {
info!("Running migration:\n{}\n", migration);
client.simple_query(migration)?;
}
info!("Running migration:\n{}\n", migrations[current_migration]);
client.simple_query(migrations[current_migration])?;
current_migration += 1;
}
let setval = format!(

View File

@@ -1,26 +0,0 @@
# Control Plane and Neon Local
This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command.
## Example: Start with Postgres 16
To create and start a local development environment with Postgres 16, you will need to provide the `--pg-version` flag to 3 of the start-up commands.
```shell
cargo neon init --pg-version 16
cargo neon start
cargo neon tenant create --set-default --pg-version 16
cargo neon endpoint create main --pg-version 16
cargo neon endpoint start main
```
## Example: Create Test User and Database
By default, `cargo neon` starts an endpoint with the `cloud_admin` role and the `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint.
```shell
cargo neon endpoint create main --pg-version 16 --update-catalog true
cargo neon endpoint start main --create-test-user true
```
The first command creates `neon_superuser` and the necessary roles. The second command creates the `test` user and the `neondb` database. After running the second command, you will see a connection string that connects you as the test user.
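For illustration, that connection string can be used directly with `psql`; the host and port below are placeholders (assumptions), while the `test` user and `neondb` database come from the description above:
```shell
# Hypothetical example; prefer the exact connection string printed by `cargo neon endpoint start`.
psql "postgresql://test@<endpoint-host>:<endpoint-port>/neondb"
```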

View File

@@ -1,2 +0,0 @@
ALTER TABLE tenant_shards ALTER generation SET NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL;

View File

@@ -1,4 +0,0 @@
ALTER TABLE tenant_shards ALTER generation DROP NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL;

View File

@@ -1,9 +0,0 @@
use utils::auth::{AuthError, Claims, Scope};
pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> {
if claims.scope != required_scope {
return Err(AuthError("Scope mismatch. Permission denied".into()));
}
Ok(())
}

View File

@@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
use control_plane::local_env::LocalEnv;
use hyper::{Method, StatusCode};
use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId};
use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
@@ -19,66 +19,8 @@ const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
pub(crate) const API_CONCURRENCY: usize = 32;
struct ShardedComputeHookTenant {
stripe_size: ShardStripeSize,
shard_count: ShardCount,
shards: Vec<(ShardNumber, NodeId)>,
}
enum ComputeHookTenant {
Unsharded(NodeId),
Sharded(ShardedComputeHookTenant),
}
impl ComputeHookTenant {
/// Construct with at least one shard's information
fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self {
if tenant_shard_id.shard_count.count() > 1 {
Self::Sharded(ShardedComputeHookTenant {
shards: vec![(tenant_shard_id.shard_number, node_id)],
stripe_size,
shard_count: tenant_shard_id.shard_count,
})
} else {
Self::Unsharded(node_id)
}
}
/// Set one shard's location. If stripe size or shard count have changed, Self is reset
/// and drops existing content.
fn update(
&mut self,
tenant_shard_id: TenantShardId,
stripe_size: ShardStripeSize,
node_id: NodeId,
) {
match self {
Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
*existing_node_id = node_id
}
Self::Sharded(sharded_tenant)
if sharded_tenant.stripe_size == stripe_size
&& sharded_tenant.shard_count == tenant_shard_id.shard_count =>
{
if let Some(existing) = sharded_tenant
.shards
.iter()
.position(|s| s.0 == tenant_shard_id.shard_number)
{
sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id;
} else {
sharded_tenant
.shards
.push((tenant_shard_id.shard_number, node_id));
sharded_tenant.shards.sort_by_key(|s| s.0)
}
}
_ => {
// Shard count changed: reset struct.
*self = Self::new(tenant_shard_id, stripe_size, node_id);
}
}
}
pub(super) struct ComputeHookTenant {
shards: Vec<(ShardIndex, NodeId)>,
}
#[derive(Serialize, Deserialize, Debug)]
@@ -91,7 +33,6 @@ struct ComputeHookNotifyRequestShard {
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequest {
tenant_id: TenantId,
stripe_size: Option<ShardStripeSize>,
shards: Vec<ComputeHookNotifyRequestShard>,
}
@@ -122,43 +63,42 @@ pub(crate) enum NotifyError {
}
impl ComputeHookTenant {
fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
match self {
Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
tenant_id,
shards: vec![ComputeHookNotifyRequestShard {
shard_number: ShardNumber(0),
node_id: *node_id,
}],
stripe_size: None,
}),
Self::Sharded(sharded_tenant)
if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize =>
{
Some(ComputeHookNotifyRequest {
tenant_id,
shards: sharded_tenant
.shards
.iter()
.map(|(shard_number, node_id)| ComputeHookNotifyRequestShard {
shard_number: *shard_number,
node_id: *node_id,
})
.collect(),
stripe_size: Some(sharded_tenant.stripe_size),
})
}
Self::Sharded(sharded_tenant) => {
// Sharded tenant doesn't yet have information for all its shards
async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
// Find the highest shard count and drop any shards that aren't
// for that shard count.
let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max();
let Some(shard_count) = shard_count else {
// No shards, nothing to do.
tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards");
return None;
};
tracing::info!(
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
sharded_tenant.shards.len(),
sharded_tenant.shard_count.count()
);
None
}
self.shards.retain(|(k, _v)| k.shard_count == shard_count);
self.shards
.sort_by_key(|(shard, _node_id)| shard.shard_number);
if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() {
// We have pageservers for all the shards: emit a configuration update
return Some(ComputeHookNotifyRequest {
tenant_id,
shards: self
.shards
.iter()
.map(|(shard, node_id)| ComputeHookNotifyRequestShard {
shard_number: shard.shard_number,
node_id: *node_id,
})
.collect(),
});
} else {
tracing::info!(
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
self.shards.len(),
shard_count.count()
);
}
None
}
}
@@ -199,11 +139,7 @@ impl ComputeHook {
};
let cplane =
ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane");
let ComputeHookNotifyRequest {
tenant_id,
shards,
stripe_size,
} = reconfigure_request;
let ComputeHookNotifyRequest { tenant_id, shards } = reconfigure_request;
let compute_pageservers = shards
.into_iter()
@@ -220,9 +156,7 @@ impl ComputeHook {
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
endpoint
.reconfigure(compute_pageservers.clone(), stripe_size)
.await?;
endpoint.reconfigure(compute_pageservers.clone()).await?;
}
}
@@ -337,26 +271,30 @@ impl ComputeHook {
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
stripe_size: ShardStripeSize,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let mut locked = self.state.lock().await;
let entry = locked
.entry(tenant_shard_id.tenant_id)
.or_insert_with(|| ComputeHookTenant { shards: Vec::new() });
use std::collections::hash_map::Entry;
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
let shard_index = ShardIndex {
shard_count: tenant_shard_id.shard_count,
shard_number: tenant_shard_id.shard_number,
};
let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
let mut set = false;
for (existing_shard, existing_node) in &mut entry.shards {
if *existing_shard == shard_index {
*existing_node = node_id;
set = true;
}
}
if !set {
entry.shards.push((shard_index, node_id));
}
let reconfigure_request = entry.maybe_reconfigure(tenant_shard_id.tenant_id).await;
let Some(reconfigure_request) = reconfigure_request else {
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
// until it does.
@@ -378,85 +316,3 @@ impl ComputeHook {
}
}
}
#[cfg(test)]
pub(crate) mod tests {
use pageserver_api::shard::{ShardCount, ShardNumber};
use utils::id::TenantId;
use super::*;
#[test]
fn tenant_updates() -> anyhow::Result<()> {
let tenant_id = TenantId::generate();
let mut tenant_state = ComputeHookTenant::new(
TenantShardId {
tenant_id,
shard_count: ShardCount::new(0),
shard_number: ShardNumber(0),
},
ShardStripeSize(12345),
NodeId(1),
);
// An unsharded tenant is always ready to emit a notification
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
1
);
assert!(tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size
.is_none());
// Writing the first shard of a multi-sharded situation (i.e. in a split)
// resets the tenant state and puts it in a non-notifying state (need to
// see all shards)
tenant_state.update(
TenantShardId {
tenant_id,
shard_count: ShardCount::new(2),
shard_number: ShardNumber(1),
},
ShardStripeSize(32768),
NodeId(1),
);
assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
// Writing the second shard makes it ready to notify
tenant_state.update(
TenantShardId {
tenant_id,
shard_count: ShardCount::new(2),
shard_number: ShardNumber(0),
},
ShardStripeSize(32768),
NodeId(1),
);
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
2
);
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size,
Some(ShardStripeSize(32768))
);
Ok(())
}
}

View File

@@ -1,18 +1,17 @@
use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use crate::PlacementPolicy;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use std::sync::Arc;
use std::time::{Duration, Instant};
use utils::auth::{Scope, SwappableJwtAuth};
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
use utils::auth::SwappableJwtAuth;
use utils::http::endpoint::{auth_middleware, request_span};
use utils::http::request::{must_get_query_param, parse_request_param};
use utils::id::{TenantId, TimelineId};
@@ -26,12 +25,12 @@ use utils::{
id::NodeId,
};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
};
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};
use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
use control_plane::attachment_service::{
AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
TenantShardMigrateRequest,
};
/// State available to HTTP request handlers
#[derive(Clone)]
@@ -65,18 +64,21 @@ fn get_state(request: &Request<Body>) -> &HttpState {
/// Pageserver calls into this on startup, to learn which tenants it should attach
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::GenerationsApi)?;
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
json_response(
StatusCode::OK,
state
.service
.re_attach(reattach_req)
.await
.map_err(ApiError::InternalServerError)?,
)
}
/// Pageserver calls into this before doing deletions, to confirm that it still
/// holds the latest generation for the tenants with deletions enqueued
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::GenerationsApi)?;
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.validate(validate_req))
@@ -86,8 +88,6 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
/// (in the real control plane this is unnecessary, because the same program is managing
/// generation numbers and doing attachments).
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
let state = get_state(&req);
@@ -102,8 +102,6 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
}
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
let state = get_state(&req);
@@ -115,17 +113,10 @@ async fn handle_tenant_create(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::PageServerApi)?;
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
// have no expectation of HA).
let placement_policy = PlacementPolicy::Single;
json_response(
StatusCode::CREATED,
service.tenant_create(create_req, placement_policy).await?,
service.tenant_create(create_req).await?,
)
}
@@ -180,8 +171,6 @@ async fn handle_tenant_location_config(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
json_response(
StatusCode::OK,
@@ -191,34 +180,11 @@ async fn handle_tenant_location_config(
)
}
async fn handle_tenant_config_set(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::PageServerApi)?;
let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
}
async fn handle_tenant_config_get(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
}
async fn handle_tenant_time_travel_remote_storage(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
let timestamp_raw = must_get_query_param(&req, "travel_to")?;
@@ -243,15 +209,7 @@ async fn handle_tenant_time_travel_remote_storage(
done_if_after_raw,
)
.await?;
json_response(StatusCode::OK, ())
}
async fn handle_tenant_secondary_download(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
service.tenant_secondary_download(tenant_id).await?;
json_response(StatusCode::OK, ())
}
@@ -260,7 +218,6 @@ async fn handle_tenant_delete(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
deletion_wrapper(service, move |service| async move {
service.tenant_delete(tenant_id).await
@@ -273,8 +230,6 @@ async fn handle_tenant_timeline_create(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
json_response(
StatusCode::CREATED,
@@ -289,8 +244,6 @@ async fn handle_tenant_timeline_delete(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
deletion_wrapper(service, move |service| async move {
@@ -304,7 +257,6 @@ async fn handle_tenant_timeline_passthrough(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let Some(path) = req.uri().path_and_query() else {
// This should never happen, our request router only calls us if there is a path
@@ -348,15 +300,11 @@ async fn handle_tenant_locate(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
let state = get_state(&req);
state.service.node_register(register_req).await?;
@@ -364,23 +312,17 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
}
async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.node_list().await?)
}
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
}
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let node_id: NodeId = parse_request_param(&req, "node_id")?;
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
if node_id != config_req.node_id {
@@ -390,18 +332,13 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
}
let state = get_state(&req);
json_response(
StatusCode::OK,
state.service.node_configure(config_req).await?,
)
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
}
async fn handle_tenant_shard_split(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
@@ -415,8 +352,6 @@ async fn handle_tenant_shard_migrate(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
json_response(
@@ -429,30 +364,22 @@ async fn handle_tenant_shard_migrate(
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
}
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
state.service.tenants_dump()
}
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
state.service.scheduler_dump()
}
async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.consistency_check().await?)
@@ -509,12 +436,6 @@ where
.await
}
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
check_permission_with(request, |claims| {
crate::auth::check_permission(claims, required_scope)
})
}
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
@@ -586,21 +507,12 @@ pub fn make_router(
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete)
})
.put("/v1/tenant/config", |r| {
tenant_service_handler(r, handle_tenant_config_set)
})
.get("/v1/tenant/:tenant_id/config", |r| {
tenant_service_handler(r, handle_tenant_config_get)
})
.put("/v1/tenant/:tenant_id/location_config", |r| {
tenant_service_handler(r, handle_tenant_location_config)
})
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
})
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
tenant_service_handler(r, handle_tenant_secondary_download)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_delete)

View File

@@ -1,7 +1,6 @@
use serde::{Deserialize, Serialize};
use utils::seqwait::MonotonicCounter;
mod auth;
mod compute_hook;
pub mod http;
pub mod metrics;
@@ -13,20 +12,14 @@ mod schema;
pub mod service;
mod tenant_state;
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
#[derive(Clone, Serialize, Deserialize, Debug)]
enum PlacementPolicy {
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
/// Create one secondary mode location. This is useful when onboarding
/// a tenant, or for an idle tenant that we might want to bring online quickly.
Secondary,
/// Do not attach to any pageservers. This is appropriate for tenants that
/// have been idle for a long time, where we do not mind some delay in making
/// them available in future.
/// Do not attach to any pageservers
Detached,
}

View File

@@ -9,7 +9,7 @@ use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use aws_config::{BehaviorVersion, Region};
use aws_config::{self, BehaviorVersion, Region};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
@@ -79,38 +79,13 @@ impl Secrets {
"neon-storage-controller-control-plane-jwt-token";
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
/// Load secrets from, in order of preference:
/// - CLI args if database URL is provided on the CLI
/// - Environment variables if DATABASE_URL is set.
/// - AWS Secrets Manager secrets
async fn load(args: &Cli) -> anyhow::Result<Self> {
match &args.database_url {
Some(url) => Self::load_cli(url, args),
None => match std::env::var(Self::DATABASE_URL_ENV) {
Ok(database_url) => Self::load_env(database_url),
Err(_) => Self::load_aws_sm().await,
},
None => Self::load_aws_sm().await,
}
}
fn load_env(database_url: String) -> anyhow::Result<Self> {
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
Err(_) => None,
};
Ok(Self {
database_url,
public_key,
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
})
}
async fn load_aws_sm() -> anyhow::Result<Self> {
let Ok(region) = std::env::var("AWS_REGION") else {
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");

View File

@@ -1,4 +1,4 @@
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use serde::Serialize;
use utils::id::NodeId;
@@ -10,7 +10,7 @@ use crate::persistence::NodePersistence;
///
/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
/// implementation of serialization on this type is only for debug dumps.
#[derive(Clone, Serialize)]
#[derive(Clone, Serialize, Eq, PartialEq)]
pub(crate) struct Node {
pub(crate) id: NodeId,

View File

@@ -6,12 +6,10 @@ use std::time::Duration;
use self::split_state::SplitState;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use diesel::pg::PgConnection;
use diesel::{
Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl,
Selectable, SelectableHelper,
};
use pageserver_api::controller_api::NodeSchedulingPolicy;
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use serde::{Deserialize, Serialize};
@@ -132,10 +130,24 @@ impl Persistence {
}
/// At startup, populate the list of nodes which our shards may be placed on
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
let nodes: Vec<NodePersistence> = self
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
let nodes: Vec<Node> = self
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
Ok(crate::schema::nodes::table
.load::<NodePersistence>(conn)?
.into_iter()
.map(|n| Node {
id: NodeId(n.node_id as u64),
// At startup we consider a node offline until proven otherwise.
availability: NodeAvailability::Offline,
scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
.expect("Bad scheduling policy in DB"),
listen_http_addr: n.listen_http_addr,
listen_http_port: n.listen_http_port as u16,
listen_pg_addr: n.listen_pg_addr,
listen_pg_port: n.listen_pg_port as u16,
})
.collect::<Vec<Node>>())
})
.await?;
@@ -144,31 +156,6 @@ impl Persistence {
Ok(nodes)
}
pub(crate) async fn update_node(
&self,
input_node_id: NodeId,
input_scheduling: NodeSchedulingPolicy,
) -> DatabaseResult<()> {
use crate::schema::nodes::dsl::*;
let updated = self
.with_conn(move |conn| {
let updated = diesel::update(nodes)
.filter(node_id.eq(input_node_id.0 as i64))
.set((scheduling_policy.eq(String::from(input_scheduling)),))
.execute(conn)?;
Ok(updated)
})
.await?;
if updated != 1 {
Err(DatabaseError::Logical(format!(
"Node {node_id:?} not found for update",
)))
} else {
Ok(())
}
}
/// At startup, load the high level state for shards, such as their config + policy. This will
/// be enriched at runtime with state discovered on pageservers.
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
@@ -333,15 +320,7 @@ impl Persistence {
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
};
let Some(g) = tsp.generation else {
// If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL:
// we only set generation_pageserver when setting generation.
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
));
};
result.insert(tenant_shard_id, Generation::new(g as u32));
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
}
Ok(result)
@@ -374,85 +353,7 @@ impl Persistence {
})
.await?;
// Generation is always non-null in the result: if the generation column had been NULL, then we
// should have experienced an SQL Conflict error while executing a query that tries to increment it.
debug_assert!(updated.generation.is_some());
let Some(g) = updated.generation else {
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
)
.into());
};
Ok(Generation::new(g as u32))
}
/// For use when updating a persistent property of a tenant, such as its config or placement_policy.
///
/// Do not use this for setting generation, unless in the special onboarding code path (/location_config)
/// API: use [`Self::increment_generation`] instead. Setting the generation via this route is a one-time thing
/// that we only do the first time a tenant is set to an attached policy via /location_config.
pub(crate) async fn update_tenant_shard(
&self,
tenant_shard_id: TenantShardId,
input_placement_policy: PlacementPolicy,
input_config: TenantConfig,
input_generation: Option<Generation>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
let query = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
if let Some(input_generation) = input_generation {
// Update includes generation column
query
.set((
generation.eq(Some(input_generation.into().unwrap() as i32)),
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} else {
// Update does not include generation column
query
.set((
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
}
Ok(())
})
.await?;
Ok(())
}
pub(crate) async fn update_tenant_config(
&self,
input_tenant_id: TenantId,
input_config: TenantConfig,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
.execute(conn)?;
Ok(())
})
.await?;
Ok(())
Ok(Generation::new(updated.generation as u32))
}
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
@@ -463,7 +364,7 @@ impl Persistence {
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.set((
generation_pageserver.eq(Option::<i64>::None),
generation_pageserver.eq(i64::MAX),
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
))
.execute(conn)?;
@@ -589,15 +490,12 @@ pub(crate) struct TenantShardPersistence {
pub(crate) shard_stripe_size: i32,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching.
//
// Generation is only None when first onboarding a tenant, where it may
// be in PlacementPolicy::Secondary and therefore have no valid generation state.
pub(crate) generation: Option<i32>,
// and use the incremented number when attaching
pub(crate) generation: i32,
// Currently attached pageserver
#[serde(rename = "pageserver")]
pub(crate) generation_pageserver: Option<i64>,
pub(crate) generation_pageserver: i64,
#[serde(default)]
pub(crate) placement_policy: String,
@@ -608,7 +506,7 @@ pub(crate) struct TenantShardPersistence {
}
/// Parts of [`crate::node::Node`] that are stored durably
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
#[diesel(table_name = crate::schema::nodes)]
pub(crate) struct NodePersistence {
pub(crate) node_id: i64,

View File

@@ -1,6 +1,6 @@
use crate::persistence::Persistence;
use crate::service;
use pageserver_api::controller_api::NodeAvailability;
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
@@ -26,7 +26,7 @@ pub(super) struct Reconciler {
/// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
pub(crate) generation: Option<Generation>,
pub(crate) generation: Generation,
pub(crate) intent: TargetState,
pub(crate) config: TenantConfig,
pub(crate) observed: ObservedState,
@@ -104,7 +104,6 @@ impl Reconciler {
node_id: NodeId,
config: LocationConfig,
flush_ms: Option<Duration>,
lazy: bool,
) -> anyhow::Result<()> {
let node = self
.pageservers
@@ -119,7 +118,7 @@ impl Reconciler {
let client =
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
client
.location_config(self.tenant_shard_id, config.clone(), flush_ms, lazy)
.location_config(self.tenant_shard_id, config.clone(), flush_ms)
.await?;
tracing::info!("location_config({}) complete: {:?}", node_id, config);
@@ -313,16 +312,11 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedStale,
self.generation,
Some(self.generation),
None,
);
self.location_config(
origin_ps_id,
stale_conf,
Some(Duration::from_secs(10)),
false,
)
.await?;
self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
.await?;
let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?);
@@ -341,23 +335,21 @@ impl Reconciler {
}
// Increment generation before attaching to new pageserver
self.generation = Some(
self.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?,
);
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?;
let dest_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::AttachedMulti,
self.generation,
Some(self.generation),
None,
);
tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id);
self.location_config(dest_ps_id, dest_conf, None, false)
.await?;
self.location_config(dest_ps_id, dest_conf, None).await?;
if let Some(baseline) = baseline_lsns {
tracing::info!("🕑 Waiting for LSN to catch up...");
@@ -389,7 +381,7 @@ impl Reconciler {
None,
Some(LocationConfigSecondary { warm: true }),
);
self.location_config(origin_ps_id, origin_secondary_conf.clone(), None, false)
self.location_config(origin_ps_id, origin_secondary_conf.clone(), None)
.await?;
// TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
// partway through. In fact, all location conf API calls should be in a wrapper that sets
@@ -409,10 +401,10 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedSingle,
self.generation,
Some(self.generation),
None,
);
self.location_config(dest_ps_id, dest_final_conf.clone(), None, false)
self.location_config(dest_ps_id, dest_final_conf.clone(), None)
.await?;
self.observed.locations.insert(
dest_ps_id,
@@ -441,67 +433,24 @@ impl Reconciler {
// If the attached pageserver is not attached, do so now.
if let Some(node_id) = self.intent.attached {
// If we are in an attached policy, then generation must have been set (null generations
// are only present when a tenant is initially loaded with a secondary policy)
debug_assert!(self.generation.is_some());
let Some(generation) = self.generation else {
return Err(ReconcileError::Other(anyhow::anyhow!(
"Attempted to attach with NULL generation"
)));
};
let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
let mut wanted_conf =
attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
tracing::info!(%node_id, "Observed configuration already correct.")
tracing::info!("Observed configuration already correct.")
}
observed => {
_ => {
// In all cases other than a matching observed configuration, we will
// reconcile this location. This includes locations with different configurations, as well
// as locations with unknown (None) observed state.
// The general case is to increment the generation. However, there are cases
// where this is not necessary:
// - if we are only updating the TenantConf part of the location
// - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
// and the location was already in the correct generation
let increment_generation = match observed {
None => true,
Some(ObservedStateLocation { conf: None }) => true,
Some(ObservedStateLocation {
conf: Some(observed),
}) => {
let generations_match = observed.generation == wanted_conf.generation;
use LocationConfigMode::*;
let mode_transition_requires_gen_inc =
match (observed.mode, wanted_conf.mode) {
// Usually the short-lived attachment modes (multi and stale) are only used
// in the case of [`Self::live_migrate`], but it is simple to handle them correctly
// here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
(AttachedSingle, AttachedStale) => false,
(AttachedMulti, AttachedSingle) => false,
(lhs, rhs) => lhs != rhs,
};
!generations_match || mode_transition_requires_gen_inc
}
};
if increment_generation {
let generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
self.generation = Some(generation);
wanted_conf.generation = generation.into();
}
tracing::info!(%node_id, "Observed configuration requires update.");
// Use lazy=true, because we may run many of Self concurrently, and do not want to
// overload the pageserver with logical size calculations.
self.location_config(node_id, wanted_conf, None, true)
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
wanted_conf.generation = self.generation.into();
tracing::info!("Observed configuration requires update.");
self.location_config(node_id, wanted_conf, None).await?;
self.compute_notify().await?;
}
}
@@ -553,7 +502,7 @@ impl Reconciler {
if self.cancel.is_cancelled() {
return Err(ReconcileError::Cancel);
}
self.location_config(node_id, conf, None, false).await?;
self.location_config(node_id, conf, None).await?;
}
Ok(())
@@ -565,12 +514,7 @@ impl Reconciler {
if let Some(node_id) = self.intent.attached {
let result = self
.compute_hook
.notify(
self.tenant_shard_id,
node_id,
self.shard.stripe_size,
&self.cancel,
)
.notify(self.tenant_shard_id, node_id, &self.cancel)
.await;
if let Err(e) = &result {
// It is up to the caller whether they want to drop out on this error, but they don't have to:

View File

@@ -175,33 +175,6 @@ impl Scheduler {
}
}
/// Where we have several nodes to choose from, for example when picking a secondary location
/// to promote to an attached location, this method may be used to pick the best choice based
/// on the scheduler's knowledge of utilization and availability.
///
/// If the input is empty, or all the nodes are not eligible for scheduling, return None: the
/// caller can pick a node some other way.
pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
if nodes.is_empty() {
return None;
}
let node = nodes
.iter()
.map(|node_id| {
let may_schedule = self
.nodes
.get(node_id)
.map(|n| n.may_schedule)
.unwrap_or(false);
(*node_id, may_schedule)
})
.max_by_key(|(_n, may_schedule)| *may_schedule);
// If even the preferred node has may_schedule==false, return None
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
}
pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
if self.nodes.is_empty() {
return Err(ScheduleError::NoPageservers);
@@ -251,44 +224,44 @@ impl Scheduler {
}
}
#[cfg(test)]
pub(crate) mod test_utils {
use crate::node::Node;
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
use std::collections::HashMap;
use utils::id::NodeId;
/// Test helper: synthesize the requested number of nodes, all in active state.
///
/// Node IDs start at one.
pub(crate) fn make_test_nodes(n: u64) -> HashMap<NodeId, Node> {
(1..n + 1)
.map(|i| {
(
NodeId(i),
Node {
id: NodeId(i),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
listen_http_addr: format!("httphost-{i}"),
listen_http_port: 80 + i as u16,
listen_pg_addr: format!("pghost-{i}"),
listen_pg_port: 5432 + i as u16,
},
)
})
.collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use utils::id::NodeId;
use crate::{node::Node, tenant_state::IntentState};
use crate::tenant_state::IntentState;
#[test]
fn scheduler_basic() -> anyhow::Result<()> {
let nodes = test_utils::make_test_nodes(2);
let mut nodes = HashMap::new();
nodes.insert(
NodeId(1),
Node {
id: NodeId(1),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
listen_http_addr: String::new(),
listen_http_port: 0,
listen_pg_addr: String::new(),
listen_pg_port: 0,
},
);
nodes.insert(
NodeId(2),
Node {
id: NodeId(2),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
listen_http_addr: String::new(),
listen_http_port: 0,
listen_pg_addr: String::new(),
listen_pg_port: 0,
},
);
let mut scheduler = Scheduler::new(nodes.values());
let mut t1_intent = IntentState::new();

View File

@@ -17,8 +17,8 @@ diesel::table! {
shard_number -> Int4,
shard_count -> Int4,
shard_stripe_size -> Int4,
generation -> Nullable<Int4>,
generation_pageserver -> Nullable<Int8>,
generation -> Int4,
generation_pageserver -> Int8,
placement_policy -> Varchar,
splitting -> Int2,
config -> Text,

File diff suppressed because it is too large

View File

@@ -1,7 +1,7 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use crate::{metrics, persistence::TenantShardPersistence};
use pageserver_api::controller_api::NodeAvailability;
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
@@ -53,11 +53,8 @@ pub(crate) struct TenantState {
pub(crate) sequence: Sequence,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching.
//
// None represents an incompletely onboarded tenant via the [`Service::location_config`]
// API, where this tenant may only run in PlacementPolicy::Secondary.
pub(crate) generation: Option<Generation>,
// and use the incremented number when attaching
pub(crate) generation: Generation,
// High level description of how the tenant should be set up. Provided
// externally.
@@ -146,23 +143,6 @@ impl IntentState {
}
}
/// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from
/// secondary to attached while maintaining the scheduler's reference counts.
pub(crate) fn promote_attached(
&mut self,
_scheduler: &mut Scheduler,
promote_secondary: NodeId,
) {
// If we call this with a node that isn't in secondary, it would cause incorrect
// scheduler reference counting, since we assume the node is already referenced as a secondary.
debug_assert!(self.secondary.contains(&promote_secondary));
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
// need to call into it here.
self.secondary.retain(|n| n != &promote_secondary);
self.attached = Some(promote_secondary);
}
pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
debug_assert!(!self.secondary.contains(&new_secondary));
scheduler.node_inc_ref(new_secondary);
@@ -184,13 +164,6 @@ impl IntentState {
}
}
/// Remove the last secondary node from the list of secondaries
pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
if let Some(node_id) = self.secondary.pop() {
scheduler.node_dec_ref(node_id);
}
}
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
if let Some(old_attached) = self.attached.take() {
scheduler.node_dec_ref(old_attached);
@@ -218,16 +191,12 @@ impl IntentState {
&self.secondary
}
/// If the node is in use as the attached location, demote it into
/// the list of secondary locations. This is used when a node goes offline,
/// and we want to use a different node for attachment, but not permanently
/// forget the location on the offline node.
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
// need to call into it here.
self.attached = None;
self.secondary.push(node_id);
true
@@ -327,7 +296,7 @@ pub(crate) struct ReconcileResult {
pub(crate) result: Result<(), ReconcileError>,
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) generation: Option<Generation>,
pub(crate) generation: Generation,
pub(crate) observed: ObservedState,
/// Set [`TenantState::pending_compute_notification`] from this flag
@@ -352,7 +321,7 @@ impl TenantState {
tenant_shard_id,
policy,
intent: IntentState::default(),
generation: Some(Generation::new(0)),
generation: Generation::new(0),
shard,
observed: ObservedState::default(),
config: TenantConfig::default(),
@@ -401,9 +370,6 @@ impl TenantState {
// All remaining observed locations generate secondary intents. This includes None
// observations, as these may well have some local content on disk that is usable (this
// is an edge case that might occur if we restarted during a migration or other change)
//
// We may leave intent.attached empty if we didn't find any attached locations: [`Self::schedule`]
// will take care of promoting one of these secondaries to be attached.
self.observed.locations.keys().for_each(|node_id| {
if Some(*node_id) != self.intent.attached {
self.intent.secondary.push(*node_id);
@@ -411,33 +377,6 @@ impl TenantState {
});
}
/// Part of [`Self::schedule`] that is used to choose exactly one node to act as the
/// attached pageserver for a shard.
///
/// Returns whether we modified it, and the NodeId selected.
fn schedule_attached(
&mut self,
scheduler: &mut Scheduler,
) -> Result<(bool, NodeId), ScheduleError> {
// No work to do if we already have an attached tenant
if let Some(node_id) = self.intent.attached {
return Ok((false, node_id));
}
if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) {
// Promote a secondary
tracing::debug!("Promoted secondary {} to attached", promote_secondary);
self.intent.promote_attached(scheduler, promote_secondary);
Ok((true, promote_secondary))
} else {
// Pick a fresh node: either we had no secondaries or none were schedulable
let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
tracing::debug!("Selected {} as attached", node_id);
self.intent.set_attached(scheduler, Some(node_id));
Ok((true, node_id))
}
}
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
// TODO: before scheduling new nodes, check if any existing content in
// self.intent refers to pageservers that are offline, and pick other
@@ -448,49 +387,33 @@ impl TenantState {
// Build the set of pageservers already in use by this tenant, to avoid scheduling
// more work on the same pageservers we're already using.
let mut used_pageservers = self.intent.all_pageservers();
let mut modified = false;
// Add/remove nodes to fulfil policy
use PlacementPolicy::*;
match self.policy {
Single => {
// Should have exactly one attached, and zero secondaries
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.set_attached(scheduler, Some(node_id));
used_pageservers.push(node_id);
modified = true;
}
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
}
Double(secondary_count) => {
let retain_secondaries = if self.intent.attached.is_none()
&& scheduler.node_preferred(&self.intent.secondary).is_some()
{
// If we have no attached, and one of the secondaries is eligible to be promoted, retain
// one more secondary than we usually would, as one of them will become attached further down this function.
secondary_count + 1
} else {
secondary_count
};
while self.intent.secondary.len() > retain_secondaries {
// We have no particular preference for one secondary location over another: just
// arbitrarily drop from the end
self.intent.pop_secondary(scheduler);
// Should have exactly one attached, and N secondaries
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.set_attached(scheduler, Some(node_id));
used_pageservers.push(node_id);
modified = true;
}
// Should have exactly one attached, and N secondaries
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
let mut used_pageservers = vec![attached_node_id];
while self.intent.secondary.len() < secondary_count {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.push_secondary(scheduler, node_id);
@@ -498,28 +421,15 @@ impl TenantState {
modified = true;
}
}
Secondary => {
if let Some(node_id) = self.intent.get_attached() {
// Populate secondary by demoting the attached node
self.intent.demote_attached(*node_id);
modified = true;
} else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node
let node_id = scheduler.schedule_shard(&[])?;
self.intent.push_secondary(scheduler, node_id);
modified = true;
}
while self.intent.secondary.len() > 1 {
// We have no particular preference for one secondary location over another: just
// arbitrarily drop from the end
self.intent.pop_secondary(scheduler);
modified = true;
}
}
Detached => {
// Never add locations in this mode
if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() {
self.intent.clear(scheduler);
// Should have no attached or secondary pageservers
if self.intent.attached.is_some() {
self.intent.set_attached(scheduler, None);
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
}
@@ -566,12 +476,7 @@ impl TenantState {
fn dirty(&self) -> bool {
if let Some(node_id) = self.intent.attached {
// Maybe panic: it is a severe bug if we try to attach while generation is null.
let generation = self
.generation
.expect("Attempted to enter attached state without a generation");
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {
@@ -649,10 +554,6 @@ impl TenantState {
// Reconcile already in flight for the current sequence?
if let Some(handle) = &self.reconciler {
if handle.sequence == self.sequence {
tracing::info!(
"Reconciliation already in progress for sequence {:?}",
self.sequence,
);
return Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
@@ -672,10 +573,6 @@ impl TenantState {
return None;
};
// Advance the sequence before spawning a reconciler, so that sequence waiters
// can distinguish between before+after the reconcile completes.
self.sequence = self.sequence.next();
let reconciler_cancel = cancel.child_token();
let mut reconciler = Reconciler {
tenant_shard_id: self.tenant_shard_id,
@@ -777,17 +674,6 @@ impl TenantState {
})
}
/// Called when a ReconcileResult has been emitted and the service is updating
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
/// the handle to indicate there is no longer a reconciliation in progress.
pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) {
if let Some(reconcile_handle) = &self.reconciler {
if reconcile_handle.sequence <= sequence {
self.reconciler = None;
}
}
}
// If we had any state at all referring to this node ID, drop it. Does not
// attempt to reschedule.
pub(crate) fn deref_node(&mut self, node_id: NodeId) {
@@ -808,93 +694,11 @@ impl TenantState {
shard_number: self.tenant_shard_id.shard_number.0 as i32,
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: self.shard.stripe_size.0 as i32,
generation: self.generation.map(|g| g.into().unwrap_or(0) as i32),
generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64),
generation: self.generation.into().unwrap_or(0) as i32,
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&self.policy).unwrap(),
config: serde_json::to_string(&self.config).unwrap(),
splitting: SplitState::default(),
}
}
}
#[cfg(test)]
pub(crate) mod tests {
use pageserver_api::shard::{ShardCount, ShardNumber};
use utils::id::TenantId;
use crate::scheduler::test_utils::make_test_nodes;
use super::*;
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
let tenant_id = TenantId::generate();
let shard_number = ShardNumber(0);
let shard_count = ShardCount::new(1);
let tenant_shard_id = TenantShardId {
tenant_id,
shard_number,
shard_count,
};
TenantState::new(
tenant_shard_id,
ShardIdentity::new(
shard_number,
shard_count,
pageserver_api::shard::ShardStripeSize(32768),
)
.unwrap(),
policy,
)
}
/// Test the scheduling behaviors used when a tenant configured for HA is subject
/// to nodes being marked offline.
#[test]
fn tenant_ha_scheduling() -> anyhow::Result<()> {
// Start with three nodes. Our tenant will only use two. The third one is
// expected to remain unused.
let mut nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
tenant_state
.schedule(&mut scheduler)
.expect("we have enough nodes, scheduling should work");
// Expect to initially be scheduled onto different nodes
assert_eq!(tenant_state.intent.secondary.len(), 1);
assert!(tenant_state.intent.attached.is_some());
let attached_node_id = tenant_state.intent.attached.unwrap();
let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
assert_ne!(attached_node_id, secondary_node_id);
// Notifying the attached node is offline should demote it to a secondary
let changed = tenant_state.intent.demote_attached(attached_node_id);
assert!(changed);
assert!(tenant_state.intent.attached.is_none());
assert_eq!(tenant_state.intent.secondary.len(), 2);
// Update the scheduler state to indicate the node is offline
nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
// Scheduling the node should promote the still-available secondary node to attached
tenant_state
.schedule(&mut scheduler)
.expect("active nodes are available");
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
// The original attached node should have been retained as a secondary
assert_eq!(
*tenant_state.intent.secondary.iter().last().unwrap(),
attached_node_id
);
tenant_state.intent.clear(&mut scheduler);
Ok(())
}
}

View File

@@ -2,12 +2,8 @@ use crate::{background_process, local_env::LocalEnv};
use camino::{Utf8Path, Utf8PathBuf};
use hyper::Method;
use pageserver_api::{
controller_api::{
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
TenantShardMigrateRequest, TenantShardMigrateResponse,
},
models::{
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
TimelineCreateRequest, TimelineInfo,
},
shard::TenantShardId,
@@ -15,12 +11,12 @@ use pageserver_api::{
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{fs, str::FromStr};
use std::str::FromStr;
use tokio::process::Command;
use tracing::instrument;
use url::Url;
use utils::{
auth::{encode_from_key_file, Claims, Scope},
auth::{Claims, Scope},
id::{NodeId, TenantId},
};
@@ -28,7 +24,7 @@ pub struct AttachmentService {
env: LocalEnv,
listen: String,
path: Utf8PathBuf,
private_key: Option<Vec<u8>>,
jwt_token: Option<String>,
public_key: Option<String>,
postgres_port: u16,
client: reqwest::Client,
@@ -59,6 +55,126 @@ pub struct InspectResponse {
pub attachment: Option<(u32, NodeId)>,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub generation: u32,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponse {
pub shards: Vec<TenantCreateResponseShard>,
}
#[derive(Serialize, Deserialize)]
pub struct NodeRegisterRequest {
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct NodeConfigureRequest {
pub node_id: NodeId,
pub availability: Option<NodeAvailability>,
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantLocateResponse {
pub shards: Vec<TenantLocateResponseShard>,
pub shard_params: ShardParameters,
}
/// Explicitly migrating a particular shard is a low level operation
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeAvailability {
// Normal, happy state
Active,
// Offline: Tenants shouldn't try to attach here, but they may assume that their
// secondary locations on this node still exist. Newly added nodes are in this
// state until we successfully contact them.
Offline,
}
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
/// type needs to be defined with diesel traits in there.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeSchedulingPolicy {
Active,
Filling,
Pause,
Draining,
}
impl FromStr for NodeSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"filling" => Ok(Self::Filling),
"pause" => Ok(Self::Pause),
"draining" => Ok(Self::Draining),
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
}
}
}
impl From<NodeSchedulingPolicy> for String {
fn from(value: NodeSchedulingPolicy) -> String {
use NodeSchedulingPolicy::*;
match value {
Active => "active",
Filling => "filling",
Pause => "pause",
Draining => "draining",
}
.to_string()
}
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
impl AttachmentService {
pub fn from_env(env: &LocalEnv) -> Self {
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
@@ -88,11 +204,12 @@ impl AttachmentService {
.pageservers
.first()
.expect("Config is validated to contain at least one pageserver");
let (private_key, public_key) = match ps_conf.http_auth_type {
let (jwt_token, public_key) = match ps_conf.http_auth_type {
AuthType::Trust => (None, None),
AuthType::NeonJWT => {
let private_key_path = env.get_private_key_path();
let private_key = fs::read(private_key_path).expect("failed to read private key");
let jwt_token = env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap();
// If pageserver auth is enabled, this implicitly enables auth for this service,
// using the same credentials.
@@ -118,7 +235,7 @@ impl AttachmentService {
} else {
std::fs::read_to_string(&public_key_path).expect("Can't read public key")
};
(Some(private_key), Some(public_key))
(Some(jwt_token), Some(public_key))
}
};
@@ -126,7 +243,7 @@ impl AttachmentService {
env: env.clone(),
path,
listen,
private_key,
jwt_token,
public_key,
postgres_port,
client: reqwest::ClientBuilder::new()
@@ -200,7 +317,7 @@ impl AttachmentService {
"localhost",
"-p",
&format!("{}", self.postgres_port),
DB_NAME,
&DB_NAME,
])
.output()
.await
@@ -280,10 +397,7 @@ impl AttachmentService {
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
if let Some(private_key) = &self.private_key {
let claims = Claims::new(None, Scope::PageServerApi);
let jwt_token =
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
if let Some(jwt_token) = &self.jwt_token {
args.push(format!("--jwt-token={jwt_token}"));
}
@@ -308,7 +422,7 @@ impl AttachmentService {
)],
background_process::InitialPidFile::Create(self.pid_file()),
|| async {
match self.ready().await {
match self.status().await {
Ok(_) => Ok(true),
Err(_) => Ok(false),
}
@@ -354,20 +468,6 @@ impl AttachmentService {
Ok(())
}
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
let category = match path.find('/') {
Some(idx) => &path[..idx],
None => path,
};
match category {
"status" | "ready" => Ok(None),
"control" | "debug" => Ok(Some(Claims::new(None, Scope::Admin))),
"v1" => Ok(Some(Claims::new(None, Scope::PageServerApi))),
_ => Err(anyhow::anyhow!("Failed to determine claims for {}", path)),
}
}
/// Simple HTTP request wrapper for calling into attachment service
async fn dispatch<RQ, RS>(
&self,
@@ -393,16 +493,11 @@ impl AttachmentService {
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(private_key) = &self.private_key {
println!("Getting claims for path {}", path);
if let Some(required_claims) = Self::get_claims_for_path(&path)? {
println!("Got claims {:?} for path {}", required_claims, path);
let jwt_token = encode_from_key_file(&required_claims, private_key)?;
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await?;
@@ -522,8 +617,8 @@ impl AttachmentService {
}
#[instrument(skip(self))]
pub async fn ready(&self) -> anyhow::Result<()> {
self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None)
pub async fn status(&self) -> anyhow::Result<()> {
self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
.await
}

View File

@@ -8,15 +8,14 @@
use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
use compute_api::spec::ComputeMode;
use control_plane::attachment_service::AttachmentService;
use control_plane::attachment_service::{
AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
};
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::{InitForceMode, LocalEnv};
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::{broker, local_env};
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
};
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
@@ -617,7 +616,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
let tenant_id = get_tenant_id(create_match, env)?;
let new_branch_name = create_match
.get_one::<String>("branch-name")
.ok_or_else(|| anyhow!("No branch name provided"))?;
.ok_or_else(|| anyhow!("No branch name provided"))?; // TODO
let pg_version = create_match
.get_one::<u32>("pg-version")
@@ -1024,7 +1023,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
})
.collect::<Vec<_>>()
};
endpoint.reconfigure(pageservers, None).await?;
endpoint.reconfigure(pageservers).await?;
}
"stop" => {
let endpoint_id = sub_args

View File

@@ -52,7 +52,6 @@ use compute_api::spec::RemoteExtSpec;
use compute_api::spec::Role;
use nix::sys::signal::kill;
use nix::sys::signal::Signal;
use pageserver_api::shard::ShardStripeSize;
use serde::{Deserialize, Serialize};
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
@@ -591,7 +590,6 @@ impl Endpoint {
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
primary_is_running: None,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -606,7 +604,7 @@ impl Endpoint {
let conn_str = self.connstr("cloud_admin", "postgres");
println!("Starting postgres node at '{}'", conn_str);
if create_test_user {
let conn_str = self.connstr("test", "neondb");
let conn_str = self.connstr("user", "neondb");
println!("Also at '{}'", conn_str);
}
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
@@ -736,11 +734,7 @@ impl Endpoint {
}
}
pub async fn reconfigure(
&self,
mut pageservers: Vec<(Host, u16)>,
stripe_size: Option<ShardStripeSize>,
) -> Result<()> {
pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> {
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
@@ -770,9 +764,6 @@ impl Endpoint {
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
assert!(!pageserver_connstr.is_empty());
spec.pageserver_connstring = Some(pageserver_connstr);
if stripe_size.is_some() {
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
}
let client = reqwest::Client::new();
let response = client

View File

@@ -412,17 +412,14 @@ impl LocalEnv {
// this function is used only for testing purposes in CLI, e.g. to generate tokens during init
pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
let private_key_path = self.get_private_key_path();
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
}
pub fn get_private_key_path(&self) -> PathBuf {
if self.private_key_path.is_absolute() {
let private_key_path = if self.private_key_path.is_absolute() {
self.private_key_path.to_path_buf()
} else {
self.base_data_dir.join(&self.private_key_path)
}
};
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
}
//

View File

@@ -17,7 +17,6 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::controller_api::NodeRegisterRequest;
use pageserver_api::models::{
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
};
@@ -31,7 +30,7 @@ use utils::{
lsn::Lsn,
};
use crate::attachment_service::AttachmentService;
use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv};
@@ -116,7 +115,7 @@ impl PageServerNode {
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
let jwt_token = self
.env
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap();
overrides.push(format!("control_plane_api_token='{}'", jwt_token));
}
@@ -211,25 +210,6 @@ impl PageServerNode {
update_config: bool,
register: bool,
) -> anyhow::Result<()> {
// Register the node with the storage controller before starting pageserver: pageserver must be registered to
// successfully call /re-attach and finish starting up.
if register {
let attachment_service = AttachmentService::from_env(&self.env);
let (pg_host, pg_port) =
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
attachment_service
.node_register(NodeRegisterRequest {
node_id: self.conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
}
// TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path();
print!(
@@ -268,6 +248,23 @@ impl PageServerNode {
)
.await?;
if register {
let attachment_service = AttachmentService::from_env(&self.env);
let (pg_host, pg_port) =
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
attachment_service
.node_register(NodeRegisterRequest {
node_id: self.conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
}
Ok(())
}
@@ -353,11 +350,6 @@ impl PageServerNode {
.remove("compaction_threshold")
.map(|x| x.parse::<usize>())
.transpose()?,
compaction_algorithm: settings
.remove("compaction_algorithm")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
@@ -397,6 +389,11 @@ impl PageServerNode {
evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()),
gc_feedback: settings
.remove("gc_feedback")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
@@ -461,11 +458,6 @@ impl PageServerNode {
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'compaction_threshold' as an integer")?,
compaction_algorithm: settings
.remove("compactin_algorithm")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
@@ -507,6 +499,11 @@ impl PageServerNode {
evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()),
gc_feedback: settings
.remove("gc_feedback")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
@@ -537,11 +534,10 @@ impl PageServerNode {
tenant_shard_id: TenantShardId,
config: LocationConfig,
flush_ms: Option<Duration>,
lazy: bool,
) -> anyhow::Result<()> {
Ok(self
.http_client
.location_config(tenant_shard_id, config, flush_ms, lazy)
.location_config(tenant_shard_id, config, flush_ms)
.await?)
}
@@ -606,7 +602,7 @@ impl PageServerNode {
eprintln!("connection error: {}", e);
}
});
let client = std::pin::pin!(client);
tokio::pin!(client);
// Init base reader
let (start_lsn, base_tarfile_path) = base;

View File

@@ -70,9 +70,6 @@ Should only be used e.g. for status check/tenant creation/list.
Should only be used e.g. for status check.
Currently also used for connection from any pageserver to any safekeeper.
"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
"admin": Provides access to the control plane and admin APIs of the attachment service.
### CLI
CLI generates a key pair during call to `neon_local init` with the following commands:

View File

@@ -1,4 +1,4 @@
# Neon storage node — alternative
# Zenith storage node — alternative
## **Design considerations**

View File

@@ -1,6 +1,6 @@
# Command line interface (end-user)
Neon CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside neon distribution at least at the start.
Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start.
This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots.
@@ -8,40 +8,40 @@ The most important concept here is a snapshot, which can be created/pushed/pulle
# Possible usage scenarios
## Install neon, run a postgres
## Install zenith, run a postgres
```
> brew install pg-neon
> neon pg create # creates pgdata with default pattern pgdata$i
> neon pg list
> brew install pg-zenith
> zenith pg create # creates pgdata with default pattern pgdata$i
> zenith pg list
ID PGDATA USED STORAGE ENDPOINT
primary1 pgdata1 0G neon-local localhost:5432
primary1 pgdata1 0G zenith-local localhost:5432
```
## Import standalone postgres to neon
## Import standalone postgres to zenith
```
> neon snapshot import --from=basebackup://replication@localhost:5432/ oldpg
> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg
[====================------------] 60% | 20MB/s
> neon snapshot list
> zenith snapshot list
ID SIZE PARENT
oldpg 5G -
> neon pg create --snapshot oldpg
> zenith pg create --snapshot oldpg
Started postgres on localhost:5432
> neon pg list
> zenith pg list
ID PGDATA USED STORAGE ENDPOINT
primary1 pgdata1 5G neon-local localhost:5432
primary1 pgdata1 5G zenith-local localhost:5432
> neon snapshot destroy oldpg
> zenith snapshot destroy oldpg
Ok
```
Also, we may start snapshot import implicitly by looking at snapshot schema
```
> neon pg create --snapshot basebackup://replication@localhost:5432/
> zenith pg create --snapshot basebackup://replication@localhost:5432/
Downloading snapshot... Done.
Started postgres on localhost:5432
Destroying snapshot... Done.
@@ -52,39 +52,39 @@ Destroying snapshot... Done.
Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage).
```
> neon pg create --snapshot http://learn-postgres.com/movies_db.neon movies
> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies
```
## Create snapshot and push it to the cloud
```
> neon snapshot create pgdata1@snap1
> neon snapshot push --to ssh://stas@neon.tech pgdata1@snap1
> zenith snapshot create pgdata1@snap1
> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1
```
## Rollback database to the snapshot
One way to roll back the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot, which is a time-consuming operation. Another option that would be cool to support is the ability to create a copy-on-write database from the snapshot without copying data, and store updated pages in a separate location; however, that approach would have performance implications. So to properly roll back the database to an older state we have `neon pg checkout`.
One way to roll back the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot, which is a time-consuming operation. Another option that would be cool to support is the ability to create a copy-on-write database from the snapshot without copying data, and store updated pages in a separate location; however, that approach would have performance implications. So to properly roll back the database to an older state we have `zenith pg checkout`.
```
> neon pg list
> zenith pg list
ID PGDATA USED STORAGE ENDPOINT
primary1 pgdata1 5G neon-local localhost:5432
primary1 pgdata1 5G zenith-local localhost:5432
> neon snapshot create pgdata1@snap1
> zenith snapshot create pgdata1@snap1
> neon snapshot list
> zenith snapshot list
ID SIZE PARENT
oldpg 5G -
pgdata1@snap1 6G -
pgdata1@CURRENT 6G -
> neon pg checkout pgdata1@snap1
> zenith pg checkout pgdata1@snap1
Stopping postgres on pgdata1.
Rolling back pgdata1@CURRENT to pgdata1@snap1.
Starting postgres on pgdata1.
> neon snapshot list
> zenith snapshot list
ID SIZE PARENT
oldpg 5G -
pgdata1@snap1 6G -
@@ -99,7 +99,7 @@ Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state
PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite).
```
> neon pitr create --storage s3tank --ttl 30d --name pitr_last_month
> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month
```
Resetting the database to some state in the past would require creating a snapshot at some lsn / time in this pitr area.
@@ -108,29 +108,29 @@ Resetting the database to some state in past would require creating a snapshot o
## storage
Storage is either neon pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default.
Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default.
**neon storage attach** -t [native|s3] -c key=value -n name
**zenith storage attach** -t [native|s3] -c key=value -n name
Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=neon.tech/stas/mystore. Other possible term for native is 'zstore'.
Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'.
**neon storage list**
**zenith storage list**
Show currently attached storages. For example:
```
> neon storage list
> zenith storage list
NAME USED TYPE OPTIONS PATH
local 5.1G neon-local /opt/neon/store/local
local.compr 20.4G neon-local compression=on /opt/neon/store/local.compr
zcloud 60G neon-remote neon.tech/stas/mystore
local 5.1G zenith-local /opt/zenith/store/local
local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr
zcloud 60G zenith-remote zenith.tech/stas/mystore
s3tank 80G S3
```
**neon storage detach**
**zenith storage detach**
**neon storage show**
**zenith storage show**
@@ -140,29 +140,29 @@ Manages postgres data directories and can start postgres instances with proper c
Pg is a term for a single postgres running on some data. I'm trying to avoid separation of datadir management and postgres instance management -- both that concepts bundled here together.
**neon pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata
**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata
Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr.
--no-start: just init the datadir without starting postgres
--snapshot snap: init from the snapshot. Snap is a name or URL (neon.tech/stas/mystore/snap1)
--snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1)
--cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of currently running a database)
**neon pg destroy**
**zenith pg destroy**
**neon pg start** [--replica] pgdata
**zenith pg start** [--replica] pgdata
Start postgres with proper extensions preloaded/installed.
**neon pg checkout**
**zenith pg checkout**
Rollback data directory to some previous snapshot.
**neon pg stop** pg_id
**zenith pg stop** pg_id
**neon pg list**
**zenith pg list**
```
ROLE PGDATA USED STORAGE ENDPOINT
@@ -173,7 +173,7 @@ primary my_pg2 3.2G local.compr localhost:5435
- my_pg3 9.2G local.compr -
```
**neon pg show**
**zenith pg show**
```
my_pg:
@@ -194,7 +194,7 @@ my_pg:
```
**neon pg start-rest/graphql** pgdata
**zenith pg start-rest/graphql** pgdata
Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea.
@@ -203,35 +203,35 @@ Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that,
Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout.
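As a tiny illustration of the naming convention above (the parser and the default-to-CURRENT behaviour are assumptions made for the example, not part of the CLI):

```rust
/// Illustrative only: split a snapshot name of the form pgdata_name@tag_name.
/// A bare name without '@' is treated here as referring to the CURRENT tag.
fn parse_snapshot_name(name: &str) -> (&str, &str) {
    match name.split_once('@') {
        Some((pgdata, tag)) => (pgdata, tag),
        None => (name, "CURRENT"),
    }
}

fn main() {
    assert_eq!(parse_snapshot_name("pgdata1@snap1"), ("pgdata1", "snap1"));
    assert_eq!(parse_snapshot_name("pgdata1"), ("pgdata1", "CURRENT"));
}
```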
**neon snapshot create** pgdata_name@snap_name
**zenith snapshot create** pgdata_name@snap_name
Creates a new snapshot in the same storage where pgdata_name exists.
**neon snapshot push** --to url pgdata_name@snap_name
**zenith snapshot push** --to url pgdata_name@snap_name
Produces a binary stream of a given snapshot. Under the hood it starts a temp read-only postgres over this snapshot and sends a basebackup stream. The receiving side should start `neon snapshot recv` before the push happens. If the url has some special schema like neon:// the receiving side may require auth and start `neon snapshot recv` on the go.
Produces a binary stream of a given snapshot. Under the hood it starts a temp read-only postgres over this snapshot and sends a basebackup stream. The receiving side should start `zenith snapshot recv` before the push happens. If the url has some special schema like zenith:// the receiving side may require auth and start `zenith snapshot recv` on the go.
**neon snapshot recv**
**zenith snapshot recv**
Starts listening on a port for a basebackup stream, prints connection info to stdout (so that the user may use it in the push command), and expects data on that socket.
**neon snapshot pull** --from url or path
**zenith snapshot pull** --from url or path
Connects to a remote neon/s3/file and pulls snapshot. The remote site should be neon service or files in our format.
Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format.
**neon snapshot import** --from basebackup://<...> or path
**zenith snapshot import** --from basebackup://<...> or path
Creates a new snapshot out of running postgres via basebackup protocol or basebackup files.
**neon snapshot export**
**zenith snapshot export**
Starts a read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be neon's own format, which is handy for us (but I think just tar of basebackup would be okay).
Starts a read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith's own format, which is handy for us (but I think just tar of basebackup would be okay).
**neon snapshot diff** snap1 snap2
**zenith snapshot diff** snap1 snap2
Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses.
**neon snapshot destroy**
**zenith snapshot destroy**
## pitr
@@ -239,7 +239,7 @@ Pitr represents wal stream and ttl policy for that stream
XXX: any suggestions on a better name?
**neon pitr create** name
**zenith pitr create** name
--ttl = inf | period
@@ -247,21 +247,21 @@ XXX: any suggestions on a better name?
--storage = storage_name
**neon pitr extract-snapshot** pitr_name --lsn xxx
**zenith pitr extract-snapshot** pitr_name --lsn xxx
Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export)
**neon pitr gc** pitr_name
**zenith pitr gc** pitr_name
Force garbage collection on some PITR area.
**neon pitr list**
**zenith pitr list**
**neon pitr destroy**
**zenith pitr destroy**
## console
**neon console**
**zenith console**
Opens browser targeted at web console with the more or less same functionality as described here.

View File

@@ -6,7 +6,7 @@ When do we consider the WAL record as durable, so that we can
acknowledge the commit to the client and be reasonably certain that we
will not lose the transaction?
Neon uses a group of WAL safekeeper nodes to hold the generated WAL.
Zenith uses a group of WAL safekeeper nodes to hold the generated WAL.
A WAL record is considered durable, when it has been written to a
majority of WAL safekeeper nodes. In this document, I use 5
safekeepers, because I have five fingers. A WAL record is durable,

View File

@@ -1,23 +1,23 @@
# Neon local
# Zenith local
Here I list some objectives to keep in mind when discussing neon-local design and a proposal that brings all components together. Your comments on both parts are very welcome.
Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together. Your comments on both parts are very welcome.
#### Why do we need it?
- For distribution - this easy to use binary will help us to build adoption among developers.
- For internal use - to test all components together.
In my understanding, we consider it to be just a mock-up version of neon-cloud.
In my understanding, we consider it to be just a mock-up version of zenith-cloud.
> Question: How much should we care about durability and security issues for a local setup?
#### Why is it better than a simple local postgres?
- Easy one-line setup. As simple as `cargo install neon && neon start`
- Easy one-line setup. As simple as `cargo install zenith && zenith start`
- Quick and cheap creation of compute nodes over the same storage.
> Question: How can we describe a use-case for this feature?
- Neon-local can work with S3 directly.
- Zenith-local can work with S3 directly.
- Push and pull images (snapshots) to remote S3 to exchange data with other users.
@@ -31,50 +31,50 @@ Ideally, just one binary that incorporates all elements we need.
#### Components:
- **neon-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way.
CLI proposal is here https://github.com/neondatabase/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md
WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src/bin/cli
- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way.
CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md
WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli
- **neon-console** - WEB UI with same functionality as CLI.
- **zenith-console** - WEB UI with same functionality as CLI.
>Note: not for the first release.
- **neon-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below.
> Idea: spawn all other components as child processes, so that we could shutdown everything by stopping neon-local.
- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below.
> Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local.
- **neon-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation).
- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation).
> Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server?
WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src
WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src
- **neon-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to neon.
- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith.
> Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)?
> Question: Do we use it together with local page store or they are interchangeable?
WIP code is ???
- **neon-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" succeeded.
- **zenith-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" succeeded.
> Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system.
WIP code is here: https://github.com/neondatabase/postgres/tree/main/src/bin/safekeeper
WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper
- **neon-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database.
- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database.
WIP code is in main branch and here: https://github.com/neondatabase/postgres/commits/compute_node
WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node
#### REST API:
Service endpoint: `http://localhost:3000`
Resources:
- /storages - Where data lives: neon-pageserver or neon-s3
- /pgs - Postgres - neon-computenode
- /storages - Where data lives: zenith-pageserver or zenith-s3
- /pgs - Postgres - zenith-computenode
- /snapshots - snapshots **TODO**
>Question: Do we want to extend this API to manage neon components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all?
>Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all?
Methods and their mapping to CLI:
- /storages - neon-pageserver or neon-s3
- /storages - zenith-pageserver or zenith-s3
CLI | REST API
------------- | -------------
@@ -84,7 +84,7 @@ storage list | GET /storages
storage show -n name | GET /storages/:storage_name
- /pgs - neon-computenode
- /pgs - zenith-computenode
CLI | REST API
------------- | -------------

View File

@@ -1,45 +1,45 @@
Neon CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog".
Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog".
# CLI v2 (after chatting with Carl)
Neon introduces the notion of a repository.
Zenith introduces the notion of a repository.
```bash
neon init
neon clone neon://neon.tech/piedpiper/northwind -- clones a repo to the northwind directory
zenith init
zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory
```
Once you have a cluster catalog you can explore it
```bash
neon log -- returns a list of commits
neon status -- returns if there are changes in the catalog that can be committed
neon commit -- commits the changes and generates a new commit hash
neon branch experimental <hash> -- creates a branch called experimental based on a given commit hash
zenith log -- returns a list of commits
zenith status -- returns if there are changes in the catalog that can be committed
zenith commit -- commits the changes and generates a new commit hash
zenith branch experimental <hash> -- creates a branch called experimental based on a given commit hash
```
To make changes in the catalog you need to run compute nodes
```bash
-- here is how you start a compute node
neon start /home/pipedpiper/northwind:main -- starts a compute instance
neon start neon://neon.tech/northwind:main -- starts a compute instance in the cloud
zenith start /home/pipedpiper/northwind:main -- starts a compute instance
zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud
-- you can start a compute node against any hash or branch
neon start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port)
zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port)
-- you can start a compute node against any hash or branch
neon start /home/pipedpiper/northwind:<hash> --port 8009 -- start another compute instance (on different port)
zenith start /home/pipedpiper/northwind:<hash> --port 8009 -- start another compute instance (on different port)
-- After running some DML you can run
-- neon status and see how there are two WAL streams one on top of
-- zenith status and see how there are two WAL streams one on top of
-- the main branch
neon status
zenith status
-- and another on top of the experimental branch
neon status -b experimental
zenith status -b experimental
-- you can commit each branch separately
neon commit main
zenith commit main
-- or
neon commit -c /home/pipedpiper/northwind:experimental
zenith commit -c /home/pipedpiper/northwind:experimental
```
Starting compute instances against cloud environments
@@ -47,18 +47,18 @@ Starting compute instances against cloud environments
```bash
-- you can start a compute instance against the cloud environment
-- in this case all of the changes will be streamed into the cloud
neon start https://neon:tech/pipedpiper/northwind:main
neon start https://neon:tech/pipedpiper/northwind:main
neon status -c https://neon:tech/pipedpiper/northwind:main
neon commit -c https://neon:tech/pipedpiper/northwind:main
neon branch -c https://neon:tech/pipedpiper/northwind:<hash> experimental
zenith start https://zenith:tech/pipedpiper/northwind:main
zenith start https://zenith:tech/pipedpiper/northwind:main
zenith status -c https://zenith:tech/pipedpiper/northwind:main
zenith commit -c https://zenith:tech/pipedpiper/northwind:main
zenith branch -c https://zenith:tech/pipedpiper/northwind:<hash> experimental
```
Pushing data into the cloud
```bash
-- pull all the commits from the cloud
neon pull
zenith pull
-- push all the commits to the cloud
neon push
zenith push
```

View File

@@ -1,14 +1,14 @@
# Repository format
A Neon repository is similar to a traditional PostgreSQL backup
A Zenith repository is similar to a traditional PostgreSQL backup
archive, like a WAL-G bucket or pgbarman backup catalogue. It holds
multiple versions of a PostgreSQL database cluster.
The distinguishing feature is that you can launch a Neon Postgres
The distinguishing feature is that you can launch a Zenith Postgres
server directly against a branch in the repository, without having to
"restore" it first. Also, Neon manages the storage automatically,
"restore" it first. Also, Zenith manages the storage automatically,
there is no separation between full and incremental backups nor WAL
archive. Neon relies heavily on the WAL, and uses concepts similar
archive. Zenith relies heavily on the WAL, and uses concepts similar
to incremental backups and WAL archiving internally, but it is hidden
from the user.
@@ -19,15 +19,15 @@ efficient. Just something to get us started.
The repository directory looks like this:
.neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/
.neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots/<lsn>/
.neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history
.zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/
.zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots/<lsn>/
.zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history
.neon/refs/branches/mybranch
.neon/refs/tags/foo
.neon/refs/tags/bar
.zenith/refs/branches/mybranch
.zenith/refs/tags/foo
.zenith/refs/tags/bar
.neon/datadirs/<timeline uuid>
.zenith/datadirs/<timeline uuid>
### Timelines
@@ -39,7 +39,7 @@ All WAL is generated on a timeline. You can launch a read-only node
against a tag or arbitrary LSN on a timeline, but in order to write,
you need to create a timeline.
Each timeline is stored in a directory under .neon/timelines. It
Each timeline is stored in a directory under .zenith/timelines. It
consists of a WAL archive, containing all the WAL in the standard
PostgreSQL format, under the wal/ subdirectory.
@@ -66,18 +66,18 @@ contains the UUID of the timeline (and LSN, for tags).
### Datadirs
.neon/datadirs contains PostgreSQL data directories. You can launch
.zenith/datadirs contains PostgreSQL data directories. You can launch
a Postgres instance on one of them with:
```
postgres -D .neon/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c
postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c
```
All the actual data is kept in the timeline directories, under
.neon/timelines. The data directories are only needed for active
.zenith/timelines. The data directories are only needed for active
PostgreSQL instances. After an instance is stopped, the data directory
can be safely removed. "neon start" will recreate it quickly from
the data in .neon/timelines, if it's missing.
can be safely removed. "zenith start" will recreate it quickly from
the data in .zenith/timelines, if it's missing.
## Version 2
@@ -103,14 +103,14 @@ more advanced. The exact format is TODO. But it should support:
### Garbage collection
When you run "neon gc", old timelines that are no longer needed are
When you run "zenith gc", old timelines that are no longer needed are
removed. That involves collecting the list of "unreachable" objects,
starting from the named branches and tags.
Also, if enough WAL has been generated on a timeline since last
snapshot, a new snapshot or delta is created.
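A minimal sketch of the reachability pass described above, assuming illustrative `Timeline`/parent-pointer types (none of these names come from the actual repository format):

```rust
use std::collections::{HashMap, HashSet};

// Illustrative types only.
type TimelineId = String;

struct Timeline {
    parent: Option<TimelineId>, // ancestor timeline this one was branched from
}

/// Mark phase of the gc described above: walk from every branch/tag ref to the
/// root, collecting reachable timelines; everything else is a candidate for removal.
fn unreachable_timelines(
    timelines: &HashMap<TimelineId, Timeline>,
    refs: &[TimelineId], // timelines pointed to by refs/branches and refs/tags
) -> HashSet<TimelineId> {
    let mut reachable = HashSet::new();
    for start in refs {
        let mut cur = Some(start.clone());
        while let Some(id) = cur {
            if !reachable.insert(id.clone()) {
                break; // this ancestry chain was already visited
            }
            cur = timelines.get(&id).and_then(|t| t.parent.clone());
        }
    }
    timelines
        .keys()
        .filter(|id| !reachable.contains(*id))
        .cloned()
        .collect()
}
```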
### neon push/pull
### zenith push/pull
Compare the tags and branches on both servers, and copy missing ones.
For each branch, compare the timeline it points to in both servers. If
@@ -123,7 +123,7 @@ every time you start up an instance? Then you would detect that the
timelines have diverged. That would match with the "epoch" concept
that we have in the WAL safekeeper
### neon checkout/commit
### zenith checkout/commit
In this format, there is no concept of a "working tree", and hence no
concept of checking out or committing. All modifications are done on
@@ -134,7 +134,7 @@ You can easily fork off a temporary timeline to emulate a "working tree".
You can later remove it and have it garbage collected, or to "commit",
re-point the branch to the new timeline.
If we want to have a worktree and "neon checkout/commit" concept, we can
If we want to have a worktree and "zenith checkout/commit" concept, we can
emulate that with a temporary timeline. Create the temporary timeline at
"neon checkout", and have "neon commit" modify the branch to point to
"zenith checkout", and have "zenith commit" modify the branch to point to
the new timeline.

View File

@@ -4,27 +4,27 @@ How it works now
1. Create repository, start page server on it
```
$ neon init
$ zenith init
...
created main branch
new neon repository was created in .neon
new zenith repository was created in .zenith
$ neon pageserver start
Starting pageserver at '127.0.0.1:64000' in .neon
$ zenith pageserver start
Starting pageserver at '127.0.0.1:64000' in .zenith
Page server started
```
2. Create a branch, and start a Postgres instance on it
```
$ neon branch heikki main
$ zenith branch heikki main
branching at end of WAL: 0/15ECF68
$ neon pg create heikki
$ zenith pg create heikki
Initializing Postgres on timeline 76cf9279915be7797095241638e64644...
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/pg1 port=55432
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432
$ neon pg start pg1
$ zenith pg start pg1
Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki'
waiting for server to start.... done
server started
@@ -52,20 +52,20 @@ serverless on your laptop, so that the workflow becomes just:
1. Create repository, start page server on it (same as before)
```
$ neon init
$ zenith init
...
created main branch
new neon repository was created in .neon
new zenith repository was created in .zenith
$ neon pageserver start
Starting pageserver at '127.0.0.1:64000' in .neon
$ zenith pageserver start
Starting pageserver at '127.0.0.1:64000' in .zenith
Page server started
```
2. Create branch
```
$ neon branch heikki main
$ zenith branch heikki main
branching at end of WAL: 0/15ECF68
```

View File

@@ -7,22 +7,22 @@ Here is a proposal about implementing push/pull mechanics between pageservers. W
The origin represents connection info for some remote pageserver. Let's use the same commands here as git uses, except with an explicit list subcommand (git uses `origin -v` for that).
```
neon origin add <name> <connection_uri>
neon origin list
neon origin remove <name>
zenith origin add <name> <connection_uri>
zenith origin list
zenith origin remove <name>
```
Connection URI is a string of the form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs, require ssh as transport, or invent some other kind of transport.
Behind the scenes, these commands may update a toml file inside the .neon directory.
Behind the scenes, these commands may update a toml file inside the .zenith directory.
## Push
### Pushing branch
```
neon push mybranch cloudserver # push to eponymous branch in cloudserver
neon push mybranch cloudserver:otherbranch # push to a different branch in cloudserver
zenith push mybranch cloudserver # push to eponymous branch in cloudserver
zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver
```
Exact mechanics would be slightly different in the following situations:

View File

@@ -2,7 +2,7 @@ While working on export/import commands, I understood that they fit really well
We may think about backups as snapshots in a different format (i.e. plain pgdata format, basebackup tar format, WAL-G format (if they want to support it), and so on). They use the same storage API; the only difference is the code that packs/unpacks files.
Even if neon aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to neon.
Even if zenith aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to zenith.
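To sketch the "same storage API, different pack/unpack code" idea (the trait and type names here are illustrative, not taken from the codebase):

```rust
use std::path::Path;

/// Illustrative trait: each backup/snapshot format shares the same storage API
/// and differs only in how files are packed and unpacked.
trait SnapshotFormat {
    /// Pack a local data directory into this format's on-storage representation.
    fn pack(&self, datadir: &Path, dest: &Path) -> anyhow::Result<()>;
    /// Unpack an on-storage snapshot back into a usable data directory.
    fn unpack(&self, src: &Path, datadir: &Path) -> anyhow::Result<()>;
}

// Possible implementors, matching the formats listed above:
// plain $PGDATA layout, basebackup tar stream, WAL-G, the storage engine's own format.
struct PlainPgdata;
struct BasebackupTar;
```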
So here is an attempt to design consistent CLI for different usage scenarios:
@@ -16,8 +16,8 @@ Save`storage_dest` and other parameters in config.
Push snapshots to `storage_dest` in background.
```
neon init --storage_dest=S3_PREFIX
neon start
zenith init --storage_dest=S3_PREFIX
zenith start
```
#### 2. Restart pageserver (manually or crash-recovery).
@@ -25,7 +25,7 @@ Take `storage_dest` from pageserver config, start pageserver from latest snapsho
Push snapshots to `storage_dest` in background.
```
neon start
zenith start
```
#### 3. Import.
@@ -35,22 +35,22 @@ Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time
Save `storage_dest` parameters in config.
Push snapshots to `storage_dest` in background.
```
//I.e. we want to start neon on top of existing $PGDATA and use s3 as a persistent storage.
neon init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX
neon start
//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage.
zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX
zenith start
```
How to pass credentials needed for `snapshot_path`?
#### 4. Export.
Manually push snapshot to `snapshot_path` which differs from `storage_dest`
Optionally set `snapshot_format`, which can be plain pgdata format or neon format.
Optionally set `snapshot_format`, which can be plain pgdata format or zenith format.
```
neon export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
```
#### Notes and questions
- safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI?
- Why do we need `neon init` as a separate command? Can't we init everything at first start?
- Why do we need `zenith init` as a separate command? Can't we init everything at first start?
- We can think of better names for all options.
- Export to plain postgres format will be useless, if we are not 100% compatible on page level.
I can recall at least one such difference - PD_WAL_LOGGED flag in pages.

View File

@@ -9,7 +9,7 @@ receival and this might lag behind `term`; safekeeper switches to epoch `n` when
it has received all committed log records from all `< n` terms. This roughly
corresponds to the approach proposed in
https://github.com/neondatabase/rfcs/pull/3/files
https://github.com/zenithdb/rfcs/pull/3/files
This is our biggest difference from Raft. In Raft, every log record is
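A minimal sketch of that switching rule, using hypothetical types that are not taken from the safekeeper code:
```
/// Hypothetical acceptor state: `epoch` trails `term` until this safekeeper has
/// received every committed record from all terms < `term`.
struct AcceptorState {
    term: u64,
    epoch: u64,
    flushed_lsn: u64,
}

fn maybe_bump_epoch(state: &mut AcceptorState, committed_end_of_older_terms: u64) {
    // `committed_end_of_older_terms` is the LSN up to which records were
    // committed in all terms < state.term (an assumed input, for illustration).
    if state.epoch < state.term && state.flushed_lsn >= committed_end_of_older_terms {
        state.epoch = state.term;
    }
}
```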

View File

@@ -1,6 +1,6 @@
# Safekeeper gossip
Extracted from this [PR](https://github.com/neondatabase/rfcs/pull/13)
Extracted from this [PR](https://github.com/zenithdb/rfcs/pull/13)
## Motivation

View File

@@ -2,7 +2,7 @@
Created on 19.01.22
Initially created [here](https://github.com/neondatabase/rfcs/pull/16) by @kelvich.
Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich.
It is an alternative to 014-safekeeper-gossip.
@@ -292,4 +292,4 @@ But with an etcd we are in a bit different situation:
1. We don't need persistence and strong consistency guarantees for the data we store in etcd
2. etcd uses gRPC as a protocol, and the messages are pretty simple
So it looks like implementing an in-memory store with an etcd interface is straightforward _if we ever want that in the future_. At the same time, we can avoid implementing it right now, and we will be able to run a local neon installation with etcd running somewhere in the background (as opposed to building and running the console, which in turn requires Postgres).
So it looks like implementing an in-memory store with an etcd interface is straightforward _if we ever want that in the future_. At the same time, we can avoid implementing it right now, and we will be able to run a local zenith installation with etcd running somewhere in the background (as opposed to building and running the console, which in turn requires Postgres).
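As a hedged illustration of why an in-memory stand-in seems feasible, this is roughly the narrow, etcd-like surface such a store would have to offer. Method names are assumptions and do not mirror the real etcd gRPC API:
```
/// Illustrative only: a minimal key-value surface that either etcd or an
/// in-memory implementation could provide to the rest of the system.
trait BrokerStore {
    fn put(&self, key: &str, value: &[u8]) -> anyhow::Result<()>;
    fn get_prefix(&self, prefix: &str) -> anyhow::Result<Vec<(String, Vec<u8>)>>;
    fn watch_prefix(&self, prefix: &str) -> anyhow::Result<std::sync::mpsc::Receiver<(String, Vec<u8>)>>;
}
```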

View File

@@ -79,12 +79,6 @@ pub struct ComputeSpec {
// Stripe size for pageserver sharding, in pages
#[serde(default)]
pub shard_stripe_size: Option<usize>,
// When we are starting a new replica in hot standby mode,
// we need to know if the primary is running.
// This is used to determine if replica should wait for
// RUNNING_XACTS from primary or not.
pub primary_is_running: Option<bool>,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.

View File

@@ -18,7 +18,6 @@ enum-map.workspace = true
strum.workspace = true
strum_macros.workspace = true
hex.workspace = true
humantime.workspace = true
thiserror.workspace = true
humantime-serde.workspace = true
chrono.workspace = true

View File

@@ -1,129 +0,0 @@
use std::str::FromStr;
/// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server
/// in [`attachment_service::http`]
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
use crate::{models::ShardParameters, shard::TenantShardId};
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub generation: u32,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponse {
pub shards: Vec<TenantCreateResponseShard>,
}
#[derive(Serialize, Deserialize)]
pub struct NodeRegisterRequest {
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct NodeConfigureRequest {
pub node_id: NodeId,
pub availability: Option<NodeAvailability>,
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantLocateResponse {
pub shards: Vec<TenantLocateResponseShard>,
pub shard_params: ShardParameters,
}
/// Explicitly migrating a particular shard is a low level operation
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeAvailability {
// Normal, happy state
Active,
// Offline: Tenants shouldn't try to attach here, but they may assume that their
// secondary locations on this node still exist. Newly added nodes are in this
// state until we successfully contact them.
Offline,
}
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
/// type needs to be defined with diesel traits in there.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeSchedulingPolicy {
Active,
Filling,
Pause,
Draining,
}
impl FromStr for NodeSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"filling" => Ok(Self::Filling),
"pause" => Ok(Self::Pause),
"draining" => Ok(Self::Draining),
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
}
}
}
impl From<NodeSchedulingPolicy> for String {
fn from(value: NodeSchedulingPolicy) -> String {
use NodeSchedulingPolicy::*;
match value {
Active => "active",
Filling => "filling",
Pause => "pause",
Draining => "draining",
}
.to_string()
}
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
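// Illustrative round-trip check for the string conversions above (not part of
// this diff). `NodeSchedulingPolicy` derives PartialEq but not Debug, so we use
// `matches!` rather than `assert_eq!` on the enum itself.
#[cfg(test)]
mod conversion_sketch_tests {
    use super::*;
    use std::str::FromStr;

    #[test]
    fn scheduling_policy_round_trips() {
        let s: String = NodeSchedulingPolicy::Filling.into();
        assert_eq!(s, "filling");
        assert!(matches!(
            NodeSchedulingPolicy::from_str(&s),
            Ok(NodeSchedulingPolicy::Filling)
        ));
    }
}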

View File

@@ -307,7 +307,6 @@ impl KeySpaceRandomAccum {
}
}
#[inline(always)]
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
let start = key_range.start;
let end = key_range.end;

View File

@@ -2,14 +2,13 @@
#![deny(clippy::undocumented_unsafe_blocks)]
use const_format::formatcp;
pub mod controller_api;
/// Public API types
pub mod control_api;
pub mod key;
pub mod keyspace;
pub mod models;
pub mod reltag;
pub mod shard;
/// Public API types
pub mod upcall_api;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");

View File

@@ -1,7 +1,4 @@
pub mod partitioning;
pub mod utilization;
pub use utilization::PageserverUtilization;
use std::{
collections::HashMap,
@@ -14,6 +11,7 @@ use byteorder::{BigEndian, ReadBytesExt};
use postgres_ffi::BLCKSZ;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use strum_macros;
use utils::{
completion,
history_buffer::HistoryBufferWithDropCounter,
@@ -271,8 +269,6 @@ pub struct TenantConfig {
pub compaction_target_size: Option<u64>,
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
// defer parsing compaction_algorithm, like eviction_policy
pub compaction_algorithm: Option<CompactionAlgorithm>,
pub gc_horizon: Option<u64>,
pub gc_period: Option<String>,
pub image_creation_threshold: Option<usize>,
@@ -284,6 +280,7 @@ pub struct TenantConfig {
pub eviction_policy: Option<EvictionPolicy>,
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
@@ -307,13 +304,6 @@ impl EvictionPolicy {
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum CompactionAlgorithm {
Legacy,
Tiered,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
#[serde(with = "humantime_serde")]
@@ -347,7 +337,7 @@ impl ThrottleConfig {
}
/// The requests per second allowed by the given config.
pub fn steady_rps(&self) -> f64 {
(self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
(self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) / 1e3
}
}
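(For scale: under the left-hand formula, `refill_amount = 100` with `refill_interval = 500ms` gives 100 / 0.5 = 200 requests per second; the variant with the extra `/ 1e3` divisor would report 0.2 for the same config.)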
@@ -435,8 +425,6 @@ pub struct TenantShardLocation {
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigResponse {
pub shards: Vec<TenantShardLocation>,
// If the tenant's shard count is > 1, stripe_size will be set.
pub stripe_size: Option<ShardStripeSize>,
}
#[derive(Serialize, Deserialize, Debug)]
@@ -1078,6 +1066,7 @@ impl PagestreamBeMessage {
#[cfg(test)]
mod tests {
use bytes::Buf;
use serde_json::json;
use super::*;

View File

@@ -1,70 +0,0 @@
use std::time::SystemTime;
/// Current pageserver utilization and a score for how good a candidate the pageserver would be for
/// the next tenant.
///
/// See and maintain the pageserver openapi spec for `/v1/utilization_score` as the source of truth.
///
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
/// not handle full u64 values properly.
#[derive(serde::Serialize, Debug)]
pub struct PageserverUtilization {
/// Used disk space
#[serde(serialize_with = "ser_saturating_u63")]
pub disk_usage_bytes: u64,
/// Free disk space
#[serde(serialize_with = "ser_saturating_u63")]
pub free_space_bytes: u64,
/// Lower is better: a score for how good a candidate this pageserver would be for the next tenant.
#[serde(serialize_with = "ser_saturating_u63")]
pub utilization_score: u64,
/// When was this snapshot captured, pageserver local time.
///
/// Use millis to give confidence that the value is regenerated often enough.
#[serde(serialize_with = "ser_rfc3339_millis")]
pub captured_at: SystemTime,
}
fn ser_rfc3339_millis<S: serde::Serializer>(
ts: &SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
///
/// Instead of a newtype, use this, because a newtype would require handling deserialization of values
/// with the highest bit set (which serde formats parse just fine), and that would create a
/// conundrum on how to handle and re-serialize such values at the type level. It will be a few
/// years until we can use more than `i64::MAX` bytes on a disk.
fn ser_saturating_u63<S: serde::Serializer>(value: &u64, serializer: S) -> Result<S::Ok, S::Error> {
const MAX_FORMAT_INT64: u64 = i64::MAX as u64;
let value = (*value).min(MAX_FORMAT_INT64);
serializer.serialize_u64(value)
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use super::*;
#[test]
fn u64_max_is_serialized_as_u63_max() {
let doc = PageserverUtilization {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
utilization_score: u64::MAX,
captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
};
let s = serde_json::to_string(&doc).unwrap();
let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
assert_eq!(s, expected);
}
}

View File

@@ -6,6 +6,7 @@ use crate::{
};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use thiserror;
use utils::id::TenantId;
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
@@ -655,7 +656,10 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
#[cfg(test)]
mod tests {
use utils::Hex;
use std::str::FromStr;
use bincode;
use utils::{id::TenantId, Hex};
use super::*;

View File

@@ -6,6 +6,7 @@
#![deny(clippy::undocumented_unsafe_blocks)]
use anyhow::Context;
use bytes::Bytes;
use futures::pin_mut;
use serde::{Deserialize, Serialize};
use std::io::ErrorKind;
use std::net::SocketAddr;
@@ -377,7 +378,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
&mut self,
cx: &mut std::task::Context<'_>,
) -> Poll<Result<(), std::io::Error>> {
let flush_fut = std::pin::pin!(self.flush());
let flush_fut = self.flush();
pin_mut!(flush_fut);
flush_fut.poll(cx)
}

View File

@@ -80,9 +80,6 @@ pub const XLOG_XACT_ABORT: u8 = 0x20;
pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
// From standbydefs.h
pub const XLOG_RUNNING_XACTS: u8 = 0x10;
// From slru.h
pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize;

View File

@@ -119,6 +119,11 @@ pub fn generate_pg_control(
// Generate new pg_control needed for bootstrap
checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;
//reset some fields we don't want to preserve
//TODO Check this.
//We may need to determine the value from twophase data.
checkpoint.oldestActiveXid = 0;
//save new values in pg_control
pg_control.checkPoint = 0;
pg_control.checkPointCopy = checkpoint;

View File

@@ -623,7 +623,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
mod fs_tests {
use super::*;
use bytes::Bytes;
use camino_tempfile::tempdir;
use futures_util::Stream;
use std::{collections::HashMap, io::Write};
async fn read_and_check_metadata(

View File

@@ -1040,7 +1040,7 @@ mod tests {
Some("test/prefix/"),
Some("/test/prefix/"),
];
let expected_outputs = [
let expected_outputs = vec![
vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path"],
vec![

View File

@@ -1,6 +1,7 @@
// For details about authentication see docs/authentication.md
use arc_swap::ArcSwap;
use serde;
use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
use anyhow::Result;
@@ -31,8 +32,6 @@ pub enum Scope {
// The scope used by pageservers in upcalls to storage controller and cloud control plane
#[serde(rename = "generations_api")]
GenerationsApi,
// Allows access to the control plane management API and some storage controller endpoints.
Admin,
}
/// JWT payload. See docs/authentication.md for the format
@@ -205,11 +204,12 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
// "scope": "tenant",
// "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
// "iss": "neon.controlplane",
// "exp": 1709200879,
// "iat": 1678442479
// }
// ```
//
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw";
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
// Check it can be validated with the public key
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);

View File

@@ -4,9 +4,7 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
///
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion {
_token: TaskTrackerToken,
}
pub struct Completion(TaskTrackerToken);
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
@@ -51,5 +49,5 @@ pub fn channel() -> (Completion, Barrier) {
tracker.close();
let token = tracker.token();
(Completion { _token: token }, Barrier(tracker))
(Completion(token), Barrier(tracker))
}

View File

@@ -1,7 +1,7 @@
use std::{
borrow::Cow,
fs::{self, File},
io::{self, Write},
io,
};
use camino::{Utf8Path, Utf8PathBuf};
@@ -161,48 +161,6 @@ pub async fn durable_rename(
Ok(())
}
/// Writes a file to the specified `final_path` in a crash-safe fashion, using [`std::fs`].
///
/// The file is first written to the specified `tmp_path`, and in a second
/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync
/// and atomic rename guarantee that, if we crash at any point, there will never
/// be a partially written file at `final_path` (but maybe at `tmp_path`).
///
/// Callers are responsible for serializing calls of this function for a given `final_path`.
/// If they don't, there may be an error due to conflicting `tmp_path`, or there will
/// be no error and the content of `final_path` will be the "winner" caller's `content`.
/// I.e., the atomicity guarantees still hold.
pub fn overwrite(
final_path: &Utf8Path,
tmp_path: &Utf8Path,
content: &[u8],
) -> std::io::Result<()> {
let Some(final_path_parent) = final_path.parent() else {
return Err(std::io::Error::from_raw_os_error(
nix::errno::Errno::EINVAL as i32,
));
};
std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
let mut file = std::fs::OpenOptions::new()
.write(true)
// Use `create_new` so that, if we race with ourselves or something else,
// we bail out instead of causing damage.
.create_new(true)
.open(tmp_path)?;
file.write_all(content)?;
file.sync_all()?;
drop(file); // don't keep the fd open for longer than we have to
std::fs::rename(tmp_path, final_path)?;
let final_parent_dirfd = std::fs::OpenOptions::new()
.read(true)
.open(final_path_parent)?;
final_parent_dirfd.sync_all()?;
Ok(())
}
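// Illustrative caller of the crash-safe `overwrite` helper above (not part of
// this diff); paths are hypothetical, and the caller must ensure no concurrent
// call targets the same `final_path`.
fn persist_marker_file(dir: &Utf8Path, content: &[u8]) -> std::io::Result<()> {
    let final_path = dir.join("marker");
    let tmp_path = dir.join("marker.___temp");
    overwrite(&final_path, &tmp_path, content)
}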
#[cfg(test)]
mod tests {

View File

@@ -45,7 +45,7 @@ impl Generation {
Self::Broken
}
pub const fn new(v: u32) -> Self {
pub fn new(v: u32) -> Self {
Self::Valid(v)
}

View File

@@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tracing::{debug, info, info_span, warn, Instrument};
use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future;
use std::str::FromStr;
@@ -156,10 +156,6 @@ pub struct ChannelWriter {
buffer: BytesMut,
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
/// Time spent waiting for the channel to make progress. It is not the same as time to upload a
/// buffer because we cannot know anything about that, but this should allow us to understand
/// the actual time taken without the time spent `std::thread::park`ed.
wait_time: std::time::Duration,
}
impl ChannelWriter {
@@ -172,7 +168,6 @@ impl ChannelWriter {
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
wait_time: std::time::Duration::ZERO,
}
}
@@ -185,8 +180,6 @@ impl ChannelWriter {
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
let wait_started_at = std::time::Instant::now();
// not ideal to call from blocking code to block_on, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
@@ -199,9 +192,6 @@ impl ChannelWriter {
// sending it to the client.
Ok(())
});
self.wait_time += wait_started_at.elapsed();
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
@@ -212,10 +202,6 @@ impl ChannelWriter {
pub fn flushed_bytes(&self) -> usize {
self.written
}
pub fn wait_time(&self) -> std::time::Duration {
self.wait_time
}
}
impl std::io::Write for ChannelWriter {
@@ -266,52 +252,22 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
let span = info_span!("blocking");
tokio::task::spawn_blocking(move || {
// there are situations where we lose scraped metrics under load, try to gather some clues
// since all nodes are queried for this, keep the message count low.
let spawned_at = std::time::Instant::now();
let _span = span.entered();
let metrics = metrics::gather();
let gathered_at = std::time::Instant::now();
let res = encoder
.encode(&metrics, &mut writer)
.and_then(|_| writer.flush().map_err(|e| e.into()));
// this instant is not when we finally got the full response sent, sending is done by hyper
// in another task.
let encoded_at = std::time::Instant::now();
let spawned_in = spawned_at - started_at;
let collected_in = gathered_at - spawned_at;
// remove the wait time here in case the tcp connection was clogged
let encoded_in = encoded_at - gathered_at - writer.wait_time();
let total = encoded_at - started_at;
match res {
Ok(()) => {
tracing::info!(
bytes = writer.flushed_bytes(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
elapsed_ms = started_at.elapsed().as_millis(),
"responded /metrics"
);
}
Err(e) => {
// there is a chance that this error is not the BrokenPipe we generate in the writer
// for "closed connection", but it is highly unlikely.
tracing::warn!(
after_bytes = writer.flushed_bytes(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
"failed to write out /metrics response: {e:?}"
);
tracing::warn!("failed to write out /metrics response: {e:#}");
// semantics of this error are quite... unclear. we want to error the stream out to
// abort the response to somehow notify the client that we failed.
//

View File

@@ -415,6 +415,7 @@ mod tests {
use super::*;
use serde::ser::Serialize;
use serde_assert::{Deserializer, Serializer, Token, Tokens};
#[test]

View File

@@ -1,6 +1,6 @@
#![warn(missing_docs)]
use std::cmp::{Eq, Ordering};
use std::cmp::{Eq, Ordering, PartialOrd};
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::mem;
@@ -249,6 +249,7 @@ where
mod tests {
use super::*;
use std::sync::Arc;
use std::time::Duration;
impl MonotonicCounter<i32> for i32 {
fn cnt_advance(&mut self, val: i32) {

View File

@@ -221,7 +221,7 @@ impl RcuWaitList {
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Mutex;
use std::sync::{Arc, Mutex};
use std::time::Duration;
#[tokio::test]

View File

@@ -239,6 +239,7 @@ mod tests {
use std::{
convert::Infallible,
pin::{pin, Pin},
sync::atomic::{AtomicUsize, Ordering},
time::Duration,
};

View File

@@ -73,7 +73,6 @@ url.workspace = true
walkdir.workspace = true
metrics.workspace = true
pageserver_api.workspace = true
pageserver_compaction.workspace = true
postgres_connection.workspace = true
postgres_ffi.workspace = true
pq_proto.workspace = true

View File

@@ -251,30 +251,21 @@ impl Client {
tenant_shard_id: TenantShardId,
config: LocationConfig,
flush_ms: Option<std::time::Duration>,
lazy: bool,
) -> Result<()> {
let req_body = TenantLocationConfigRequest {
tenant_id: tenant_shard_id,
config,
};
let mut path = reqwest::Url::parse(&format!(
let path = format!(
"{}/v1/tenant/{}/location_config",
self.mgmt_api_endpoint, tenant_shard_id
))
// Should always work: mgmt_api_endpoint is configuration, not user input.
.expect("Cannot build URL");
if lazy {
path.query_pairs_mut().append_pair("lazy", "true");
}
if let Some(flush_ms) = flush_ms {
path.query_pairs_mut()
.append_pair("flush_ms", &format!("{}", flush_ms.as_millis()));
}
self.request(Method::PUT, path, &req_body).await?;
);
let path = if let Some(flush_ms) = flush_ms {
format!("{}?flush_ms={}", path, flush_ms.as_millis())
} else {
path
};
self.request(Method::PUT, &path, &req_body).await?;
Ok(())
}

View File

@@ -1,54 +0,0 @@
[package]
name = "pageserver_compaction"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
[dependencies]
anyhow.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true
bytes.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
const_format.workspace = true
consumption_metrics.workspace = true
crossbeam-utils.workspace = true
either.workspace = true
flate2.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
itertools.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pin-project-lite.workspace = true
rand.workspace = true
smallvec = { workspace = true, features = ["write"] }
svg_fmt.workspace = true
sync_wrapper.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-io-timeout.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true
utils.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
criterion.workspace = true
hex-literal.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

View File

@@ -1,51 +0,0 @@
# TODO
- If the key space can be perfectly partitioned at some key, perform planning on each
partition separately. For example, if we are compacting a level with layers like this:
```
:
+--+ +----+ : +------+
| | | | : | |
+--+ +----+ : +------+
:
+-----+ +-+ : +--------+
| | | | : | |
+-----+ +-+ : +--------+
:
```
At the dotted line, there is a natural split in the key space, such that all
layers are either on the left or the right of it. We can compact the
partitions separately. We could choose to create image layers for one
partition but not the other one, for example.
- All the layers don't have to be exactly the same size, we can choose to cut a
layer short or stretch it a little larger than the target size, if it helps
the overall system. We can help perfect partitions (see previous bullet point)
to happen more frequently, by choosing the cut points wisely. For example, try
to cut layers at boundaries of underlying image layers. And "snap to grid",
i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0.
- Avoid rewriting layers when we'd just create an identical layer to an input
layer.
- Parallelism. The code is already split up into planning and execution, so that
we first split up the compaction work into "Jobs", and then execute them.
It would be straightforward to execute multiple jobs in parallel.
- Materialize extra pages in delta layers during compaction. This would reduce
read amplification. There has been the idea of partial image layers. Materializing
extra pages in the delta layers achieves the same goal, without introducing a new
concept.
## Simulator
- Expand the simulator for more workloads
- Automate a test suite that runs the simulator with different workloads and
spits out a table of results
- Model read amplification
- More sanity checking. One idea is to keep a reference count of each
MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if
a MockRecord that is newer than PITR horizon is completely dropped. That would
indicate that the record was lost.

View File

@@ -1,214 +0,0 @@
use clap::{Parser, Subcommand};
use pageserver_compaction::simulator::MockTimeline;
use rand::Rng;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use utils::project_git_version;
project_git_version!(GIT_VERSION);
#[derive(Parser)]
#[command(
version = GIT_VERSION,
about = "Neon Pageserver compaction simulator",
long_about = "A developer tool to visualize and test compaction"
)]
#[command(propagate_version = true)]
struct CliOpts {
#[command(subcommand)]
command: Commands,
}
#[derive(Subcommand)]
enum Commands {
RunSuite,
Simulate(SimulateCmd),
}
#[derive(Clone, clap::ValueEnum)]
enum Distribution {
Uniform,
HotCold,
}
/// Run a simulated compaction workload
#[derive(Parser)]
struct SimulateCmd {
distribution: Distribution,
/// Number of records to digest
num_records: u64,
/// Record length
record_len: u64,
// Logical database size in MB
logical_size: u64,
}
async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> {
let mut executor = MockTimeline::new();
// Convert the logical size in MB into a key range.
let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
//let key_range = u64::MIN..u64::MAX;
println!(
"starting simulation with key range {:016X}-{:016X}",
key_range.start, key_range.end
);
// helper function to print progress indicator
let print_progress = |i| -> anyhow::Result<()> {
if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 {
print!(
"\ringested {} / {} records, {} MiB / {} MiB...",
i + 1,
cmd.num_records,
(i + 1) * cmd.record_len / (1_000_000),
cmd.num_records * cmd.record_len / (1_000_000),
);
std::io::stdout().flush()?;
}
Ok(())
};
match cmd.distribution {
Distribution::Uniform => {
for i in 0..cmd.num_records {
executor.ingest_uniform(1, cmd.record_len, &key_range)?;
executor.compact_if_needed().await?;
print_progress(i)?;
}
}
Distribution::HotCold => {
let splitpoint = key_range.start + (key_range.end - key_range.start) / 10;
let hot_key_range = 0..splitpoint;
let cold_key_range = splitpoint..key_range.end;
for i in 0..cmd.num_records {
let chosen_range = if rand::thread_rng().gen_bool(0.9) {
&hot_key_range
} else {
&cold_key_range
};
executor.ingest_uniform(1, cmd.record_len, chosen_range)?;
executor.compact_if_needed().await?;
print_progress(i)?;
}
}
}
println!("done!");
executor.flush_l0();
executor.compact_if_needed().await?;
let stats = executor.stats()?;
// Print the stats to stdout, and also to a file
print!("{stats}");
std::fs::write(results_path.join("stats.txt"), stats)?;
let animation_path = results_path.join("compaction-animation.html");
executor.draw_history(std::fs::File::create(&animation_path)?)?;
println!(
"animation: file://{}",
animation_path.canonicalize()?.display()
);
Ok(())
}
async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> {
std::fs::create_dir(results_path)?;
set_log_file(File::create(results_path.join("log"))?);
let result = simulate(workload, results_path).await;
set_log_stdout();
result
}
async fn run_suite() -> anyhow::Result<()> {
let top_results_path = PathBuf::from(format!(
"compaction-suite-results.{}",
std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs()
));
std::fs::create_dir(&top_results_path)?;
let workload = SimulateCmd {
distribution: Distribution::Uniform,
// Generate 20 GB of WAL
record_len: 1_000,
num_records: 20_000_000,
// Logical size 5 GB
logical_size: 5_000,
};
run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?;
println!(
"All tests finished. Results in {}",
top_results_path.display()
);
Ok(())
}
use std::fs::File;
use std::io::Stdout;
use std::sync::Mutex;
use tracing_subscriber::fmt::writer::EitherWriter;
use tracing_subscriber::fmt::MakeWriter;
static LOG_FILE: OnceLock<Mutex<EitherWriter<File, Stdout>>> = OnceLock::new();
fn get_log_output() -> &'static Mutex<EitherWriter<File, Stdout>> {
LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout())))
}
fn set_log_file(f: File) {
*get_log_output().lock().unwrap() = EitherWriter::A(f);
}
fn set_log_stdout() {
*get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout());
}
fn init_logging() -> anyhow::Result<()> {
// We fall back to printing all spans at info-level or above if
// the RUST_LOG environment variable is not set.
let rust_log_env_filter = || {
tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
};
// NB: the order of the with() calls does not matter.
// See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
use tracing_subscriber::prelude::*;
tracing_subscriber::registry()
.with({
let log_layer = tracing_subscriber::fmt::layer()
.with_target(false)
.with_ansi(false)
.with_writer(|| get_log_output().make_writer());
log_layer.with_filter(rust_log_env_filter())
})
.init();
Ok(())
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = CliOpts::parse();
init_logging()?;
match cli.command {
Commands::Simulate(cmd) => {
simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?;
}
Commands::RunSuite => {
run_suite().await?;
}
};
Ok(())
}

View File

@@ -1,857 +0,0 @@
//! # Tiered compaction algorithm.
//!
//! Read all the input delta files, and write a new set of delta files that
//! include all the input WAL records. See retile_deltas().
//!
//! In a "normal" LSM tree, you get to remove any values that are overwritten by
//! later values, but in our system, we keep all the history. So the reshuffling
//! doesn't remove any garbage, it just reshuffles the records to reduce read
//! amplification, i.e. the number of files that you need to access to find the
//! WAL records for a given key.
//!
//! If the new delta files would be very "narrow", i.e. each file would cover
//! only a narrow key range, then we create a new set of image files
//! instead. The current threshold is that if the estimated total size of the
//! image layers is smaller than the size of the deltas, then we create image
//! layers. That amounts to 2x storage amplification, and it means that the
//! distance of image layers in LSN dimension is roughly equal to the logical
//! database size. For example, if the logical database size is 10 GB, we would
//! generate new image layers every 10 GB of WAL.
use futures::StreamExt;
use tracing::{debug, info};
use std::collections::{HashSet, VecDeque};
use std::ops::Range;
use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
use crate::interface::*;
use utils::lsn::Lsn;
use crate::identify_levels::identify_level;
/// Main entry point to compaction.
///
/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on
/// everything below that point, that needs compaction. The cutoff LSN must
/// partition the layers so that there are no layers that span across that
/// LSN. To start compaction at the top of the tree, pass the end LSN of the
/// written last L0 layer.
pub async fn compact_tiered<E: CompactionJobExecutor>(
executor: &mut E,
end_lsn: Lsn,
target_file_size: u64,
fanout: u64,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
assert!(fanout >= 2);
// Start at L0
let mut current_level_no = 0;
let mut current_level_target_height = target_file_size;
loop {
// end LSN +1 to include possible image layers exactly at 'end_lsn'.
let all_layers = executor
.get_layers(
&(E::Key::MIN..E::Key::MAX),
&(Lsn(u64::MIN)..end_lsn + 1),
ctx,
)
.await?;
info!(
"Compacting L{}, total # of layers: {}",
current_level_no,
all_layers.len()
);
// Identify the range of LSNs that belong to this level. We assume that
// each file in this level spans an LSN range up to 1.75x target file
// size. That should give us enough slop that if we created a slightly
// oversized L0 layer, e.g. because flushing the in-memory layer was
// delayed for some reason, we don't consider the oversized layer to
// belong to L1. But not too much slop, so that we don't accidentally
// "skip" levels.
let max_height = (current_level_target_height as f64 * 1.75) as u64;
let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else {
break;
};
// Calculate the height of this level. If the # of tiers exceeds the
// fanout parameter, it's time to compact it.
let depth = level.depth();
info!(
"Level {} identified as LSN range {}-{}: depth {}",
current_level_no, level.lsn_range.start, level.lsn_range.end, depth
);
for l in &level.layers {
debug!("LEVEL {} layer: {}", current_level_no, l.short_id());
}
if depth < fanout {
debug!(
level = current_level_no,
depth = depth,
fanout,
"too few deltas to compact"
);
break;
}
compact_level(
&level.lsn_range,
&level.layers,
executor,
target_file_size,
ctx,
)
.await?;
if target_file_size == u64::MAX {
break;
}
current_level_no += 1;
current_level_target_height = current_level_target_height.saturating_mul(fanout);
}
Ok(())
}
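// Hypothetical call site for the entry point above (illustrative values only):
// compact everything below the end of the last flushed L0 layer, targeting
// 128 MiB files with a fanout of 4.
async fn run_tiered_compaction<E: CompactionJobExecutor>(
    executor: &mut E,
    last_l0_end_lsn: Lsn,
    ctx: &E::RequestContext,
) -> anyhow::Result<()> {
    compact_tiered(executor, last_l0_end_lsn, 128 * 1024 * 1024, 4, ctx).await
}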
async fn compact_level<E: CompactionJobExecutor>(
lsn_range: &Range<Lsn>,
layers: &[E::Layer],
executor: &mut E,
target_file_size: u64,
ctx: &E::RequestContext,
) -> anyhow::Result<bool> {
let mut layer_fragments = Vec::new();
for l in layers {
layer_fragments.push(LayerFragment::new(l.clone()));
}
let mut state = LevelCompactionState {
target_file_size,
_lsn_range: lsn_range.clone(),
layers: layer_fragments,
jobs: Vec::new(),
job_queue: Vec::new(),
next_level: false,
executor,
};
let first_job = CompactionJob {
key_range: E::Key::MIN..E::Key::MAX,
lsn_range: lsn_range.clone(),
strategy: CompactionStrategy::Divide,
input_layers: state
.layers
.iter()
.enumerate()
.map(|i| LayerId(i.0))
.collect(),
completed: false,
};
state.jobs.push(first_job);
state.job_queue.push(JobId(0));
state.execute(ctx).await?;
info!(
"compaction completed! Need to process next level: {}",
state.next_level
);
Ok(state.next_level)
}
/// Blackboard that keeps track of the state of all the jobs and work remaining
struct LevelCompactionState<'a, E>
where
E: CompactionJobExecutor,
{
// parameters
target_file_size: u64,
_lsn_range: Range<Lsn>,
layers: Vec<LayerFragment<E>>,
// job queue
jobs: Vec<CompactionJob<E>>,
job_queue: Vec<JobId>,
/// If false, no need to compact levels below this
next_level: bool,
/// Interface to the outside world
executor: &'a mut E,
}
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
struct LayerId(usize);
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
struct JobId(usize);
struct PendingJobSet {
pending: HashSet<JobId>,
completed: HashSet<JobId>,
}
impl PendingJobSet {
fn new() -> Self {
PendingJobSet {
pending: HashSet::new(),
completed: HashSet::new(),
}
}
fn complete_job(&mut self, job_id: JobId) {
self.pending.remove(&job_id);
self.completed.insert(job_id);
}
fn all_completed(&self) -> bool {
self.pending.is_empty()
}
}
// When we decide to rewrite a set of layers, LayerFragment is used to keep
// track which new layers supersede an old layer. When all the stakeholder jobs
// have completed, this layer can be deleted.
struct LayerFragment<E>
where
E: CompactionJobExecutor,
{
layer: E::Layer,
// If we will write new layers to replace this one, this keeps track of the
// jobs that need to complete before this layer can be deleted. As the jobs
// complete, they are moved from 'pending' to 'completed' set. Once the
// 'pending' set becomes empty, the layer can be deleted.
//
// If None, this layer is not rewritten and must not be deleted.
deletable_after: Option<PendingJobSet>,
deleted: bool,
}
impl<E> LayerFragment<E>
where
E: CompactionJobExecutor,
{
fn new(layer: E::Layer) -> Self {
LayerFragment {
layer,
deletable_after: None,
deleted: false,
}
}
}
#[derive(PartialEq)]
enum CompactionStrategy {
Divide,
CreateDelta,
CreateImage,
}
struct CompactionJob<E: CompactionJobExecutor> {
key_range: Range<E::Key>,
lsn_range: Range<Lsn>,
strategy: CompactionStrategy,
input_layers: Vec<LayerId>,
completed: bool,
}
impl<'a, E> LevelCompactionState<'a, E>
where
E: CompactionJobExecutor,
{
/// Main loop of the executor.
///
/// In each iteration, we take the next job from the queue, and execute it.
/// The execution might add new jobs to the queue. Keep going until the
/// queue is empty.
///
/// Initially, the job queue consists of one Divide job over the whole
/// level. On first call, it is divided into smaller jobs.
async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> {
// TODO: this would be pretty straightforward to parallelize with FuturesUnordered
while let Some(next_job_id) = self.job_queue.pop() {
info!("executing job {}", next_job_id.0);
self.execute_job(next_job_id, ctx).await?;
}
// all done!
Ok(())
}
async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
match job.strategy {
CompactionStrategy::Divide => {
self.divide_job(job_id, ctx).await?;
Ok(())
}
CompactionStrategy::CreateDelta => {
let mut deltas: Vec<E::DeltaLayer> = Vec::new();
let mut layer_ids: Vec<LayerId> = Vec::new();
for layer_id in &job.input_layers {
let layer = &self.layers[layer_id.0].layer;
if let Some(dl) = self.executor.downcast_delta_layer(layer).await? {
deltas.push(dl.clone());
layer_ids.push(*layer_id);
}
}
self.executor
.create_delta(&job.lsn_range, &job.key_range, &deltas, ctx)
.await?;
self.jobs[job_id.0].completed = true;
// did we complete any fragments?
for layer_id in layer_ids {
let l = &mut self.layers[layer_id.0];
if let Some(deletable_after) = l.deletable_after.as_mut() {
deletable_after.complete_job(job_id);
if deletable_after.all_completed() {
self.executor.delete_layer(&l.layer, ctx).await?;
l.deleted = true;
}
}
}
self.next_level = true;
Ok(())
}
CompactionStrategy::CreateImage => {
self.executor
.create_image(job.lsn_range.end, &job.key_range, ctx)
.await?;
self.jobs[job_id.0].completed = true;
// TODO: we could check if any layers < PITR horizon became deletable
Ok(())
}
}
}
fn push_job(&mut self, job: CompactionJob<E>) -> JobId {
let job_id = JobId(self.jobs.len());
self.jobs.push(job);
self.job_queue.push(job_id);
job_id
}
/// Take a partition of the key space, and decide how to compact it.
///
/// TODO: Currently, this is called exactly once for the level, and we
/// decide whether to create new image layers to cover the whole level, or
/// write a new set of deltas. In the future, this should try to partition
/// the key space, and make the decision separately for each partition.
async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// Check for dummy cases
if job.input_layers.is_empty() {
return Ok(());
}
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// Would it be better to create images for this partition?
// Decide based on the average density of the level
let keyspace_size = keyspace_total_size(
&self
.executor
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
.await?,
) * 8192;
let wal_size = job
.input_layers
.iter()
.filter(|layer_id| self.layers[layer_id.0].layer.is_delta())
.map(|layer_id| self.layers[layer_id.0].layer.file_size())
.sum::<u64>();
if keyspace_size < wal_size {
// seems worth it
info!(
"covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}",
keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size
);
self.cover_with_images(job_id, ctx).await
} else {
// do deltas
info!(
"coverage not worth it, keyspace_size {}, wal_size {}",
keyspace_size, wal_size
);
self.retile_deltas(job_id, ctx).await
}
}
// LSN
// ^
// |
// | ###|###|#####
// | +--+-----+--+ +--+-----+--+
// | | | | | | | | |
// | +--+--+--+--+ +--+--+--+--+
// | | | | | | |
// | +---+-+-+---+ ==> +---+-+-+---+
// | | | | | | | | |
// | +---+-+-++--+ +---+-+-++--+
// | | | | | | | | |
// | +-----+--+--+ +-----+--+--+
// |
// +--------------> key
//
async fn cover_with_images(
&mut self,
job_id: JobId,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// XXX: do we still need the "holes" stuff?
let mut new_jobs = Vec::new();
// Slide a window through the keyspace
let keyspace = self
.executor
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
.await?;
let mut window = KeyspaceWindow::new(
E::Key::MIN..E::Key::MAX,
keyspace,
self.target_file_size / 8192,
);
while let Some(key_range) = window.choose_next_image() {
new_jobs.push(CompactionJob::<E> {
key_range,
lsn_range: job.lsn_range.clone(),
strategy: CompactionStrategy::CreateImage,
input_layers: Vec::new(), // XXX: Is it OK for this to be empty for image layer?
completed: false,
});
}
for j in new_jobs.into_iter().rev() {
let _job_id = self.push_job(j);
// TODO: image layers don't let us delete anything. unless < PITR horizon
//let j = &self.jobs[job_id.0];
// for layer_id in j.input_layers.iter() {
// self.layers[layer_id.0].pending_stakeholders.insert(job_id);
//}
}
Ok(())
}
// Merge the contents of all the input delta layers into a new set
// of delta layers, based on the current partitioning.
//
// We split the new delta layers on the key dimension. We iterate through
// the key space, and for each key, check if including the next key to the
// current output layer we're building would cause the layer to become too
// large. If so, dump the current output layer and start new one. It's
// possible that there is a single key with so many page versions that
// storing all of them in a single layer file would be too large. In that
// case, we also split on the LSN dimension.
//
// LSN
// ^
// |
// | +-----------+ +--+--+--+--+
// | | | | | | | |
// | +-----------+ | | | | |
// | | | | | | | |
// | +-----------+ ==> | | | | |
// | | | | | | | |
// | +-----------+ | | | | |
// | | | | | | | |
// | +-----------+ +--+--+--+--+
// |
// +--------------> key
//
//
// If one key (X) has a lot of page versions:
//
// LSN
// ^
// | (X)
// | +-----------+ +--+--+--+--+
// | | | | | | | |
// | +-----------+ | | +--+ |
// | | | | | | | |
// | +-----------+ ==> | | | | |
// | | | | | +--+ |
// | +-----------+ | | | | |
// | | | | | | | |
// | +-----------+ +--+--+--+--+
// |
// +--------------> key
//
// TODO: this actually divides the layers into fixed-size chunks, not
// based on the partitioning.
//
// TODO: we should also opportunistically materialize and
// garbage collect what we can.
async fn retile_deltas(
&mut self,
job_id: JobId,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// Sweep the key space left to right, running an estimate of how much
// disk size and keyspace we have accumulated
//
// Once the disk size reaches the target threshold, stop and think.
// If we have accumulated only a narrow band of keyspace, create an
// image layer. Otherwise write a delta layer.
// FIXME: deal with the case of lots of values for same key
// FIXME: we are ignoring images here. Did we already divide the work
// so that we won't encounter them here?
let mut deltas: Vec<E::DeltaLayer> = Vec::new();
for layer_id in &job.input_layers {
let l = &self.layers[layer_id.0];
if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? {
deltas.push(dl.clone());
}
}
// Open stream
let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
let mut new_jobs = Vec::new();
// Slide a window through the keyspace
let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
let mut all_in_window: bool = false;
let mut window = Window::new();
loop {
if all_in_window && window.elems.is_empty() {
// All done!
break;
}
if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
{
let batch_layers: Vec<LayerId> = job
.input_layers
.iter()
.filter(|layer_id| {
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
})
.cloned()
.collect();
assert!(!batch_layers.is_empty());
new_jobs.push(CompactionJob {
key_range,
lsn_range: job.lsn_range.clone(),
strategy: CompactionStrategy::CreateDelta,
input_layers: batch_layers,
completed: false,
});
} else {
assert!(!all_in_window);
if let Some(next_key) = key_accum.next().await.transpose()? {
window.feed(next_key.key, next_key.size);
} else {
all_in_window = true;
}
}
}
// All the input files are rewritten. Set up the tracking for when they can
// be deleted.
for layer_id in job.input_layers.iter() {
let l = &mut self.layers[layer_id.0];
assert!(l.deletable_after.is_none());
l.deletable_after = Some(PendingJobSet::new());
}
for j in new_jobs.into_iter().rev() {
let job_id = self.push_job(j);
let j = &self.jobs[job_id.0];
for layer_id in j.input_layers.iter() {
self.layers[layer_id.0]
.deletable_after
.as_mut()
.unwrap()
.pending
.insert(job_id);
}
}
Ok(())
}
}
// Sliding window through keyspace and values
// This is used by cover_with_images to decide on good split points
struct KeyspaceWindow<K> {
head: KeyspaceWindowHead<K>,
start_pos: KeyspaceWindowPos<K>,
}
struct KeyspaceWindowHead<K> {
// overall key range to cover
key_range: Range<K>,
keyspace: Vec<Range<K>>,
target_keysize: u64,
}
#[derive(Clone)]
struct KeyspaceWindowPos<K> {
end_key: K,
keyspace_idx: usize,
accum_keysize: u64,
}
impl<K: CompactionKey> KeyspaceWindowPos<K> {
fn reached_end(&self, w: &KeyspaceWindowHead<K>) -> bool {
self.keyspace_idx == w.keyspace.len()
}
// Advance the cursor until the accumulated key size reaches `max_size`.
fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
while self.accum_keysize < max_size && !self.reached_end(w) {
let curr_range = &w.keyspace[self.keyspace_idx];
if self.end_key < curr_range.start {
// skip over any unused space
self.end_key = curr_range.start;
}
// We're now within 'curr_range'. Can we advance past it completely?
let distance = K::key_range_size(&(self.end_key..curr_range.end));
if (self.accum_keysize + distance as u64) < max_size {
// oh yeah, it fits
self.end_key = curr_range.end;
self.keyspace_idx += 1;
self.accum_keysize += distance as u64;
} else {
// advance within the range
let skip_key = self.end_key.skip_some();
let distance = K::key_range_size(&(self.end_key..skip_key));
if (self.accum_keysize + distance as u64) < max_size {
self.end_key = skip_key;
self.accum_keysize += distance as u64;
} else {
self.end_key = self.end_key.next();
self.accum_keysize += 1;
}
}
}
}
}
impl<K> KeyspaceWindow<K>
where
K: CompactionKey,
{
fn new(key_range: Range<K>, keyspace: CompactionKeySpace<K>, target_keysize: u64) -> Self {
assert!(keyspace.first().unwrap().start >= key_range.start);
let start_key = key_range.start;
let start_pos = KeyspaceWindowPos::<K> {
end_key: start_key,
keyspace_idx: 0,
accum_keysize: 0,
};
Self {
head: KeyspaceWindowHead::<K> {
key_range,
keyspace,
target_keysize,
},
start_pos,
}
}
fn choose_next_image(&mut self) -> Option<Range<K>> {
if self.start_pos.keyspace_idx == self.head.keyspace.len() {
// we've reached the end
return None;
}
let mut next_pos = self.start_pos.clone();
next_pos.advance_until_size(
&self.head,
self.start_pos.accum_keysize + self.head.target_keysize,
);
// See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
// 1.25x target size
let mut end_pos = next_pos.clone();
end_pos.advance_until_size(
&self.head,
self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
);
if end_pos.reached_end(&self.head) {
// gobble up any unused keyspace between the last used key and end of the range
assert!(end_pos.end_key <= self.head.key_range.end);
end_pos.end_key = self.head.key_range.end;
next_pos = end_pos;
}
let start_key = self.start_pos.end_key;
self.start_pos = next_pos;
Some(start_key..self.start_pos.end_key)
}
}
// Take previous partitioning, based on the image layers below.
//
// Candidate is at the front:
//
// Consider stretching an image layer to next divider? If it's close enough,
// that's the image candidate
//
// If it's too far, consider splitting at a reasonable point
//
// Is the image candidate smaller than the equivalent delta? If so,
// split off the image. Otherwise, split off one delta.
// Try to snap off the delta at a reasonable point
struct WindowElement<K> {
start_key: K, // inclusive
last_key: K, // inclusive
accum_size: u64,
}
// Sliding window through keyspace and values
//
// This is used to decide what layer to write next, from the beginning of the window.
struct Window<K> {
elems: VecDeque<WindowElement<K>>,
// last key that was split off, inclusive
splitoff_key: Option<K>,
splitoff_size: u64,
}
impl<K> Window<K>
where
K: CompactionKey,
{
fn new() -> Self {
Self {
elems: VecDeque::new(),
splitoff_key: None,
splitoff_size: 0,
}
}
fn feed(&mut self, key: K, size: u64) {
let last_size;
if let Some(last) = self.elems.back_mut() {
assert!(last.last_key <= key);
if key == last.last_key {
last.accum_size += size;
return;
}
last_size = last.accum_size;
} else {
last_size = 0;
}
// This is a new key.
let elem = WindowElement {
start_key: key,
last_key: key,
accum_size: last_size + size,
};
self.elems.push_back(elem);
}
fn remain_size(&self) -> u64 {
self.elems.back().unwrap().accum_size - self.splitoff_size
}
fn peek_size(&self) -> u64 {
self.elems.front().unwrap().accum_size - self.splitoff_size
}
fn commit_upto(&mut self, mut upto: usize) {
while upto > 1 {
let popped = self.elems.pop_front().unwrap();
self.elems.front_mut().unwrap().start_key = popped.start_key;
upto -= 1;
}
}
fn find_size_split(&self, target_size: u64) -> usize {
self.elems
.partition_point(|elem| elem.accum_size - self.splitoff_size < target_size)
}
fn pop(&mut self) {
let first = self.elems.pop_front().unwrap();
self.splitoff_size = first.accum_size;
self.splitoff_key = Some(first.last_key);
}
// the difference between delta and image is that an image covers
// any unused keyspace before and after, while a delta tries to
// minimize that. TODO: difference not implemented
fn pop_delta(&mut self) -> Range<K> {
let first = self.elems.front().unwrap();
let key_range = first.start_key..first.last_key.next();
self.pop();
key_range
}
// Prerequisite: we have enough input in the window
//
// On return None, the caller should feed more data and call again
fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option<Range<K>> {
if has_more && self.elems.is_empty() {
// Starting up
return None;
}
// If we still have an undersized candidate, just keep going
while self.peek_size() < target_size {
if self.elems.len() > 1 {
self.commit_upto(2);
} else if has_more {
return None;
} else {
break;
}
}
// Ensure we have enough input in the window to make a good decision
if has_more && self.remain_size() < target_size * 5 / 4 {
return None;
}
// The candidate on the front is now large enough, for a delta.
// And we have enough data in the window to decide.
// If we're willing to stretch it up to 1.25 target size, could we
// gobble up the rest of the work? This avoids creating very small
// "tail" layers at the end of the keyspace
if !has_more && self.remain_size() < target_size * 5 / 3 {
self.commit_upto(self.elems.len());
} else {
let delta_split_at = self.find_size_split(target_size);
self.commit_upto(delta_split_at);
// If it's still not large enough, request the caller to fill the window
if self.elems.len() == 1 && has_more {
return None;
}
}
Some(self.pop_delta())
}
}

View File

@@ -1,242 +0,0 @@
//! This file contains generic utility functions over the interface types,
//! which could be handy for any compaction implementation.
use crate::interface::*;
use futures::future::BoxFuture;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use pin_project_lite::pin_project;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
use std::future::Future;
use std::ops::{DerefMut, Range};
use std::pin::Pin;
use std::task::{ready, Poll};
pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
where
K: CompactionKey,
{
keyspace.iter().map(|r| K::key_range_size(r) as u64).sum()
}
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
!(a.end <= b.start || b.end <= a.start)
}
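// Quick illustration of the half-open overlap semantics above (not part of
// this diff): ranges that merely touch at an endpoint do not overlap.
#[test]
fn overlap_is_half_open() {
    assert!(overlaps_with(&(0..10), &(9..20))); // share key 9
    assert!(!overlaps_with(&(0..10), &(10..20))); // only touch at 10
}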
pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
let x = std::mem::take(a);
let mut all_ranges_iter = [x.into_iter(), b.into_iter()]
.into_iter()
.kmerge_by(|a, b| a.start < b.start);
let mut ranges = Vec::new();
if let Some(first) = all_ranges_iter.next() {
let (mut start, mut end) = (first.start, first.end);
for r in all_ranges_iter {
assert!(r.start >= start);
if r.start > end {
ranges.push(start..end);
start = r.start;
end = r.end;
} else if r.end > end {
end = r.end;
}
}
ranges.push(start..end);
}
*a = ranges
}
pub fn intersect_keyspace<K: Ord + Clone + Copy>(
a: &CompactionKeySpace<K>,
r: &Range<K>,
) -> CompactionKeySpace<K> {
let mut ranges: Vec<Range<K>> = Vec::new();
for x in a.iter() {
if x.end <= r.start {
continue;
}
if x.start >= r.end {
break;
}
ranges.push(x.clone())
}
// trim the ends
if let Some(first) = ranges.first_mut() {
first.start = std::cmp::max(first.start, r.start);
}
if let Some(last) = ranges.last_mut() {
last.end = std::cmp::min(last.end, r.end);
}
ranges
}
/// Create a stream that iterates through all DeltaEntrys among all input
/// layers, in key-lsn order.
///
/// This is public because the create_delta() implementation likely wants to use this too
/// TODO: move to a more shared place
pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
layers: &'a [E::DeltaLayer],
ctx: &'a E::RequestContext,
) -> MergeDeltaKeys<'a, E> {
// Use a binary heap to merge the layers. Each input layer is initially
// represented by a LazyLoadLayer::Unloaded element, which uses the start of
// the layer's key range as the key. The first time a layer reaches the top
// of the heap, all the keys of the layer are loaded into a sorted vector.
//
// This helps to keep the memory usage reasonable: we only need to hold in
// memory the DeltaEntrys of the layers that overlap with the "current" key.
let mut heap: BinaryHeap<LazyLoadLayer<'a, E>> = BinaryHeap::new();
for l in layers {
heap.push(LazyLoadLayer::Unloaded(l));
}
MergeDeltaKeys {
heap,
ctx,
load_future: None,
}
}
enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
Unloaded(&'a E::DeltaLayer),
}
impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
fn key(&self) -> E::Key {
match self {
Self::Loaded(entries) => entries.front().unwrap().key(),
Self::Unloaded(dl) => dl.key_range().start,
}
}
}
impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
// reverse order so that we get a min-heap
other.key().cmp(&self.key())
}
}
impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
fn eq(&self, other: &Self) -> bool {
self.key().eq(&other.key())
}
}
impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
// Stream returned by `merge_delta_keys`
pin_project! {
#[allow(clippy::type_complexity)]
pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> {
heap: BinaryHeap<LazyLoadLayer<'a, E>>,
#[pin]
load_future: Option<LoadFuture<'a, <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>,
ctx: &'a E::RequestContext,
}
}
impl<'a, E> Stream for MergeDeltaKeys<'a, E>
where
E: CompactionJobExecutor + 'a,
{
type Item = anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>;
fn poll_next(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<std::option::Option<<Self as futures::Stream>::Item>> {
let mut this = self.project();
loop {
if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() {
// We are waiting for loading the keys to finish
match ready!(load_future.as_mut().poll(cx)) {
Ok(entries) => {
this.load_future.set(None);
*this.heap.peek_mut().unwrap() =
LazyLoadLayer::Loaded(VecDeque::from(entries));
}
Err(e) => {
return Poll::Ready(Some(Err(e)));
}
}
}
// If the topmost layer in the heap hasn't been loaded yet, start
// loading it. Otherwise return the next entry from it and update
// the layer's position in the heap (this decreaseKey operation is
// performed implicitly when `top` is dropped).
if let Some(mut top) = this.heap.peek_mut() {
match top.deref_mut() {
LazyLoadLayer::Unloaded(ref mut l) => {
let fut = l.load_keys(this.ctx);
this.load_future.set(Some(fut));
continue;
}
LazyLoadLayer::Loaded(ref mut entries) => {
let result = entries.pop_front().unwrap();
if entries.is_empty() {
std::collections::binary_heap::PeekMut::pop(top);
}
return Poll::Ready(Some(Ok(result)));
}
}
} else {
return Poll::Ready(None);
}
}
}
}
// Accumulate values at key boundaries
pub struct KeySize<K> {
pub key: K,
pub num_values: u64,
pub size: u64,
}
pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
where
K: Eq,
I: Stream<Item = Result<D, E>>,
D: CompactionDeltaEntry<'a, K>,
{
async_stream::try_stream! {
// Initialize the state from the first value
let mut input = std::pin::pin!(input);
if let Some(first) = input.next().await {
let first = first?;
let mut accum: KeySize<K> = KeySize {
key: first.key(),
num_values: 1,
size: first.size(),
};
while let Some(this) = input.next().await {
let this = this?;
if this.key() == accum.key {
accum.size += this.size();
accum.num_values += 1;
} else {
yield accum;
accum = KeySize {
key: this.key(),
num_values: 1,
size: this.size(),
};
}
}
yield accum;
}
}
}
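As a usage sketch of the keyspace helpers above (assuming the `pageserver_compaction` crate is available as a dependency, as in the crate's own tests), the union and intersection functions operate on plain `Vec<Range<K>>` values:

```rust
use pageserver_compaction::helpers::{intersect_keyspace, union_to_keyspace};

fn main() {
    // A CompactionKeySpace<u64> is just a sorted, non-overlapping Vec<Range<u64>>.
    let mut ks = vec![0u64..10, 20..30];
    // Overlapping ranges from the second keyspace are coalesced into the first.
    union_to_keyspace(&mut ks, vec![5..25, 40..50]);
    assert_eq!(ks, vec![0..30, 40..50]);
    // Intersection trims the keyspace down to a bounding range.
    assert_eq!(intersect_keyspace(&ks, &(25..45)), vec![25..30, 40..45]);
}
```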

View File

@@ -1,375 +0,0 @@
//! An LSM tree consists of multiple levels, each exponentially larger than the
//! previous level. And each level consists of multiple "tiers". With tiered
//! compaction, a level is compacted when it has accumulated more than N tiers,
//! forming one tier on the next level.
//!
//! In the pageserver, we don't explicitly track the levels and tiers. Instead,
//! we identify them by looking at the shapes of the layers. It's an easy task
//! for a human, but it's not straightforward to come up with the exact
//! rules. Especially if there are cases like interrupted, half-finished
//! compactions, or highly skewed data distributions that have let us "skip"
//! some levels. It's not critical to classify all cases correctly; at worst we
//! delay some compaction work, and suffer from more read amplification, or we
//! perform some unnecessary compaction work.
//!
//! `identify_level` performs that shape-matching.
//!
//! It returns a Level struct, which has `depth()` function to count the number
//! of "tiers" in the level. The tier count is the max depth of stacked layers
//! within the level. That's a good measure, because the point of compacting is
//! to reduce read amplification, and the depth is what determines that.
//!
//! One interesting effect of this is that if we generate very small delta
//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
//! because they reach the target size, the L0 compaction will combine them to
//! one larger file. But if the combined file is still smaller than the target
//! file size, the file will still be considered to be part of L0 at the next
//! iteration.
use anyhow::bail;
use std::collections::BTreeSet;
use std::ops::Range;
use utils::lsn::Lsn;
use crate::interface::*;
use tracing::{info, trace};
pub struct Level<L> {
pub lsn_range: Range<Lsn>,
pub layers: Vec<L>,
}
/// Identify an LSN < `end_lsn` that partitions the LSN space, so that there are
/// no layers that cross the boundary LSN.
///
/// A further restriction is that all layers in the returned partition cover at
/// most 'lsn_max_size' LSN bytes.
pub async fn identify_level<K, L>(
all_layers: Vec<L>,
end_lsn: Lsn,
lsn_max_size: u64,
) -> anyhow::Result<Option<Level<L>>>
where
K: CompactionKey,
L: CompactionLayer<K> + Clone,
{
// filter out layers that are above the `end_lsn`, they are completely irrelevant.
let mut layers = Vec::new();
for l in all_layers {
if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
// shouldn't happen. Indicates that the caller passed a bogus
// end_lsn.
bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
}
// include image layers sitting exactly at `end_lsn`.
let is_image = !l.is_delta();
if (is_image && l.lsn_range().start > end_lsn)
|| (!is_image && l.lsn_range().start >= end_lsn)
{
continue;
}
layers.push(l);
}
// All the remaining layers either belong to this level, or are below it.
info!(
"identify level at {}, size {}, num layers below: {}",
end_lsn,
lsn_max_size,
layers.len()
);
if layers.is_empty() {
return Ok(None);
}
// Walk the ranges in LSN order.
//
// ----- end_lsn
// |
// |
// v
//
layers.sort_by_key(|l| l.lsn_range().end);
let mut candidate_start_lsn = end_lsn;
let mut candidate_layers: Vec<L> = Vec::new();
let mut current_best_start_lsn = end_lsn;
let mut current_best_layers: Vec<L> = Vec::new();
let mut iter = layers.into_iter();
loop {
let Some(l) = iter.next_back() else {
// Reached end. Accept the last candidate
current_best_start_lsn = candidate_start_lsn;
current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
break;
};
trace!(
"inspecting {} for candidate {}, current best {}",
l.short_id(),
candidate_start_lsn,
current_best_start_lsn
);
let r = l.lsn_range();
// Image layers don't restrict our choice of cutoff LSN
if l.is_delta() {
// Is this candidate workable? In other words, are there any
// delta layers that span across this LSN
//
// Valid: Not valid:
// + +
// | | +
// + <- candidate + | <- candidate
// + +
// |
// +
if r.end <= candidate_start_lsn {
// Hooray, there are no crossing LSNs. And we have visited
// through all the layers within candidate..end_lsn. The
// current candidate can be accepted.
current_best_start_lsn = r.end;
current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
candidate_start_lsn = r.start;
}
// Is it small enough to be considered part of this level?
if r.end.0 - r.start.0 > lsn_max_size {
// Too large, this layer belongs to next level. Stop.
trace!(
"too large {}, size {} vs {}",
l.short_id(),
r.end.0 - r.start.0,
lsn_max_size
);
break;
}
// If this crosses the candidate lsn, push it down.
if r.start < candidate_start_lsn {
trace!(
"layer {} prevents from stopping at {}",
l.short_id(),
candidate_start_lsn
);
candidate_start_lsn = r.start;
}
}
// Include this layer in our candidate
candidate_layers.push(l);
}
Ok(if current_best_start_lsn == end_lsn {
// empty level
None
} else {
Some(Level {
lsn_range: current_best_start_lsn..end_lsn,
layers: current_best_layers,
})
})
}
impl<L> Level<L> {
/// Count the number of deltas stacked on each other.
pub fn depth<K>(&self) -> u64
where
K: CompactionKey,
L: CompactionLayer<K>,
{
struct Event<K> {
key: K,
layer_idx: usize,
start: bool,
}
let mut events: Vec<Event<K>> = Vec::new();
for (idx, l) in self.layers.iter().enumerate() {
events.push(Event {
key: l.key_range().start,
layer_idx: idx,
start: true,
});
events.push(Event {
key: l.key_range().end,
layer_idx: idx,
start: false,
});
}
events.sort_by_key(|e| (e.key, e.start));
// Sweep the key space left to right. Stop at each distinct key, and
// count the number of deltas on top of the highest image at that key.
//
// This is a little inefficient, as we walk through the active_set on
// every key. We could increment/decrement a counter on each step
// instead, but that'd require a bit more complex bookkeeping.
let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
let mut max_depth = 0;
let mut events_iter = events.iter().peekable();
while let Some(e) = events_iter.next() {
let l = &self.layers[e.layer_idx];
let is_image = !l.is_delta();
// update the active set
if e.start {
active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
} else {
active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
}
// recalculate depth if this was the last event at this point
let more_events_at_this_key = events_iter
.peek()
.map_or(false, |next_e| next_e.key == e.key);
if !more_events_at_this_key {
let mut active_depth = 0;
for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
if *is_image {
break;
}
active_depth += 1;
}
if active_depth > max_depth {
max_depth = active_depth;
}
}
}
debug_assert_eq!(active_set, BTreeSet::new());
max_depth
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
use std::sync::{Arc, Mutex};
fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
MockLayer::Delta(Arc::new(MockDeltaLayer {
key_range,
lsn_range,
// identify_level() doesn't pay attention to the rest of the fields
file_size: 0,
deleted: Mutex::new(false),
records: vec![],
}))
}
fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
MockLayer::Image(Arc::new(MockImageLayer {
key_range,
lsn_range: lsn..(lsn + 1),
// identify_level() doesn't pay attention to the rest of the fields
file_size: 0,
deleted: Mutex::new(false),
}))
}
#[tokio::test]
async fn test_identify_level() -> anyhow::Result<()> {
let layers = vec![
delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
];
// All layers fit in the max file size
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.depth(), 6);
// Same LSN with smaller max file size. The second layer from the top is larger
// and belongs to next level.
let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
.await?
.unwrap();
assert_eq!(level.depth(), 1);
// Call with a smaller LSN
let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
.await?
.unwrap();
assert_eq!(level.depth(), 2);
// Call with an LSN that doesn't partition the space
let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
assert!(result.is_err());
Ok(())
}
#[tokio::test]
async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
// The files LSN ranges overlap, so even though there are more files that
// fit under the file size, they are not included in the level because they
// overlap so that we'd need to include the oldest file, too, which is
// larger
let layers = vec![
delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
.await?
.unwrap();
assert_eq!(level.depth(), 1);
Ok(())
}
#[tokio::test]
async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
// The key ranges don't overlap, so depth is only 1.
let layers = vec![
delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.layers.len(), 3);
assert_eq!(level.depth(), 1);
// Staggered. The 1st and 3rd layer don't overlap with each other.
let layers = vec![
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.layers.len(), 3);
assert_eq!(level.depth(), 2);
Ok(())
}
#[tokio::test]
async fn test_depth_images() -> anyhow::Result<()> {
let layers: Vec<MockLayer> = vec![
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
// This covers the same key range as the 2nd delta layer. The depth
// in that key range is therefore 0.
image(1500..2500, Lsn(0x9000)),
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.layers.len(), 4);
assert_eq!(level.depth(), 1);
Ok(())
}
}
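The `depth()` logic above generalizes the classic sweep-line computation of maximum interval overlap. The following is a minimal, self-contained sketch of that idea for plain integer key ranges, ignoring image layers; the names are illustrative and not part of the crate.

```rust
use std::ops::Range;

// Maximum number of ranges stacked over any single key (sweep-line version).
fn max_stacking(key_ranges: &[Range<u64>]) -> usize {
    let mut events: Vec<(u64, i32)> = Vec::new();
    for r in key_ranges {
        events.push((r.start, 1)); // range opens
        events.push((r.end, -1)); // range closes
    }
    // Sort by key; at equal keys, process closes (-1) before opens (+1) so
    // merely touching ranges do not count as stacked.
    events.sort_by_key(|&(key, delta)| (key, delta));
    let (mut depth, mut max_depth) = (0i32, 0i32);
    for (_key, delta) in events {
        depth += delta;
        max_depth = max_depth.max(depth);
    }
    max_depth as usize
}

fn main() {
    // Staggered layers as in test_depth_nonoverlapping: at most two overlap.
    assert_eq!(max_stacking(&[1000..2000, 1500..2500, 2000..3000]), 2);
}
```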

View File

@@ -1,166 +0,0 @@
//! This is what the compaction implementation needs to know about
//! layers, keyspace etc.
//!
//! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides.
use async_trait::async_trait;
use futures::Future;
use pageserver_api::{key::Key, keyspace::key_range_size};
use std::ops::Range;
use utils::lsn::Lsn;
/// Public interface. This is the main thing that the implementor needs to provide
pub trait CompactionJobExecutor {
// Type system.
//
// We assume that there are two kinds of layers, deltas and images. The
// compaction doesn't distinguish whether they are stored locally or
// remotely.
//
// The keyspace is defined by the CompactionKey trait.
type Key: CompactionKey;
type Layer: CompactionLayer<Self::Key> + Clone;
type DeltaLayer: CompactionDeltaLayer<Self> + Clone;
type ImageLayer: CompactionImageLayer<Self> + Clone;
// This is passed through to all the interface functions. The compaction
// implementation doesn't do anything with it, but it might be useful for
// the interface implementation.
type RequestContext: CompactionRequestContext;
// ----
// Functions that the planner uses to support its decisions
// ----
/// Return all layers that overlap the given bounding box.
fn get_layers(
&mut self,
key_range: &Range<Self::Key>,
lsn_range: &Range<Lsn>,
ctx: &Self::RequestContext,
) -> impl Future<Output = anyhow::Result<Vec<Self::Layer>>> + Send;
fn get_keyspace(
&mut self,
key_range: &Range<Self::Key>,
lsn: Lsn,
ctx: &Self::RequestContext,
) -> impl Future<Output = anyhow::Result<CompactionKeySpace<Self::Key>>> + Send;
/// NB: This is a pretty expensive operation. In the real pageserver
/// implementation, it downloads the layer, and keeps it resident
/// until the DeltaLayer is dropped.
fn downcast_delta_layer(
&self,
layer: &Self::Layer,
) -> impl Future<Output = anyhow::Result<Option<Self::DeltaLayer>>> + Send;
// ----
// Functions to execute the plan
// ----
/// Create a new image layer, materializing all the values in the key range,
/// at given 'lsn'.
fn create_image(
&mut self,
lsn: Lsn,
key_range: &Range<Self::Key>,
ctx: &Self::RequestContext,
) -> impl Future<Output = anyhow::Result<()>> + Send;
/// Create a new delta layer, containing all the values from 'input_layers'
/// in the given key and LSN range.
fn create_delta(
&mut self,
lsn_range: &Range<Lsn>,
key_range: &Range<Self::Key>,
input_layers: &[Self::DeltaLayer],
ctx: &Self::RequestContext,
) -> impl Future<Output = anyhow::Result<()>> + Send;
/// Delete a layer. The compaction implementation will call this only after
/// all the create_image() or create_delta() calls that deletion of this
/// layer depends on have finished. But if the implementor has extra lazy
/// background tasks, like uploading the index json file to remote storage,
/// it is the implementation's responsibility to track those.
fn delete_layer(
&mut self,
layer: &Self::Layer,
ctx: &Self::RequestContext,
) -> impl Future<Output = anyhow::Result<()>> + Send;
}
pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
const MIN: Self;
const MAX: Self;
/// Calculate distance between key_range.start and key_range.end.
///
/// This returns u32, for compatibility with Repository::key. If the
/// distance is larger, return u32::MAX.
fn key_range_size(key_range: &Range<Self>) -> u32;
// return "self + 1"
fn next(&self) -> Self;
// return "self + <some decent amount to skip>". The amount to skip
// is left to the implementation.
// FIXME: why not just "add(u32)" ? This is hard to use
fn skip_some(&self) -> Self;
}
impl CompactionKey for Key {
const MIN: Self = Self::MIN;
const MAX: Self = Self::MAX;
fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
key_range_size(r)
}
fn next(&self) -> Key {
(self as &Key).next()
}
fn skip_some(&self) -> Key {
self.add(128)
}
}
/// Contiguous ranges of keys that belong to the key space. In key order, and
/// with no overlap.
pub type CompactionKeySpace<K> = Vec<Range<K>>;
/// Functions needed from all layers.
pub trait CompactionLayer<K: CompactionKey + ?Sized> {
fn key_range(&self) -> &Range<K>;
fn lsn_range(&self) -> &Range<Lsn>;
fn file_size(&self) -> u64;
/// For debugging, short human-readable representation of the layer. E.g. filename.
fn short_id(&self) -> String;
fn is_delta(&self) -> bool;
}
#[async_trait]
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
where
Self: 'a;
/// Return all keys in this delta layer.
async fn load_keys<'a>(
&self,
ctx: &E::RequestContext,
) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
}
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
pub trait CompactionDeltaEntry<'a, K> {
fn key(&self) -> K;
fn lsn(&self) -> Lsn;
fn size(&self) -> u64;
}
pub trait CompactionRequestContext {}
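As an illustration of what an implementor provides, here is a minimal sketch of `CompactionKey` for a toy newtype key, assuming `pageserver_compaction` is a dependency. The crate's own simulator (below) implements the same trait directly for `u64`, which it can do in-crate; an external caller needs a newtype because of Rust's orphan rules. `ToyKey` is a hypothetical name.

```rust
use std::fmt;
use std::ops::Range;

use pageserver_compaction::interface::CompactionKey;

#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct ToyKey(u64);

impl fmt::Display for ToyKey {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:016X}", self.0)
    }
}

impl CompactionKey for ToyKey {
    const MIN: Self = ToyKey(u64::MIN);
    const MAX: Self = ToyKey(u64::MAX);
    fn key_range_size(key_range: &Range<Self>) -> u32 {
        // Clamp to u32::MAX, mirroring the contract documented above.
        (key_range.end.0 - key_range.start.0).min(u32::MAX as u64) as u32
    }
    fn next(&self) -> Self {
        ToyKey(self.0 + 1)
    }
    fn skip_some(&self) -> Self {
        ToyKey(self.0 + 100)
    }
}
```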

View File

@@ -1,12 +0,0 @@
// The main module implementing the compaction algorithm
pub mod compact_tiered;
pub(crate) mod identify_levels;
// Traits that the caller of the compaction needs to implement
pub mod interface;
// Utility functions, useful for the implementation
pub mod helpers;
// A simulator with mock implementations of 'interface'
pub mod simulator;

View File

@@ -1,612 +0,0 @@
mod draw;
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
use async_trait::async_trait;
use futures::StreamExt;
use rand::Rng;
use tracing::info;
use utils::lsn::Lsn;
use std::fmt::Write;
use std::ops::Range;
use std::sync::Arc;
use std::sync::Mutex;
use crate::helpers::{merge_delta_keys, overlaps_with};
use crate::interface;
use crate::interface::CompactionLayer;
//
// Implementation for the CompactionExecutor interface
//
pub struct MockTimeline {
// Parameters for the compaction algorithm
pub target_file_size: u64,
tiers_per_level: u64,
num_l0_flushes: u64,
last_compact_at_flush: u64,
last_flush_lsn: Lsn,
// In-memory layer
records: Vec<MockRecord>,
total_len: u64,
start_lsn: Lsn,
end_lsn: Lsn,
// Current keyspace at `end_lsn`. This is updated on every ingested record.
keyspace: KeySpace,
// historic keyspaces
old_keyspaces: Vec<(Lsn, KeySpace)>,
// "on-disk" layers
pub live_layers: Vec<MockLayer>,
num_deleted_layers: u64,
// Statistics
wal_ingested: u64,
bytes_written: u64,
bytes_deleted: u64,
layers_created: u64,
layers_deleted: u64,
// All the events - creation and deletion of files - are collected
// in 'history'. It is used to draw the SVG animation at the end.
time: u64,
history: Vec<draw::LayerTraceEvent>,
}
type KeySpace = interface::CompactionKeySpace<Key>;
pub struct MockRequestContext {}
impl interface::CompactionRequestContext for MockRequestContext {}
pub type Key = u64;
impl interface::CompactionKey for Key {
const MIN: Self = u64::MIN;
const MAX: Self = u64::MAX;
fn key_range_size(key_range: &Range<Self>) -> u32 {
std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
}
fn next(&self) -> Self {
self + 1
}
fn skip_some(&self) -> Self {
// skip ahead by a fixed amount
self + 100
}
}
#[derive(Clone)]
pub struct MockRecord {
lsn: Lsn,
key: Key,
len: u64,
}
impl interface::CompactionDeltaEntry<'_, Key> for MockRecord {
fn key(&self) -> Key {
self.key
}
fn lsn(&self) -> Lsn {
self.lsn
}
fn size(&self) -> u64 {
self.len
}
}
pub struct MockDeltaLayer {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub file_size: u64,
pub deleted: Mutex<bool>,
pub records: Vec<MockRecord>,
}
impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
fn key_range(&self) -> &Range<Key> {
&self.key_range
}
fn lsn_range(&self) -> &Range<Lsn> {
&self.lsn_range
}
fn file_size(&self) -> u64 {
self.file_size
}
fn short_id(&self) -> String {
format!(
"{:016X}-{:016X}__{:08X}-{:08X}",
self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0
)
}
fn is_delta(&self) -> bool {
true
}
}
#[async_trait]
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
type DeltaEntry<'a> = MockRecord;
async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result<Vec<MockRecord>> {
Ok(self.records.clone())
}
}
pub struct MockImageLayer {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub file_size: u64,
pub deleted: Mutex<bool>,
}
impl interface::CompactionImageLayer<MockTimeline> for Arc<MockImageLayer> {}
impl interface::CompactionLayer<Key> for Arc<MockImageLayer> {
fn key_range(&self) -> &Range<Key> {
&self.key_range
}
fn lsn_range(&self) -> &Range<Lsn> {
&self.lsn_range
}
fn file_size(&self) -> u64 {
self.file_size
}
fn short_id(&self) -> String {
format!(
"{:016X}-{:016X}__{:08X}",
self.key_range.start, self.key_range.end, self.lsn_range.start.0,
)
}
fn is_delta(&self) -> bool {
false
}
}
impl MockTimeline {
pub fn new() -> Self {
MockTimeline {
target_file_size: 256 * 1024 * 1024,
tiers_per_level: 4,
num_l0_flushes: 0,
last_compact_at_flush: 0,
last_flush_lsn: Lsn(0),
records: Vec::new(),
total_len: 0,
start_lsn: Lsn(1000),
end_lsn: Lsn(1000),
keyspace: KeySpace::new(),
old_keyspaces: vec![],
live_layers: vec![],
num_deleted_layers: 0,
wal_ingested: 0,
bytes_written: 0,
bytes_deleted: 0,
layers_created: 0,
layers_deleted: 0,
time: 0,
history: Vec::new(),
}
}
pub async fn compact(&mut self) -> anyhow::Result<()> {
let ctx = MockRequestContext {};
crate::compact_tiered::compact_tiered(
self,
self.last_flush_lsn,
self.target_file_size,
self.tiers_per_level,
&ctx,
)
.await?;
Ok(())
}
// Ingest one record to the timeline
pub fn ingest_record(&mut self, key: Key, len: u64) {
self.records.push(MockRecord {
lsn: self.end_lsn,
key,
len,
});
self.total_len += len;
self.end_lsn += len;
if self.total_len > self.target_file_size {
self.flush_l0();
}
}
pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> {
if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level {
self.compact().await?;
self.last_compact_at_flush = self.num_l0_flushes;
}
Ok(())
}
pub fn flush_l0(&mut self) {
if self.records.is_empty() {
return;
}
let mut records = std::mem::take(&mut self.records);
records.sort_by_key(|rec| rec.key);
let lsn_range = self.start_lsn..self.end_lsn;
let new_layer = Arc::new(MockDeltaLayer {
key_range: Key::MIN..Key::MAX,
lsn_range: lsn_range.clone(),
file_size: self.total_len,
records,
deleted: Mutex::new(false),
});
info!("flushed L0 layer {}", new_layer.short_id());
self.live_layers.push(MockLayer::from(&new_layer));
// reset L0
self.start_lsn = self.end_lsn;
self.total_len = 0;
self.records = Vec::new();
self.layers_created += 1;
self.bytes_written += new_layer.file_size;
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::Flush,
file: LayerTraceFile {
filename: new_layer.short_id(),
key_range: new_layer.key_range.clone(),
lsn_range: new_layer.lsn_range.clone(),
},
});
self.num_l0_flushes += 1;
self.last_flush_lsn = self.end_lsn;
}
// Ingest `num_records' records to the timeline, with random keys
// uniformly distributed in `key_range`
pub fn ingest_uniform(
&mut self,
num_records: u64,
len: u64,
key_range: &Range<Key>,
) -> anyhow::Result<()> {
crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]);
let mut rng = rand::thread_rng();
for _ in 0..num_records {
self.ingest_record(rng.gen_range(key_range.clone()), len);
self.wal_ingested += len;
}
Ok(())
}
pub fn stats(&self) -> anyhow::Result<String> {
let mut s = String::new();
writeln!(s, "STATISTICS:")?;
writeln!(
s,
"WAL ingested: {:>10} MB",
self.wal_ingested / (1024 * 1024)
)?;
writeln!(
s,
"size created: {:>10} MB",
self.bytes_written / (1024 * 1024)
)?;
writeln!(
s,
"size deleted: {:>10} MB",
self.bytes_deleted / (1024 * 1024)
)?;
writeln!(s, "files created: {:>10}", self.layers_created)?;
writeln!(s, "files deleted: {:>10}", self.layers_deleted)?;
writeln!(
s,
"write amp: {:>10.2}",
self.bytes_written as f64 / self.wal_ingested as f64
)?;
writeln!(
s,
"storage amp: {:>10.2}",
(self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64
)?;
Ok(s)
}
pub fn draw_history<W: std::io::Write>(&self, output: W) -> anyhow::Result<()> {
draw::draw_history(&self.history, output)
}
}
impl Default for MockTimeline {
fn default() -> Self {
Self::new()
}
}
#[derive(Clone)]
pub enum MockLayer {
Delta(Arc<MockDeltaLayer>),
Image(Arc<MockImageLayer>),
}
impl interface::CompactionLayer<Key> for MockLayer {
fn key_range(&self) -> &Range<Key> {
match self {
MockLayer::Delta(this) => this.key_range(),
MockLayer::Image(this) => this.key_range(),
}
}
fn lsn_range(&self) -> &Range<Lsn> {
match self {
MockLayer::Delta(this) => this.lsn_range(),
MockLayer::Image(this) => this.lsn_range(),
}
}
fn file_size(&self) -> u64 {
match self {
MockLayer::Delta(this) => this.file_size(),
MockLayer::Image(this) => this.file_size(),
}
}
fn short_id(&self) -> String {
match self {
MockLayer::Delta(this) => this.short_id(),
MockLayer::Image(this) => this.short_id(),
}
}
fn is_delta(&self) -> bool {
match self {
MockLayer::Delta(_) => true,
MockLayer::Image(_) => false,
}
}
}
impl MockLayer {
fn is_deleted(&self) -> bool {
let guard = match self {
MockLayer::Delta(this) => this.deleted.lock().unwrap(),
MockLayer::Image(this) => this.deleted.lock().unwrap(),
};
*guard
}
fn mark_deleted(&self) {
let mut deleted_guard = match self {
MockLayer::Delta(this) => this.deleted.lock().unwrap(),
MockLayer::Image(this) => this.deleted.lock().unwrap(),
};
assert!(!*deleted_guard, "layer already deleted");
*deleted_guard = true;
}
}
impl From<&Arc<MockDeltaLayer>> for MockLayer {
fn from(l: &Arc<MockDeltaLayer>) -> Self {
MockLayer::Delta(l.clone())
}
}
impl From<&Arc<MockImageLayer>> for MockLayer {
fn from(l: &Arc<MockImageLayer>) -> Self {
MockLayer::Image(l.clone())
}
}
impl interface::CompactionJobExecutor for MockTimeline {
type Key = Key;
type Layer = MockLayer;
type DeltaLayer = Arc<MockDeltaLayer>;
type ImageLayer = Arc<MockImageLayer>;
type RequestContext = MockRequestContext;
async fn get_layers(
&mut self,
key_range: &Range<Self::Key>,
lsn_range: &Range<Lsn>,
_ctx: &Self::RequestContext,
) -> anyhow::Result<Vec<Self::Layer>> {
// Clear any deleted layers from our vec
self.live_layers.retain(|l| !l.is_deleted());
let layers: Vec<MockLayer> = self
.live_layers
.iter()
.filter(|l| {
overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range)
})
.cloned()
.collect();
Ok(layers)
}
async fn get_keyspace(
&mut self,
key_range: &Range<Self::Key>,
_lsn: Lsn,
_ctx: &Self::RequestContext,
) -> anyhow::Result<interface::CompactionKeySpace<Key>> {
// find it in the levels
if self.old_keyspaces.is_empty() {
Ok(crate::helpers::intersect_keyspace(
&self.keyspace,
key_range,
))
} else {
// not implemented
// The mock implementation only allows requesting the
// keyspace at the level's end LSN. That's all that the
// current implementation needs.
panic!("keyspace not available for requested lsn");
}
}
async fn downcast_delta_layer(
&self,
layer: &MockLayer,
) -> anyhow::Result<Option<Arc<MockDeltaLayer>>> {
Ok(match layer {
MockLayer::Delta(l) => Some(l.clone()),
MockLayer::Image(_) => None,
})
}
async fn create_image(
&mut self,
lsn: Lsn,
key_range: &Range<Key>,
ctx: &MockRequestContext,
) -> anyhow::Result<()> {
let keyspace = self.get_keyspace(key_range, lsn, ctx).await?;
let mut accum_size: u64 = 0;
for r in keyspace {
accum_size += r.end - r.start;
}
let new_layer = Arc::new(MockImageLayer {
key_range: key_range.clone(),
lsn_range: lsn..lsn,
file_size: accum_size * 8192,
deleted: Mutex::new(false),
});
info!(
"created image layer, size {}: {}",
new_layer.file_size,
new_layer.short_id()
);
self.live_layers.push(MockLayer::Image(new_layer.clone()));
// update stats
self.bytes_written += new_layer.file_size;
self.layers_created += 1;
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::CreateImage,
file: LayerTraceFile {
filename: new_layer.short_id(),
key_range: new_layer.key_range.clone(),
lsn_range: new_layer.lsn_range.clone(),
},
});
Ok(())
}
async fn create_delta(
&mut self,
lsn_range: &Range<Lsn>,
key_range: &Range<Key>,
input_layers: &[Arc<MockDeltaLayer>],
ctx: &MockRequestContext,
) -> anyhow::Result<()> {
let mut key_value_stream =
std::pin::pin!(merge_delta_keys::<MockTimeline>(input_layers, ctx));
let mut records: Vec<MockRecord> = Vec::new();
let mut total_len = 2;
while let Some(delta_entry) = key_value_stream.next().await {
let delta_entry: MockRecord = delta_entry?;
if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) {
total_len += delta_entry.len;
records.push(delta_entry);
}
}
let total_records = records.len();
let new_layer = Arc::new(MockDeltaLayer {
key_range: key_range.clone(),
lsn_range: lsn_range.clone(),
file_size: total_len,
records,
deleted: Mutex::new(false),
});
info!(
"created delta layer, recs {}, size {}: {}",
total_records,
total_len,
new_layer.short_id()
);
self.live_layers.push(MockLayer::Delta(new_layer.clone()));
// update stats
self.bytes_written += total_len;
self.layers_created += 1;
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::CreateDelta,
file: LayerTraceFile {
filename: new_layer.short_id(),
key_range: new_layer.key_range.clone(),
lsn_range: new_layer.lsn_range.clone(),
},
});
Ok(())
}
async fn delete_layer(
&mut self,
layer: &Self::Layer,
_ctx: &MockRequestContext,
) -> anyhow::Result<()> {
let layer = std::pin::pin!(layer);
info!("deleting layer: {}", layer.short_id());
self.num_deleted_layers += 1;
self.bytes_deleted += layer.file_size();
layer.mark_deleted();
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::Delete,
file: LayerTraceFile {
filename: layer.short_id(),
key_range: layer.key_range().clone(),
lsn_range: layer.lsn_range().clone(),
},
});
Ok(())
}
}
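A hedged usage sketch of the simulator: ingest uniformly distributed updates, let it flush and compact, then print the amplification statistics and dump the layer animation. It assumes `pageserver_compaction` is a dependency, along with `tokio` and `anyhow` for the async main; the output file name is arbitrary.

```rust
use pageserver_compaction::simulator::MockTimeline;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let mut tline = MockTimeline::new();
    tline.target_file_size = 10_000_000; // 10 MB, as in the crate's tests

    // 1000 batches of 1000 records, 100 bytes each, over a 100k-key space.
    for _ in 0..1000 {
        tline.ingest_uniform(1000, 100, &(0..100_000))?;
        tline.compact_if_needed().await?;
    }
    print!("{}", tline.stats()?);

    // Dump the layer-creation/deletion history as an interactive HTML page.
    tline.draw_history(std::fs::File::create("compaction.html")?)?;
    Ok(())
}
```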

View File

@@ -1,411 +0,0 @@
use super::Key;
use anyhow::Result;
use std::cmp::Ordering;
use std::{
collections::{BTreeMap, BTreeSet, HashSet},
fmt::Write,
ops::Range,
};
use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
use utils::lsn::Lsn;
// Map values to their compressed coordinate - the index the value
// would have in a sorted and deduplicated list of all values.
struct CoordinateMap<T: Ord + Copy> {
map: BTreeMap<T, usize>,
stretch: f32,
}
impl<T: Ord + Copy> CoordinateMap<T> {
fn new(coords: Vec<T>, stretch: f32) -> Self {
let set: BTreeSet<T> = coords.into_iter().collect();
let mut map: BTreeMap<T, usize> = BTreeMap::new();
for (i, e) in set.iter().enumerate() {
map.insert(*e, i);
}
Self { map, stretch }
}
// This assumes that the map contains an exact point for this.
// Use map_inexact for values in between
fn map(&self, val: T) -> f32 {
*self.map.get(&val).unwrap() as f32 * self.stretch
}
// the value is still assumed to be within the min/max bounds
// (this is currently unused)
fn _map_inexact(&self, val: T) -> f32 {
let prev = *self.map.range(..=val).next().unwrap().1;
let next = *self.map.range(val..).next().unwrap().1;
// interpolate
(prev as f32 + (next - prev) as f32) * self.stretch
}
fn max(&self) -> f32 {
self.map.len() as f32 * self.stretch
}
}
#[derive(PartialEq, Hash, Eq)]
pub enum LayerTraceOp {
Flush,
CreateDelta,
CreateImage,
Delete,
}
impl std::fmt::Display for LayerTraceOp {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
let op_str = match self {
LayerTraceOp::Flush => "flush",
LayerTraceOp::CreateDelta => "create_delta",
LayerTraceOp::CreateImage => "create_image",
LayerTraceOp::Delete => "delete",
};
f.write_str(op_str)
}
}
#[derive(PartialEq, Hash, Eq, Clone)]
pub struct LayerTraceFile {
pub filename: String,
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
}
impl LayerTraceFile {
fn is_image(&self) -> bool {
self.lsn_range.end == self.lsn_range.start
}
}
pub struct LayerTraceEvent {
pub time_rel: u64,
pub op: LayerTraceOp,
pub file: LayerTraceFile,
}
pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output: W) -> Result<()> {
let mut files: Vec<LayerTraceFile> = Vec::new();
for event in history {
files.push(event.file.clone());
}
let last_time_rel = history.last().unwrap().time_rel;
// Collect all coordinates
let mut keys: Vec<Key> = vec![];
let mut lsns: Vec<Lsn> = vec![];
for f in files.iter() {
keys.push(f.key_range.start);
keys.push(f.key_range.end);
lsns.push(f.lsn_range.start);
lsns.push(f.lsn_range.end);
}
// Analyze
let key_map = CoordinateMap::new(keys, 2.0);
// Stretch out vertically for better visibility
let lsn_map = CoordinateMap::new(lsns, 3.0);
let mut svg = String::new();
// Draw
writeln!(
svg,
"{}",
BeginSvg {
w: key_map.max(),
h: lsn_map.max(),
}
)?;
let lsn_max = lsn_map.max();
// Sort the files by LSN, but so that image layers go after all delta layers
// The SVG is painted in the order the elements appear, and we want to draw
// image layers on top of the delta layers if they overlap
//
// (This could also be implemented via z coordinates: image layers get one z
// coord, delta layers get another z coord.)
let mut files_sorted: Vec<LayerTraceFile> = files.into_iter().collect();
files_sorted.sort_by(|a, b| {
if a.is_image() && !b.is_image() {
Ordering::Greater
} else if !a.is_image() && b.is_image() {
Ordering::Less
} else {
a.lsn_range.end.cmp(&b.lsn_range.end)
}
});
writeln!(svg, "<!-- layers -->")?;
let mut files_seen = HashSet::new();
for f in files_sorted {
if files_seen.contains(&f) {
continue;
}
let key_start = key_map.map(f.key_range.start);
let key_end = key_map.map(f.key_range.end);
let key_diff = key_end - key_start;
if key_start >= key_end {
panic!("Invalid key range {}-{}", key_start, key_end);
}
let lsn_start = lsn_map.map(f.lsn_range.start);
let lsn_end = lsn_map.map(f.lsn_range.end);
// Fill in and thicken rectangle if it's an
// image layer so that we can see it.
let mut style = Style::default();
style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);
let y_start = lsn_max - lsn_start;
let y_end = lsn_max - lsn_end;
let x_margin = 0.25;
let y_margin = 0.5;
match f.lsn_range.start.cmp(&f.lsn_range.end) {
Ordering::Less => {
write!(
svg,
r#" <rect id="layer_{}" x="{}" y="{}" width="{}" height="{}" ry="{}" style="{}">"#,
f.filename,
key_start + x_margin,
y_end + y_margin,
key_diff - x_margin * 2.0,
y_start - y_end - y_margin * 2.0,
1.0, // border_radius,
style,
)?;
write!(svg, "<title>{}</title>", f.filename)?;
writeln!(svg, "</rect>")?;
}
Ordering::Equal => {
//lsn_diff = 0.3;
//lsn_offset = -lsn_diff / 2.0;
//margin = 0.05;
style.fill = Fill::Color(rgb(0x80, 0, 0x80));
style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0);
write!(
svg,
r#" <line id="layer_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}">"#,
f.filename,
key_start + x_margin,
y_end,
key_end - x_margin,
y_end,
style,
)?;
write!(
svg,
"<title>{}<br>{} - {}</title>",
f.filename, lsn_end, y_end
)?;
writeln!(svg, "</line>")?;
}
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
}
files_seen.insert(f);
}
let mut record_style = Style::default();
record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
record_style.stroke = Stroke::None;
writeln!(svg, "{}", EndSvg)?;
let mut layer_events_str = String::new();
let mut first = true;
for e in history {
if !first {
writeln!(layer_events_str, ",")?;
}
write!(
layer_events_str,
r#" {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#,
e.time_rel, e.file.filename, e.op
)?;
first = false;
}
writeln!(layer_events_str)?;
writeln!(
output,
r#"<!DOCTYPE html>
<html>
<head>
<style>
/* Keep the slider pinned at top */
.topbar {{
display: block;
overflow: hidden;
background-color: lightgrey;
position: fixed;
top: 0;
width: 100%;
/* width: 500px; */
}}
.slidercontainer {{
float: left;
width: 50%;
margin-right: 200px;
}}
.slider {{
float: left;
width: 100%;
}}
.legend {{
width: 200px;
float: right;
}}
/* Main content */
.main {{
margin-top: 50px; /* Add a top margin to avoid content overlay */
}}
</style>
</head>
<body onload="init()">
<script type="text/javascript">
var layer_events = [{layer_events_str}]
let ticker;
function init() {{
for (let i = 0; i < layer_events.length; i++) {{
var layer = document.getElementById("layer_" + layer_events[i].filename);
layer.style.visibility = "hidden";
}}
last_layer_event = -1;
moveSlider(last_slider_pos)
}}
function startAnimation() {{
ticker = setInterval(animateStep, 100);
}}
function stopAnimation() {{
clearInterval(ticker);
}}
function animateStep() {{
if (last_layer_event < layer_events.length - 1) {{
var slider = document.getElementById("time-slider");
let prevPos = slider.value
let nextEvent = last_layer_event + 1
while (nextEvent <= layer_events.length - 1) {{
if (layer_events[nextEvent].time_rel > prevPos) {{
break;
}}
nextEvent += 1;
}}
let nextPos = layer_events[nextEvent].time_rel
slider.value = nextPos
moveSlider(nextPos)
}}
}}
function redoLayerEvent(n, dir) {{
var layer = document.getElementById("layer_" + layer_events[n].filename);
switch (layer_events[n].op) {{
case "flush":
layer.style.visibility = "visible";
break;
case "create_delta":
layer.style.visibility = "visible";
break;
case "create_image":
layer.style.visibility = "visible";
break;
case "delete":
layer.style.visibility = "hidden";
break;
}}
}}
function undoLayerEvent(n) {{
var layer = document.getElementById("layer_" + layer_events[n].filename);
switch (layer_events[n].op) {{
case "flush":
layer.style.visibility = "hidden";
break;
case "create_delta":
layer.style.visibility = "hidden";
break;
case "create_image":
layer.style.visibility = "hidden";
break;
case "delete":
layer.style.visibility = "visible";
break;
}}
}}
var last_slider_pos = 0
var last_layer_event = 0
var moveSlider = function(new_pos) {{
if (new_pos > last_slider_pos) {{
while (last_layer_event < layer_events.length - 1) {{
if (layer_events[last_layer_event + 1].time_rel > new_pos) {{
break;
}}
last_layer_event += 1;
redoLayerEvent(last_layer_event)
}}
}}
if (new_pos < last_slider_pos) {{
while (last_layer_event >= 0) {{
if (layer_events[last_layer_event].time_rel <= new_pos) {{
break;
}}
undoLayerEvent(last_layer_event)
last_layer_event -= 1;
}}
}}
last_slider_pos = new_pos;
document.getElementById("debug_pos").textContent=new_pos;
if (last_layer_event >= 0) {{
document.getElementById("debug_layer_event").textContent=last_layer_event + " " + layer_events[last_layer_event].time_rel + " " + layer_events[last_layer_event].op;
}} else {{
document.getElementById("debug_layer_event").textContent="begin";
}}
}}
</script>
<div class="topbar">
<div class="slidercontainer">
<label for="time-slider">TIME</label>:
<input id="time-slider" class="slider" type="range" min="0" max="{last_time_rel}" value="0" oninput="moveSlider(this.value)"><br>
pos: <span id="debug_pos"></span><br>
event: <span id="debug_layer_event"></span><br>
gc: <span id="debug_gc_event"></span><br>
</div>
<button onclick="startAnimation()">Play</button>
<button onclick="stopAnimation()">Stop</button>
<svg class="legend">
<rect x=5 y=0 width=20 height=20 style="fill:rgb(128,128,128);stroke:rgb(0,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
<line x1=5 y1=30 x2=25 y2=30 style="fill:rgb(128,0,128);stroke:rgb(128,0,128);stroke-width:3;fill-opacity:1;stroke-opacity:1;"/>
<line x1=0 y1=40 x2=30 y2=40 style="fill:none;stroke:rgb(255,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
</svg>
</div>
<div class="main">
{svg}
</div>
</body>
</html>
"#
)?;
Ok(())
}
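The `CoordinateMap` above is plain coordinate compression: each distinct key or LSN is replaced by its index in the sorted, deduplicated set of values, stretched by a constant factor so sparse values still render with visible spacing. A standalone sketch with illustrative names:

```rust
use std::collections::{BTreeMap, BTreeSet};

// Map each distinct value to (its rank in sorted order) * stretch.
fn compress(values: &[u64], stretch: f32) -> BTreeMap<u64, f32> {
    let set: BTreeSet<u64> = values.iter().copied().collect();
    set.into_iter()
        .enumerate()
        .map(|(i, v)| (v, i as f32 * stretch))
        .collect()
}

fn main() {
    // Widely spread LSNs collapse to evenly spaced drawing coordinates.
    let map = compress(&[0x1000, 0x8000, 0x9000, 0x1000], 3.0);
    assert_eq!(map[&0x1000], 0.0);
    assert_eq!(map[&0x8000], 3.0);
    assert_eq!(map[&0x9000], 6.0);
}
```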

View File

@@ -1,35 +0,0 @@
use pageserver_compaction::interface::CompactionLayer;
use pageserver_compaction::simulator::MockTimeline;
/// Test the extreme case that there are so many updates for a single key that
/// even if we produce an extremely narrow delta layer, spanning just that one
/// key, we still have too many records to fit in the target file size. We need to
/// split in the LSN dimension too in that case.
///
/// TODO: The code to avoid this problem has not been implemented yet! So the
/// assertion currently fails, but we need to make it not fail.
#[ignore]
#[tokio::test]
async fn test_many_updates_for_single_key() {
let mut executor = MockTimeline::new();
executor.target_file_size = 10_000_000; // 10 MB
// Ingest 100 MB of updates to a single key.
for _ in 1..1000 {
executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
executor.compact().await.unwrap();
}
// Check that all the layers are smaller than the target size (with some slop)
for l in executor.live_layers.iter() {
println!("layer {}: {}", l.short_id(), l.file_size());
}
for l in executor.live_layers.iter() {
assert!(l.file_size() < executor.target_file_size * 2);
// sanity check that none of the delta layers are stupidly small either
if l.is_delta() {
assert!(l.file_size() > executor.target_file_size / 2);
}
}
}

View File

@@ -12,7 +12,7 @@ use std::collections::BinaryHeap;
use std::ops::Range;
use std::{fs, str};
use pageserver::page_cache::{self, PAGE_SZ};
use pageserver::page_cache::PAGE_SZ;
use pageserver::repository::{Key, KEY_SIZE};
use pageserver::tenant::block_io::FileBlockReader;
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
@@ -100,15 +100,13 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH
async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
let file = VirtualFile::open(path).await?;
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader.read_blk(0, ctx).await?;
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0, ctx).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,
actual_summary.index_root_blk,
block_reader,
file,
);
// min-heap (reserve space for one more element added before eviction)
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
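The min-heap comment above describes a standard "keep the N largest" pattern: push every candidate, and once the heap exceeds the cap, evict the smallest. Below is a self-contained sketch using `Reverse` as the min-heap adaptor (the real code relies on `Hole`'s own ordering); the names are illustrative.

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

// Keep only the n largest values seen, in O(len * log n).
fn largest_n(values: impl IntoIterator<Item = u64>, n: usize) -> Vec<u64> {
    // Reserve space for one extra element pushed before eviction.
    let mut heap: BinaryHeap<Reverse<u64>> = BinaryHeap::with_capacity(n + 1);
    for v in values {
        heap.push(Reverse(v));
        if heap.len() > n {
            heap.pop(); // evict the current smallest
        }
    }
    heap.into_iter().map(|Reverse(v)| v).collect()
}

fn main() {
    let mut top = largest_n([5, 1, 9, 3, 7], 3);
    top.sort_unstable();
    assert_eq!(top, vec![5, 7, 9]);
}
```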

Some files were not shown because too many files have changed in this diff.