Compare commits


97 Commits

Author SHA1 Message Date
Heikki Linnakangas
1ca72e8f9a Add test for replay of HEAP_LOCK VM updates.
see https://github.com/neondatabase/neon/pull/4896
2023-09-13 12:03:49 +03:00
Konstantin Knizhnik
1697e7b319 Fix lfc_ensure_opened which now disables LFC (#5294)
## Problem

There was a bug in `lfc_ensure_opened` that actually disabled the LFC

## Summary of changes

Return `true` if the LFC file is opened normally


Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2023-09-13 08:56:03 +03:00
bojanserafimov
8556d94740 proxy http: reproduce issue with transactions in pool (#5293)
xfail test reproducing issue https://github.com/neondatabase/neon/issues/4698
2023-09-12 17:13:25 -04:00
MMeent
3b6b847d76 Fixes for Pg16: (#5292)
- pagestore_smgr.c had unnecessary WALSync() (see #5287)
- Compute node dockerfile didn't build the neon_rmgr extension
- Add PostgreSQL 16 image to docker-compose tests
- Fix issue with high CPU usage in Safekeeper due to a bug in WALSender

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2023-09-12 22:02:03 +03:00
Alexander Bayandin
2641ff3d1a Use CI_ACCESS_TOKEN to create release PR (#5286)
## Problem

If @github-actions creates a release PR, the CI pipeline is not triggered
(but we have a `release-notify.yml` workflow that we expect to run on this
event).
I suspect this happens because @github-actions is not a repository
member.

Ref
https://github.com/neondatabase/neon/pull/5283#issuecomment-1715209291

## Summary of changes
- Use `CI_ACCESS_TOKEN` to create a PR
- Use `gh` instead of `thomaseizinger/create-pull-request`
- Restrict permissions for GITHUB_TOKEN to `contents: write` only
(required for `git push`)
2023-09-12 20:01:21 +01:00
Alexander Bayandin
e1661c3c3c approved-for-ci-run.yml: fix ci-run/pr-* branch deletion (#5278)
## Problem

`ci-run/pr-*` branches (and attached PRs) should be deleted
automatically when their parent PRs get closed, but they are not.

## Summary of changes
- Fix if-condition
2023-09-12 19:29:26 +03:00
Alexander Bayandin
9c3f38e10f Document how to run CI for external contributors (#5279)
## Problem
We don't have this instruction written anywhere but in internal Slack

## Summary of changes
- Add `How to run a CI pipeline on Pull Requests from external
contributors` section to `CONTRIBUTING.md`

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2023-09-12 16:53:13 +01:00
Christian Schwarz
ab1f37e908 revert recent VirtualFile asyncification changes (#5291)
Motivation
==========

We observed two "indigestion" events on staging, each shortly after
restarting `pageserver-0.eu-west-1.aws.neon.build`. It has ~8k tenants.

The indigestion manifests as `Timeline::get` calls failing with
`exceeded evict iter limit` .
The error is from `page_cache.rs`; it was unable to find a free page and
hence failed with the error.

The indigestion events started occurring after we started deploying
builds that contained the following commits:

```
[~/src/neon]: git log --oneline c0ed362790caa368aa65ba57d352a2f1562fd6bf..15eaf78083ecff62b7669091da1a1c8b4f60ebf8
15eaf7808 Disallow block_in_place and Handle::block_on (#5101)
a18d6d9ae Make File opening in VirtualFile async-compatible (#5280)
76cc87398 Use tokio locks in VirtualFile and turn with_file into macro (#5247)
```

The second and third commit are interesting.
They add .await points to the VirtualFile code.

Background
==========

On the read path, which is the dominant user of page cache & VirtualFile
during pageserver restart, `Timeline::get` `page_cache` and VirtualFile
interact as follows:

1. Timeline::get tries to read from a layer
2. This read goes through the page cache.
3. If we have a page miss (which is known to be common after restart),
page_cache uses `find_victim` to find an empty slot, and once it has
found a slot, it gives exclusive ownership of it to the caller through a
`PageWriteGuard`.
4. The caller is supposed to fill the write guard with data from the
underlying backing store, i.e., the layer `VirtualFile`.
5. So, we call into `VirtualFile::read_at` to fill the write guard.

The `find_victim` method finds an empty slot using a basic
implementation of the clock page replacement algorithm.
Slots that are currently in use (`PageReadGuard` / `PageWriteGuard`)
cannot become victims.
If there have been too many iterations, `find_victim` gives up with
error `exceeded evict iter limit`.
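
For illustration, here is a minimal sketch of that clock-style victim search with an iteration cap. The names (`Slot`, `find_victim`, the field layout) are simplified stand-ins, not the actual `page_cache.rs` code:

```rust
// Illustrative sketch of clock page replacement with an iteration limit;
// structure and names are simplified and do not mirror page_cache.rs exactly.
struct Slot {
    pinned: bool,      // held by a PageReadGuard / PageWriteGuard
    referenced: bool,  // clock "second chance" bit
}

fn find_victim(
    slots: &mut [Slot],
    clock_hand: &mut usize,
    max_iters: usize,
) -> Result<usize, &'static str> {
    for _ in 0..max_iters {
        let i = *clock_hand % slots.len();
        *clock_hand += 1;
        let slot = &mut slots[i];
        if slot.pinned {
            continue; // in-use slots can never become victims
        }
        if slot.referenced {
            slot.referenced = false; // give it a second chance
            continue;
        }
        return Ok(i); // free slot found; caller gets exclusive ownership of it
    }
    Err("exceeded evict iter limit")
}
```

With many tasks pinning slots via write guards, every remaining task burns the full `max_iters` budget and fails, which is exactly the wasted work described below.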

Root Cause For Indigestion
==========================

The second and third commit quoted in the "Motivation" section
introduced `.await` points in the VirtualFile code.
These enable tokio to preempt us and schedule another future __while__
we hold the `PageWriteGuard` and are calling `VirtualFile::read_at`.
This was not possible before these commits, because there simply were no
await points that weren't Poll::Ready immediately.
With the offending commits, there is now actual usage of
`tokio::sync::RwLock` to protect the VirtualFile file descriptor cache.
And we __know__ from other experiments that, during the post-restart
"rush", the VirtualFile fd cache __is__ too small, i.e., all slots are
taken by _ongoing_ VirtualFile operations and cannot be victims.
So, assume that VirtualFile's `find_victim_slot`'s
`RwLock::write().await` calls _will_ yield control to the executor.

The above can lead to a pathological situation if we have N runnable
tokio tasks, each wanting to do `Timeline::get`, but only M slots, with
N >> M.
Suppose M of the N tasks win a PageWriteGuard and get preempted at some
.await point inside `VirtualFile::read_at`.
Now suppose tokio schedules the remaining N-M tasks for fairness, then
schedules the first M tasks again.
Each of the N-M tasks will run `find_victim()` until it hits the
`exceeded evict iter limit`.
Why? Because the first M tasks took all the slots and are still holding
them tight through their `PageWriteGuard`.

The result is massive wastage of CPU time in `find_victim()`.
The effort to find a page is futile, but each of the N-M tasks still
attempts it.

This delays the time when tokio gets around to schedule the first M
tasks again.
Eventually, tokio will schedule them, they will make progress, fill the
`PageWriteGuard`, release it.
But in the meantime, the N-M tasks have already bailed with error
`exceeded evict iter limit`.

Eventually, higher level mechanisms will retry for the N-M tasks, and
this time, there won't be as many concurrent tasks wanting to do
`Timeline::get`.
So, it will shake out.

But, it's a massive indigestion until then.

This PR
=======

This PR reverts the offending commits until we find a proper fix.

```
    Revert "Use tokio locks in VirtualFile and turn with_file into macro (#5247)"
    
    This reverts commit 76cc87398c.


    Revert "Make File opening in VirtualFile async-compatible (#5280)"
    
    This reverts commit a18d6d9ae3.
```
2023-09-12 17:38:31 +02:00
MMeent
83e7e5dbbd Feat/postgres 16 (#4761)
This adds PostgreSQL 16 as a vendored postgresql version, and adapts the
code to support this version. 
The important changes to PostgreSQL 16 compared to the PostgreSQL 15
changeset include the addition of a neon_rmgr instead of altering Postgres's
original WAL format.

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2023-09-12 15:11:32 +02:00
Christian Schwarz
5be8d38a63 fix deadlock around TENANTS (#5285)
The sequence that can lead to a deadlock:

1. DELETE request gets all the way to `tenant.shutdown(progress,
false).await.is_err() ` , while holding TENANTS.read()
2. POST request for tenant creation comes in, calls `tenant_map_insert`,
it does `let mut guard = TENANTS.write().await;`
3. Something that `tenant.shutdown()` needs to wait for needs a
`TENANTS.read().await`.
The only case identified in exhaustive manual scanning of the code base
is this one:
Imitate size access does `get_tenant().await`, which does
`TENANTS.read().await` under the hood.

In the above case (1) waits for (3), (3)'s read-lock request is queued
behind (2)'s write-lock, and (2) waits for (1).
Deadlock.
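
For illustration, a minimal tokio sketch of the same read, queued-write, read pattern (hypothetical names, not the pageserver code). Because tokio's `RwLock` is fair (write-preferring), the second read request is queued behind the waiting writer, and the three steps wait on each other:

```rust
// Minimal sketch of the read -> queued write -> read deadlock pattern.
// Names are hypothetical; this can hang by design, mirroring the bug above.
use std::sync::Arc;
use tokio::sync::RwLock;

async fn deadlock_demo(tenants: Arc<RwLock<()>>) {
    // (1) DELETE: holds a read lock, then awaits something that itself
    //     needs another read lock on the same RwLock.
    let t1 = {
        let tenants = tenants.clone();
        tokio::spawn(async move {
            let _read = tenants.read().await; // (1) read lock held
            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
            let _inner = tenants.read().await; // (3) queued behind the writer
        })
    };
    // (2) CREATE: a write lock request gets queued while (1) still holds its read lock.
    let t2 = {
        let tenants = tenants.clone();
        tokio::spawn(async move {
            let _write = tenants.write().await; // (2) waits for (1) to release
        })
    };
    let _ = tokio::join!(t1, t2); // with a fair, write-preferring lock this hangs
}
```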

I made a reproducer/proof-that-above-hypothesis-holds in
https://github.com/neondatabase/neon/pull/5281, but it's not ready for
merge yet and we want the fix _now_.

fixes https://github.com/neondatabase/neon/issues/5284
2023-09-12 11:23:46 +02:00
John Spray
36c261851f s3_scrubber: remove atty dependency (#5171)
## Problem

- https://github.com/neondatabase/neon/security/dependabot/28

## Summary of changes

Remove atty, and remove the `with_ansi` arg to scrubber's stdout logger.
2023-09-12 10:11:41 +01:00
Arpad Müller
15eaf78083 Disallow block_in_place and Handle::block_on (#5101)
## Problem

`block_in_place` is a quite expensive operation, and if it is used, we
should explicitly have to opt into it by allowing the
`clippy::disallowed_methods` lint.

For more, see
https://github.com/neondatabase/neon/pull/5023#discussion_r1304194495.

Similar arguments exist for `Handle::block_on`, but we don't do this yet
as there is still usages.

## Summary of changes

Adds a clippy.toml file, configuring the [`disallowed_methods`
lint](https://rust-lang.github.io/rust-clippy/master/#/disallowed_method).
2023-09-12 00:11:16 +00:00
Arpad Müller
a18d6d9ae3 Make File opening in VirtualFile async-compatible (#5280)
## Problem

Previously, we were using `observe_closure_duration` in `VirtualFile`
file opening code, but this doesn't support async open operations, which
we want to use as part of #4743.

## Summary of changes

* Move the duration measurement from the `with_file` macro into a
`observe_duration` macro.
* Some smaller drive-by fixes to replace the old strings with the new
variant names introduced by #5273

Part of #4743, follow-up of #5247.
2023-09-11 18:41:08 +02:00
Arpad Müller
76cc87398c Use tokio locks in VirtualFile and turn with_file into macro (#5247)
## Problem

For #4743, we want to convert everything up to the actual I/O operations
of `VirtualFile` to `async fn`.

## Summary of changes

This PR is the last change in a series of changes to `VirtualFile`:
#5189, #5190, #5195, #5203, and #5224.

It does the last preparations before the I/O operations are actually
made async. We are doing the following things:

* First, we change the locks for the file descriptor cache to tokio's
locks that support Send. This is important when one wants to hold locks
across await points (which we want to do), otherwise the Future won't be
Send. Also, one shouldn't generally block in async code as executors
don't like that.
* Due to the lock change, we now take an approach for the `VirtualFile`
destructors similar to the one proposed by #5122 for the page cache, to
use `try_write`. Similarly to the situation in the linked PR, one can
make an argument that if we are in the destructor and the slot has not
been reused yet, we are the only user accessing the slot due to owning
the lock mutably. It is still possible that we are not obtaining the
lock, but the only cause for that is the clock algorithm touching the
slot, which should be quite an unlikely occurrence. For the instance of
`try_write` failing, we spawn an async task to destroy the lock. As just
argued however, most of the time the code path where we spawn the task
should not be visited.
* Lastly, we split `with_file` into a macro part, and a function part
that contains most of the logic. The function part returns a lock
object, that the macro uses. The macro exists to perform the operation
in a more compact fashion, saving code from putting the lock into a
variable and then doing the operation while measuring the time to run
it. We take the locks approach because Rust has no support for async
closures. One can make normal closures return a future, but that
approach gets into lifetime issues the moment you want to pass data to
these closures via parameters that have a lifetime (captures work). For
details, see
[this](https://smallcultfollowing.com/babysteps/blog/2023/03/29/thoughts-on-async-closures/)
and
[this](https://users.rust-lang.org/t/function-that-takes-an-async-closure/61663)
link. In #5224, we ran into a similar problem with the `test_files`
function, and we ended up passing the path and the `OpenOptions`
by-value instead of by-ref, at the expense of a few extra copies. This
can be done as the data is cheaply copyable, and we are in test code.
But here, we are not, and while `File::try_clone` exists, it [issues
system calls
internally](1e746d7741/library/std/src/os/fd/owned.rs (L94-L111)).
Also, it would allocate an entirely new file descriptor, something that
the fd cache was built to prevent.
* We change the `STORAGE_IO_TIME` metrics to support async.

Part of #4743.
2023-09-11 17:35:05 +02:00
bojanserafimov
c0ed362790 Measure pageserver wal recovery time and fix flush() method (#5240) 2023-09-11 09:46:06 -04:00
duguorong009
d7fa2dba2d fix(pageserver): update the STORAGE_IO_TIME metrics to avoid expensive operations (#5273)
Introduce the `StorageIoOperation` enum, `StorageIoTime` struct, and
`STORAGE_IO_TIME_METRIC` static which provides lockless access to
histograms consumed by `VirtualFile`.
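
A minimal sketch of the enum-indexed, lock-free lookup idea; the real code records into prometheus-style histograms, whereas this illustration uses a plain atomic counter, and all names are approximate:

```rust
// Sketch only: a static metric container indexed directly by an operation
// enum, so the hot path does no locking or label-string lookups.
use std::sync::atomic::{AtomicU64, Ordering};

#[derive(Clone, Copy)]
enum StorageIoOperation {
    Open = 0,
    Read = 1,
    Write = 2,
    Fsync = 3,
}

struct StorageIoTime {
    // One counter per operation; the real struct holds histograms instead.
    nanos_per_op: [AtomicU64; 4],
}

impl StorageIoTime {
    const fn new() -> Self {
        Self {
            nanos_per_op: [
                AtomicU64::new(0),
                AtomicU64::new(0),
                AtomicU64::new(0),
                AtomicU64::new(0),
            ],
        }
    }

    fn record(&self, op: StorageIoOperation, nanos: u64) {
        // Direct array indexing by the enum discriminant; no lock taken.
        self.nanos_per_op[op as usize].fetch_add(nanos, Ordering::Relaxed);
    }
}

static STORAGE_IO_TIME_METRIC: StorageIoTime = StorageIoTime::new();
```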

Closes #5131

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-09-11 14:58:15 +03:00
Joonas Koivunen
a55a78a453 Misc test flakiness fixes (#5233)
Assorted flakiness fixes from #5198; they might not be flaky on `main`.

Migrate some tests from neon_simple_env to neon_env_builder and use
initial_tenant to make the flakiness easier to understand. (Did not
understand the flakiness of
`test_timeline_create_break_after_uninit_mark`.)

`test_download_remote_layers_api` is flaky because we have no atomic
"wait for WAL, checkpoint, wait for upload and do not receive any more
WAL".

`test_tenant_size` fixes are just boilerplate which should have always
existed; we should wait for the tenant to be active. Similarly for
`test_timeline_delete`.

`test_timeline_size_post_checkpoint` fails often for me with reading
zero from metrics. Give it a few attempts.
2023-09-11 11:42:49 +03:00
Rahul Modpur
999fe668e7 Ack tenant detach before local files are deleted (#5211)
## Problem

Detaching a tenant can involve many thousands of local filesystem
metadata writes, but the control plane would benefit from us not
blocking detach/delete responses on these.

## Summary of changes

After renaming the local tenant directory, ack the tenant detach and
delete the tenant directory in the background

#5183 

---------

Signed-off-by: Rahul Modpur <rmodpur2@gmail.com>
2023-09-10 22:59:51 +03:00
Alexander Bayandin
d33e1b1b24 approved-for-ci-run.yml: use token to checkout the repo (#5266)
## Problem

Another thing I overlooked regarding `approved-for-ci-run`:
- When we create a PR, the action is associated with @vipvap and this
triggers the pipeline — this is good.
- When we update the PR by force-pushing to the branch, the action is
associated with @github-actions, which doesn't trigger a pipeline — this
is bad.

Initially spotted in #5239 / #5211
([link](https://github.com/neondatabase/neon/actions/runs/6122249456/job/16633919558?pr=5239))
— `check-permissions` should not fail.


## Summary of changes
- Use `CI_ACCESS_TOKEN` to check out the repo (I expect this token will
be reused in the following `git push`)
2023-09-10 20:12:38 +01:00
Alexander Bayandin
15fd188fd6 Fix GitHub Autocomment for ci-run/prs (#5268)
## Problem

When a `ci-run/pr-*` PR is created, the GitHub Autocomment with test
results is supposed to be posted to the original PR; currently, this
doesn't work.

I created this PR from a personal fork to debug and fix the issue. 

## Summary of changes
- `scripts/comment-test-report.js`: use `pull_request.head` instead of
`pull_request.base` 🤦
2023-09-10 20:06:10 +01:00
Alexander Bayandin
34e39645c4 GitHub Workflows: add actionlint (#5265)
## Problem

Add a CI pipeline that checks GitHub Workflows with
https://github.com/rhysd/actionlint (it uses `shellcheck` for shell
scripts in steps)

To run it locally: `SHELLCHECK_OPTS=--exclude=SC2046,SC2086 actionlint`

## Summary of changes
- Add `.github/workflows/actionlint.yml`
- Fix actionlint warnings
2023-09-10 20:05:07 +01:00
Em Sharnoff
1cac923af8 vm-monitor: Rate-limit upscale requests (#5263)
Some VMs, when already scaled up as much as possible, end up spamming
the autoscaler-agent with upscale requests that will never be fulfilled.
If postgres is using memory greater than the cgroup's memory.high, it
can emit new memory.high events 1000 times per second, which... just
means unnecessary load on the rest of the system.

This changes the vm-monitor so that we skip sending upscale requests if
we already sent one within the last second, to avoid spamming the
autoscaler-agent. This matches the previous behavior that the vm-informant
had.
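
A minimal sketch of the "at most one upscale request per second" idea described above; the type and field names are illustrative, not the vm-monitor's actual code:

```rust
// Illustrative rate limiter: allow an upscale request only if none was
// sent within the last second.
use std::time::{Duration, Instant};

struct UpscaleRequestLimiter {
    last_sent: Option<Instant>,
    min_interval: Duration,
}

impl UpscaleRequestLimiter {
    fn new() -> Self {
        Self { last_sent: None, min_interval: Duration::from_secs(1) }
    }

    /// Returns true if an upscale request may be sent now, recording the send time.
    fn allow(&mut self) -> bool {
        let now = Instant::now();
        match self.last_sent {
            Some(prev) if now.duration_since(prev) < self.min_interval => false,
            _ => {
                self.last_sent = Some(now);
                true
            }
        }
    }
}
```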
2023-09-10 20:33:53 +03:00
Em Sharnoff
853552dcb4 vm-monitor: Don't include Args in top-level span (#5264)
It makes the logs too verbose.

ref https://neondb.slack.com/archives/C03F5SM1N02/p1694281232874719?thread_ts=1694272777.207109&cid=C03F5SM1N02
2023-09-10 20:15:53 +03:00
Alexander Bayandin
1ea93af56c Create GitHub release from release tag (#5246)
## Problem

This PR creates a GitHub release from a release tag with an
autogenerated changelog: https://github.com/neondatabase/neon/releases

## Summary of changes
- Call GitHub API to create a release
2023-09-09 22:02:28 +01:00
Konstantin Knizhnik
f64b338ce3 Ignore DISK_FULL error when performing availability check for client (#5010)
See #5001

No space is what's expected if we're at the size limit.
Of course, if the SK incorrectly returned "no space", the availability check
wouldn't fire.
But users would notice such a bug quite soon anyway.
So ignoring "no space" is the right trade-off.



---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-09-09 21:51:04 +03:00
Konstantin Knizhnik
ba06ea26bb Fix issues with re-enabling LFC (#5209)
refer #5208

## Problem

See
https://neondb.slack.com/archives/C03H1K0PGKH/p1693938336062439?thread_ts=1693928260.704799&cid=C03H1K0PGKH

#5208 disables the LFC forever in case of an error. This is not good
because the problem causing the error (for example, ENOSPC) can be
resolved, and it would be nice to re-enable the LFC after it is fixed.

Also, #5208 disables the LFC locally in one backend, but other backends
may still see corrupted data.
This should not cause problems right now with the "permission denied"
error, because there should be no backend that is able to open the LFC
normally.
But in the case of an out-of-disk-space error, other backends can read
corrupted data.

## Summary of changes

1. Clean up the hash table after an error to prevent access to stale or
corrupted data
2. Perform disk writes under an exclusive lock (hoping this will not affect
performance, because a write usually just copies data from user space to
system space)
3. Use generations to prevent access to stale data in lfc_read


---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2023-09-09 17:51:16 +03:00
Joonas Koivunen
6f28da1737 fix: LocalFs root in test_compatibility is PosixPath('...') (#5261)
I forgot a `str(...)` conversion in #5243. This led to log lines such
as:

```
Using fs root 'PosixPath('/tmp/test_output/test_backward_compatibility[debug-pg14]/compatibility_snapshot/repo/local_fs_remote_storage/pageserver')' as a remote storage
```

This surprisingly works, creating a hierarchy under the current working
directory (`repo_dir` for tests):
- `PosixPath('`
  - `tmp` .. up until .. `local_fs_remote_storage`
    - `pageserver')`

It should not work, but right now the test_compatibility.py tests find local
metadata and layers, which end up being used. After #5172, when remote
storage is the source of truth, it will no longer work.
2023-09-08 20:27:00 +03:00
Heikki Linnakangas
60050212e1 Update rdkit to version 2023_03_03. (#5260)
It includes PostgreSQL 16 support.
2023-09-08 19:40:29 +03:00
Joonas Koivunen
66633ef2a9 rust-toolchain: use 1.72.0, same as CI (#5256)
Switches everyone without a `rustup override` to 1.72.0.

Code changes required already done in #5255.
Depends on https://github.com/neondatabase/build/pull/65.
2023-09-08 19:36:02 +03:00
Alexander Bayandin
028fbae161 Miscellaneous fixes for tests-related things (#5259)
## Problem

A bunch of fixes for different test-related things 

## Summary of changes
- Fix test_runner/pg_clients (`subprocess_capture` return value has
changed)
- Do not run create-test-report if check-permissions failed for not
cancelled jobs
- Fix Code Coverage comment layout after flaky tests. Add another
healing "\n"
- test_compatibility: add an instruction for local run


Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-09-08 16:28:09 +01:00
John Spray
7b6337db58 tests: enable multiple pageservers in neon_local and neon_fixture (#5231)
## Problem

Currently our testing environment only supports running a single
pageserver at a time. This is insufficient for testing failover and
migrations.
- Dependency of writing tests for #5207 

## Summary of changes

- `neon_local` and `neon_fixture` now handle multiple pageservers
- This is a breaking change to the `.neon/config` format: any local
environments will need recreating
- Existing tests continue to work unchanged:
  - The default number of pageservers is 1
- `NeonEnv.pageserver` is now a helper property that retrieves the first
pageserver if there is only one, else throws.
- Pageserver data directories are now at `.neon/pageserver_{n}` where n
is 1,2,3...
- Compatibility tests get some special casing to migrate neon_local
configs: these are not meant to be backward/forward compatible, but they
were treated that way by the test.
2023-09-08 16:19:57 +01:00
Konstantin Knizhnik
499d0707d2 Perform throttling for concurrent build index which is done outside transaction (#5048)
See 
https://neondb.slack.com/archives/C03H1K0PGKH/p1692550646191429

## Problem

Building an index concurrently writes WAL outside a transaction.
`backpressure_throttling_impl` doesn't perform throttling for read-only
transactions (no assigned XID).
This causes a huge write lag, which can cause a large delay in accessing
the table.

## Summary of changes

Look at `PROC_IN_SAFE_IC` in the process state, which is set during a
concurrent index build.
 

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2023-09-08 18:05:08 +03:00
Joonas Koivunen
720d59737a rust-1.72.0 changes (#5255)
Prepare to upgrade rust version to latest stable.

- `rustfmt` has learned to format `let irrefutable = $expr else { ...
};` blocks
- There's a new warning about the virtual (workspace) crate resolver; picked
the latest resolver, as I suspect everyone would expect it to be the
latest. It should not matter anyway
- Some new clippies, which seem alright
2023-09-08 16:28:41 +03:00
Joonas Koivunen
ff87fc569d test: Remote storage refactorings (#5243)
Remote storage cleanup split from #5198:
- pageserver, extensions, and safekeepers now have their separate remote
storage
- RemoteStorageKind has the configuration code
- S3Storage has the cleanup code
- with MOCK_S3, pageserver, extensions, safekeepers use different
buckets
- with LOCAL_FS, `repo_dir / "local_fs_remote_storage" / $user` is used
as path, where $user is `pageserver`, `safekeeper`
- no more `NeonEnvBuilder.enable_xxx_remote_storage` but one
`enable_{pageserver,extensions,safekeeper}_remote_storage`

Should not have any real changes. These will allow us to default to
`LOCAL_FS` for pageserver on the next PR, remove
`RemoteStorageKind.NOOP`, work towards #5172.

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2023-09-08 13:54:23 +03:00
Heikki Linnakangas
cdc65c1857 Update pg_cron to version 1.6.0 (#5252)
This includes PostgreSQL 16 support. There are no catalog changes, so
this is a drop-in replacement, no need to run "ALTER EXTENSION UPDATE".
2023-09-08 12:42:46 +03:00
Heikki Linnakangas
dac995e7e9 Update plpgsql_check extension to version v2.4.0 (#5249)
This brings v16 support.
2023-09-08 10:46:02 +03:00
Alexander Bayandin
b80740bf9f test_startup: increase timeout (#5238)
## Problem

`test_runner/performance/test_startup.py::test_startup` started to fail
more frequently because of the timeout.
Let's increase the timeout to see the failures on the perf dashboard.

## Summary of changes
- Increase timeout for `test_startup` from 600 to 900 seconds
2023-09-08 01:57:38 +01:00
Heikki Linnakangas
57c1ea49b3 Update hypopg extension to version 1.4.0 (#5245)
The v1.4.0 includes changes to make it compile with PostgreSQL 16. The
commit log doesn't call it out explicitly, but I tested it manually.

v1.4.0 includes some new functions, but I tested manually that the
v1.3.1 functionality works with the v1.4.0 version of the library. That
means that this doesn't break existing installations. Users can do
"ALTER EXTENSION hypopg UPDATE" if they want to use the new v1.4.0
functionality, but they don't have to.
2023-09-08 03:30:11 +03:00
Heikki Linnakangas
6c31a2d342 Upgrade prefix extension to version 1.2.10 (#5244)
This version includes trivial changes to make it compile with PostgreSQL
16. No functional changes.
2023-09-08 02:10:01 +03:00
Heikki Linnakangas
252b953f18 Upgrade postgresql-hll to version 2.18. (#5241)
This includes PostgreSQL 16 support. No other changes, really.

The extension version in the upstream was changed from 2.17 to 2.18,
however, there is no difference between the catalog objects. So if you
had installed 2.17 previously, it will continue to work. You can run
"ALTER EXTENSION hll UPDATE", but all it will do is update the version
number in the pg_extension table.
2023-09-08 02:07:17 +03:00
Heikki Linnakangas
b414360afb Upgrade ip4r to version 2.4.2 (#5242)
Includes PostgreSQL v16 support. No functional changes.
2023-09-08 02:06:53 +03:00
Arpad Müller
d206655a63 Make VirtualFile::{open, open_with_options, create,sync_all,with_file} async fn (#5224)
## Problem

Once we use async file system APIs for `VirtualFile`, these functions
will also need to be async fn.

## Summary of changes

Makes the functions `open, open_with_options, create, sync_all, with_file`
of `VirtualFile` async fn, including all functions that call it. Like in
the prior PRs, the actual I/O operations are not using async APIs yet,
as per request in the #4743 epic.

We switch towards not using `VirtualFile` in the par_fsync module;
hopefully this is only temporary until we can actually do fully async
I/O in `VirtualFile`. This might cause us to exhaust fd limits in the
tests, but it should only be an issue for the local developer as we have
high ulimits in prod.

This PR is a follow-up of #5189, #5190, #5195, and #5203. Part of #4743.
2023-09-08 00:50:50 +02:00
Heikki Linnakangas
e5adc4efb9 Upgrade h3-pg to version 4.1.3. (#5237)
This includes v16 support.
2023-09-07 21:39:12 +03:00
Heikki Linnakangas
c202f0ba10 Update PostGIS to version 3.3.3 (#5236)
It's a good idea to keep up-to-date in general. One noteworthy change is
that PostGIS 3.3.3 adds support for PostgreSQL v16. We'll need that.

PostGIS 3.4.0 has already been released, and we should consider
upgrading to that. However, it's a major upgrade and requires running
"SELECT postgis_extensions_upgrade();" in each database, to upgrade the
catalogs. I don't want to deal with that right now.
2023-09-07 21:38:55 +03:00
Alexander Bayandin
d15563f93b Misc workflows: fix quotes in bash (#5235) 2023-09-07 19:39:42 +03:00
Rahul Modpur
485a2cfdd3 Fix pg_config version parsing (#5200)
## Problem
Fix pg_config version parsing

## Summary of changes
Use regex to capture major version of postgres
#5146
2023-09-07 15:34:22 +02:00
Alexander Bayandin
1fee69371b Update plv8 to 3.1.8 (#5230)
## Problem

We likely need this to support Postgres 16
It's also been asked by a user
https://github.com/neondatabase/neon/discussions/5042

The latest version is 3.2.0, but it requires some changes in the build
script (which I haven't checked, but it didn't work right away)

## Summary of changes
```
3.1.8       2023-08-01
            - force v8 to compile in release mode

3.1.7       2023-06-26
            - fix byteoffset issue with arraybuffers
            - support postgres 16 beta

3.1.6       2023-04-08
            - fix crash issue on fetch apply
            - fix interrupt issue
```
From https://github.com/plv8/plv8/blob/v3.1.8/Changes
2023-09-07 14:21:38 +01:00
Alexander Bayandin
f8a91e792c Even better handling of approved-for-ci-run label (#5227)
## Problem

We've got `approved-for-ci-run` to work 🎉 
But it's still a bit rough, this PR should improve the UX for external
contributors.

## Summary of changes
- `build_and_test.yml`: add a `check-permissions` job, which fails if the PR
is created from a fork. Make all jobs in the workflow dependent on
`check-permissions` to fail fast
- `approved-for-ci-run.yml`: add `cleanup` job to close `ci-run/pr-*`
PRs and delete linked branches when the parent PR is closed
- `approved-for-ci-run.yml`: fix the layout for the `ci-run/pr-*` PR
description
- GitHub Autocomment: add a comment with tests result to the original PR
(instead of a PR from `ci-run/pr-*` )
2023-09-07 14:21:01 +01:00
duguorong009
706977fb77 fix(pageserver): add the walreceiver state to tenant timeline GET api endpoint (#5196)
Add a `walreceiver_state` field to `TimelineInfo` (response of `GET /v1/tenant/:tenant_id/timeline/:timeline_id`) and while doing that, refactor out a common `Timeline::walreceiver_state(..)`. No OpenAPI changes, because this is an internal debugging addition.

Fixes #3115.

Co-authored-by: Joonas Koivunen <joonas.koivunen@gmail.com>
2023-09-07 14:17:18 +03:00
Arpad Müller
7ba0f5c08d Improve comment in page cache (#5220)
It was easy to interpret the comment in the page cache initialization code
as being about justifying why we leak here at all, not just why this
specific type of leaking is done (which is what the comment was actually
meant to describe).

See
https://github.com/neondatabase/neon/pull/5125#discussion_r1308445993

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-09-06 21:44:54 +02:00
Arpad Müller
6243b44dea Remove Virtual from FileBlockReaderVirtual variant name (#5225)
With #5181, the generics for `FileBlockReader` have been removed, so
having a `Virtual` postfix makes less sense now.
2023-09-06 20:54:57 +02:00
Joonas Koivunen
3a966852aa doc: tests expect lsof (#5226)
On a clean system `lsof` needs to be installed. Add it to the list just
to keep things nice and copy-pasteable (except for poetry).
2023-09-06 21:40:00 +03:00
duguorong009
31e1568dee refactor(pageserver): refactor pageserver router state creation (#5165)
Fixes #3894 by:
- Refactor the pageserver router creation flow
- Create the router state in `pageserver/src/bin/pageserver.rs`
2023-09-06 21:31:49 +03:00
Chengpeng Yan
9a9187b81a Complete the missing metrics for files_created/bytes_written (#5120) 2023-09-06 14:00:15 -04:00
Chengpeng Yan
dfe2e5159a remove the duplicate entries in postgresql.conf (#5090) 2023-09-06 13:57:03 -04:00
Alexander Bayandin
e4b1d6b30a Misc post-merge fixes (#5219)
## Problem
- `SCALE: unbound variable` from
https://github.com/neondatabase/neon/pull/5079
- The layout of the GitHub auto-comment is broken if the code coverage
section follows flaky test section from
https://github.com/neondatabase/neon/pull/4999

## Summary of changes
- `benchmarking.yml`: Rename `SCALE` to `TEST_OLAP_SCALE` 
- `comment-test-report.js`: Add an extra new-line before Code coverage
section
2023-09-06 20:11:44 +03:00
Alexander Bayandin
76a96b0745 Notify Slack channel about upcoming releases (#5197)
## Problem

When the next release is coming, we want to let everyone know about it by
posting a message to the Slack channel with a list of commits.

## Summary of changes
- `.github/workflows/release-notify.yml` is added
- the workflow sends a message to
`vars.SLACK_UPCOMING_RELEASE_CHANNEL_ID` (or
[#test-release-notifications](https://neondb.slack.com/archives/C05QQ9J1BRC)
if not configured)
- On each PR update, the workflow updates the list of commits in the
message (it doesn't send additional messages)
2023-09-06 17:52:21 +01:00
Arpad Müller
5e00c44169 Add WriteBlobWriter buffering and make VirtualFile::{write,write_all} async (#5203)
## Problem

We want to convert the `VirtualFile` APIs to async fn so that we can
adopt one of the async I/O solutions.

## Summary of changes

This PR is a follow-up of #5189, #5190, and #5195, and does the
following:

* Move the used `Write` trait functions of `VirtualFile` into inherent
functions
* Add optional buffering to `WriteBlobWriter`. The buffer is discarded
on drop, similarly to how tokio's `BufWriter` does it: drop is neither
async nor does it support errors.
* Remove the generics over the `Write` impl of `WriteBlobWriter`, always
using `VirtualFile`
* Rename `WriteBlobWriter` to `BlobWriter`
* Make various functions in the write path async, like
`VirtualFile::{write,write_all}`.

Part of #4743.
2023-09-06 18:17:12 +02:00
Alexander Bayandin
d5f1858f78 approved-for-ci-run.yml: use different tokens (#5218)
## Problem

`CI_ACCESS_TOKEN` has quite limited access (which is good), but this
doesn't allow it to remove labels from PRs (which is bad)

## Summary of changes
- Use `GITHUB_TOKEN` to remove labels
- Use `CI_ACCESS_TOKEN` to create PRs
2023-09-06 18:50:59 +03:00
John Spray
61d661a6c3 pageserver: generation number fetch on startup and use in /attach (#5163)
## Problem

- #5050 

Closes: https://github.com/neondatabase/neon/issues/5136

## Summary of changes

- A new configuration property `control_plane_api` controls other
functionality in this PR: if it is unset (default) then everything still
works as it does today.
- If `control_plane_api` is set, then on startup we call out to control
plane `/re-attach` endpoint to discover our attachments and their
generations. If an attachment is missing from the response we implicitly
detach the tenant.
- Calls to pageserver `/attach` API may include a `generation`
parameter. If `control_plane_api` is set, then this parameter is
mandatory.
- RemoteTimelineClient's loading of index_part.json is generation-aware,
and will try to load the index_part with the most recent generation <=
its own generation.
- The `neon_local` testing environment now includes a new binary
`attachment_service` which implements the endpoints that the pageserver
requires to operate. This is on by default if running `cargo neon` by
hand. In `test_runner/` tests, it is off by default: existing tests
continue to run with in the legacy generation-less mode.

Caveats:
- The re-attachment during startup assumes that we are only re-attaching
tenants that have previously been attached, and not totally new tenants
-- this relies on the control plane's attachment logic to keep retrying
so that we should eventually see the attach API call. That's important
because the `/re-attach` API doesn't tell us which timelines we should
attach -- we still use local disk state for that. Ref:
https://github.com/neondatabase/neon/issues/5173
- Testing: generations are only enabled for one integration test right
now (test_pageserver_restart), as a smoke test that all the machinery
basically works. Writing fuller tests that stress tenant migration will
come later, and involve extending our test fixtures to deal with
multiple pageservers.
- I'm not in love with "attachment_service" as a name for the neon_local
component, but it's not very important because we can easily rename
these test bits whenever we want.
- Limited observability when in re-attach on startup: when I add
generation validation for deletions in a later PR, I want to wrap up the
control plane API calls in some small client class that will expose
metrics for things like errors calling the control plane API, which will
act as a strong red signal that something is not right.

Co-authored-by: Christian Schwarz <christian@neon.tech>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-09-06 14:44:48 +01:00
Alexander Bayandin
da60f69909 approved-for-ci-run.yml: use our bot (#5216)
## Problem

Pull Requests created by the GitHub Actions bot don't have access to
secrets, so we need to use our bot to auto-trigger a test run

See previous PRs  #4663, #5210, #5212

## Summary of changes
- Use our bot to create PRs
2023-09-06 14:55:11 +03:00
John Spray
743933176e scrubber: add scan-metadata and hook into integration tests (#5176)
## Problem

- Scrubber's `tidy` command requires presence of a control plane
- Scrubber has no tests at all 

## Summary of changes

- Add re-usable async streams for reading metadata from a bucket
- Add a `scan-metadata` command that reads from those streams and calls
existing `checks.rs` code to validate metadata, then returns a summary
struct for the bucket. Command returns nonzero status if errors are
found.
- Add an `enable_scrub_on_exit()` function to NeonEnvBuilder so that
tests using remote storage can request to have the scrubber run after
they finish
- Enable remote storage and scrub_on_exit in test_pageserver_restart
and test_pageserver_chaos

This is a "toe in the water" of the overall space of validating the
scrubber. Later, we should:
- Enable scrubbing at end of tests using remote storage by default
- Make the success condition stricter than "no errors": tests should
declare what tenants+timelines they expect to see in the bucket (or
sniff these from the functions tests use to create them) and we should
require that the scrubber reports on these particular tenants/timelines.

The `tidy` command is untouched in this PR, but it should be refactored
later to use similar async streaming interface instead of the current
batch-reading approach (the streams are faster with large buckets), and
to also be covered by some tests.


---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
Co-authored-by: Conrad Ludgate <conrad@neon.tech>
2023-09-06 11:55:24 +01:00
Alexander Bayandin
8e25d3e79e test_runner: add scale parameter to tpc-h tests (#5079)
## Problem

It's hard to find out which DB size we use for OLAP benchmarks (TPC-H in
particular).
This PR adds handling of the `TEST_OLAP_SCALE` env var, which gets added
to the test name as a parameter.

This is required for performing larger periodic benchmarks. 

## Summary of changes
- Handle `TEST_OLAP_SCALE` in
`test_runner/performance/test_perf_olap.py`
- Set `TEST_OLAP_SCALE` in `.github/workflows/benchmarking.yml` to a
TPC-H scale
2023-09-06 13:22:57 +03:00
duguorong009
4fec48f2b5 chore(pageserver): remove unnecessary logging in tenant task loops (#5188)
Fixes #3830 by adding the `#[cfg(not(feature = "testing"))]` attribute
to unnecessary log statements in `pageserver/src/tenant/tasks.rs`.

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-09-06 13:19:19 +03:00
Vadim Kharitonov
88b1ac48bd Create Release PR at 7:00 UTC every Tuesday (#5213) 2023-09-06 13:17:52 +03:00
Alexander Bayandin
15ff4e5fd1 approved-for-ci-run.yml: trigger on pull_request_target (#5212)
## Problem

Continuation of #4663, #5210

We're still getting an error:
```
GraphQL: Resource not accessible by integration (removeLabelsFromLabelable)
```

## Summary of changes
- trigger `approved-for-ci-run.yml` workflow on `pull_request_target`
instead of `pull_request`
2023-09-06 13:14:07 +03:00
Alexander Bayandin
dbfb4ea7b8 Make CI more friendly for external contributors. Second try (#5210)
## Problem

`approved-for-ci-run` label logic doesn't work as expected:
- https://github.com/neondatabase/neon/pull/4722#issuecomment-1636742145
- https://github.com/neondatabase/neon/pull/4722#issuecomment-1636755394

Continuation of https://github.com/neondatabase/neon/pull/4663
Closes #2222 (hopefully)

## Summary of changes
- Create a twin PR automatically
- Allow `GITHUB_TOKEN` to manipulate labels
2023-09-06 10:06:55 +01:00
Alexander Bayandin
c222320a2a Generate lcov coverage report (#4999)
## Problem

We want to display coverage information for each PR.

- an example of a full coverage report:
https://neon-github-public-dev.s3.amazonaws.com/code-coverage/abea64800fb390c32a3efe6795d53d8621115c83/lcov/index.html
- an example of GitHub auto-comment with coverage information:
https://github.com/neondatabase/neon/pull/4999#issuecomment-1679344658

## Summary of changes
- Use
patched[*](426e7e7a22)
lcov to generate coverage report
- Upload HTML coverage report to S3
- `scripts/comment-test-report.js`: add coverage information
2023-09-06 00:48:15 +01:00
MMeent
89c64e179e Fix corruption issue in Local File Cache (#5208)
Fix issue where updating the size of the Local File Cache could lead to
invalid reads:

## Problem

The LFC can get re-enabled when lfc_max_size is set, e.g. through an
autoscaler configuration, or by PostgreSQL not liking us setting the
variable.

1. initialize: LFC enabled (lfc_size_limit > 0; lfc_desc = 0)
2. Open LFC file fails, lfc_desc = -1. lfc_size_limit is set to 0;
3. lfc_size_limit is updated by autoscaling to >0
4. read() now thinks LFC is enabled (size_limit > 0) and lfc_desc is
valid, but doesn't try to read from the invalid file handle and thus
doesn't update the buffer content with the page's data, but does think
the data was read...
Any buffer we try to read from local file cache is essentially
uninitialized memory. Those are likely 0-bytes, but might potentially be
any old buffer that was previously read from or flushed to disk.

Fix this by adding a more definitive disable flag, plus better invalid state handling.
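
As a rough illustration of that guard pattern (written in Rust, although the real LFC lives in the C extension; all names here are hypothetical):

```rust
// Sketch: an explicit `enabled` flag is the definitive signal, so a
// configuration reload that raises size_limit cannot re-enable reads
// through an invalid file handle.
struct LocalFileCache {
    enabled: bool,               // cleared on open/IO failure, never inferred
    size_limit: u64,             // may be flipped back to > 0 by config reloads
    fd: Option<std::fs::File>,   // backing cache file, if successfully opened
}

impl LocalFileCache {
    fn try_read(&mut self, buf: &mut [u8], off: u64) -> Option<usize> {
        // size_limit alone is not a safe signal, which is the bug described above.
        if !self.enabled || self.size_limit == 0 {
            return None;
        }
        let file = self.fd.as_ref()?;
        use std::os::unix::fs::FileExt;
        match file.read_at(buf, off) {
            Ok(n) => Some(n),
            Err(_) => {
                // On error, disable definitively so a later size_limit change
                // cannot silently "re-enable" reads from a bad handle.
                self.enabled = false;
                None
            }
        }
    }
}
```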
2023-09-05 20:00:47 +02:00
Alexander Bayandin
7ceddadb37 Merge custom extension CI jobs (#5194)
## Problem

When a remote custom extension build fails, it looks a bit confusing on
neon CI:
- `trigger-custom-extensions-build` is green
- `wait-for-extensions-build` is red
- `build-and-upload-extensions` is red

But to restart the build (to get everything green), you need to restart
the only passed `trigger-custom-extensions-build`.

## Summary of changes
- Merge `trigger-custom-extensions-build` and
`wait-for-extensions-build` jobs into
`trigger-custom-extensions-build-and-wait`
2023-09-05 14:02:37 +01:00
Arpad Müller
4904613aaa Convert VirtualFile::{seek,metadata} to async (#5195)
## Problem

We want to convert the `VirtualFile` APIs to async fn so that we can
adopt one of the async I/O solutions.

## Summary of changes

Convert the following APIs of `VirtualFile` to async fn (as well as all
of the APIs calling it):

* `VirtualFile::seek`
* `VirtualFile::metadata`
* Also, prepare for deletion of the write impl by writing the summary to
a buffer before writing it to disk, as suggested in
https://github.com/neondatabase/neon/issues/4743#issuecomment-1700663864
. This change adds an additional warning for the case when the summary
exceeds a block. Previously, we'd have silently corrupted data in this
(unlikely) case.
* `WriteBlobWriter::write_blob`, in preparation for making
`VirtualFile::write_all` async.
2023-09-05 12:55:45 +02:00
Nikita Kalyanov
77658a155b support deploying in IPv6-only environments (#4135)
A set of changes to enable neon to work in IPv6 environments. The
changes are backward-compatible but allow deploying neon even in
IPv6-only environments:
- bind to both IPv4 and IPv6 interfaces
- allow connections to Postgres from IPv6 interface
- parse the address from control plane that could also be IPv6
2023-09-05 12:45:46 +03:00
Arpad Müller
128a85ba5e Convert many VirtualFile APIs to async (#5190)
## Problem

`VirtualFile` does both reading and writing, and it would be nice if
both could be converted to async, so that it doesn't have to support an
async read path and a blocking write path (especially for the locks this
is annoying as none of the lock implementations in std, tokio or
parking_lot have support for both async and blocking access).

## Summary of changes

This PR is some initial work on making the `VirtualFile` APIs async. It
can be reviewed commit-by-commit.

* Introduce the `MaybeVirtualFile` enum to be generic in a test that
compares real files with virtual files.
* Make various APIs of `VirtualFile` async, including `write_all_at`,
`read_at`, `read_exact_at`.

Part of #4743 , successor of #5180.

Co-authored-by: Christian Schwarz <me@cschwarz.com>
2023-09-04 17:05:20 +02:00
Arpad Müller
6cd497bb44 Make VirtualFile::crashsafe_overwrite async fn (#5189)
## Problem

The `VirtualFile::crashsafe_overwrite` function was introduced by #5186
but it was not turned `async fn` yet. We want to make these functions
async fn as part of #4743.

## Summary of changes

Make `VirtualFile::crashsafe_overwrite` async fn, as well as all the
functions calling it. Don't make anything inside `crashsafe_overwrite`
use async functionalities, as per #4743 instructions.

Also, add rustdoc to `crashsafe_overwrite`.

Part of #4743.
2023-09-04 12:52:35 +02:00
John Spray
80f10d5ced pageserver: safe deletion for tenant directories (#5182)
## Problem

If a pageserver crashes partway through deleting a tenant's directory,
it might leave a partial state that confuses a subsequent
startup/attach.

## Summary of changes

Rename tenant directory to a temporary path before deleting.

Timeline deletions already have deletion markers to provide safety.

In future, it would be nice to exploit this to send responses to detach
requests earlier: https://github.com/neondatabase/neon/issues/5183
2023-09-04 08:31:55 +01:00
Christian Schwarz
7e817789d5 VirtualFile: add crash-safe overwrite abstraction & use it (#5186)
(part of #4743)
(preliminary to #5180)
 
This PR adds a special-purpose API to `VirtualFile` for write-once
files.
It adopts it for `save_metadata` and `persist_tenant_conf`.
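
As a rough sketch of what a crash-safe overwrite of a small write-once file involves (write a temp file, fsync it, rename over the target, fsync the parent directory); the actual `VirtualFile` API differs in signature and goes through the fd cache:

```rust
// Illustrative crash-safe overwrite, assuming a POSIX filesystem.
use std::fs::{self, File, OpenOptions};
use std::io::Write;
use std::path::Path;

fn crashsafe_overwrite(final_path: &Path, content: &[u8]) -> std::io::Result<()> {
    let tmp_path = final_path.with_extension("tmp");
    let parent = final_path.parent().expect("path must have a parent directory");

    let mut tmp = OpenOptions::new()
        .write(true)
        .create(true)
        .truncate(true)
        .open(&tmp_path)?;
    tmp.write_all(content)?;
    tmp.sync_all()?; // make the temp file's data and metadata durable
    drop(tmp);

    fs::rename(&tmp_path, final_path)?; // atomic replace of the target
    File::open(parent)?.sync_all()?;    // make the rename itself durable
    Ok(())
}
```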

This is helpful for the asyncification efforts (#4743) and specifically
asyncification of `VirtualFile` because above two functions were the
only ones that needed the VirtualFile to be an `std::io::Write`.
(There was also `manifest.rs` that needed the `std::io::Write`, but it
isn't used right now, and likely won't be used because we're taking a
different route for crash consistency, see #5172. So, let's remove it.
It'll be in Git history if we need to re-introduce it when picking up
the compaction work again; that's why it was introduced in the first
place).

We can't remove the `impl std::io::Write for VirtualFile` just yet
because of the `BufWriter` in

```rust
struct DeltaLayerWriterInner {
...
    blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
}
```

But, @arpad-m and I have a plan to get rid of that by extracting the
append-only-ness-on-top-of-VirtualFile that #4994 added to
`EphemeralFile` into an abstraction that can be re-used in the
`DeltaLayerWriterInner` and `ImageLayerWriterInner`.
That'll be another PR.


### Performance Impact

This PR adds more fsyncs compared to before because we fsync the parent
directory every time.

1. For `save_metadata`, the additional fsyncs are unnecessary because we
know that `metadata` fits into a kernel page, and hence the write won't
be torn on the way into the kernel. However, the `metadata` file in
general is going to lose significance very soon (=> see #5172), and the
NVMes in prod can definitely handle the additional fsync. So, let's not
worry about it.
2. For `persist_tenant_conf`, which we don't check to fit into a single
kernel page, this PR makes it actually crash-consistent. Before, we
could crash while writing out the tenant conf, leaving a prefix of the
tenant conf on disk.
2023-09-02 10:06:14 +02:00
John Spray
41aa627ec0 tests: get test name automatically for remote storage (#5184)
## Problem

Tests using remote storage have manually entered `test_name` parameters,
which:
- Are easy to accidentally duplicate when copying code to make a new
test
- Omit parameters, so don't actually create unique S3 buckets when
running many tests concurrently.

## Summary of changes

- Use the `request` fixture in neon_env_builder fixture to get the test
name, then munge that into an S3 compatible bucket name.
- Remove the explicit `test_name` parameters to enable_remote_storage
2023-09-01 17:29:38 +01:00
Conrad Ludgate
44da9c38e0 proxy: error typo (#5187)
## Problem

https://github.com/neondatabase/neon/pull/5162#discussion_r1311853491
2023-09-01 19:21:33 +03:00
Christian Schwarz
cfc0fb573d pageserver: run all Rust tests with remote storage enabled (#5164)
For
[#5086](https://github.com/neondatabase/neon/pull/5086#issuecomment-1701331777)
we will require remote storage to be configured in pageserver.

This PR enables `localfs`-based storage for all Rust unit tests.

Changes:

- In `TenantHarness`, set up localfs remote storage for the tenant.
- `create_test_timeline` should mimic what real timeline creation does,
and real timeline creation waits for the timeline to reach remote
storage. With this PR, `create_test_timeline` now does that as well.
- All the places that create the harness tenant twice need to shut down
the tenant before the re-create through a second call to `try_load` or
`load`.
- Without shutting down, upload tasks initiated by/through the first
incarnation of the harness tenant might still be ongoing when the second
incarnation of the harness tenant is `try_load`/`load`ed. That doesn't
make sense in the tests that do that, they generally try to set up a
scenario similar to pageserver stop & start.
- There was one test that recreates a timeline, not the tenant. For that
case, I needed to create a `Timeline::shutdown` method. It's a
refactoring of the existing `Tenant::shutdown` method.
- The remote_timeline_client tests previously set up their own
`GenericRemoteStorage` and `RemoteTimelineClient`. Now they re-use the
one that's pre-created by the TenantHarness. Some adjustments to the
assertions were needed because the assertions now need to account for
the initial image layer that's created by `create_test_timeline` to be
present.
2023-09-01 18:10:40 +02:00
Christian Schwarz
aa22000e67 FileBlockReader<File> is never used (#5181)
part of #4743

preliminary to #5180
2023-09-01 17:30:22 +02:00
Christian Schwarz
5edae96a83 rfc: Crash-Consistent Layer Map Updates By Leveraging index_part.json (#5086)
This RFC describes a simple scheme to make layer map updates crash
consistent by leveraging the index_part.json in remote storage. Without
such a mechanism, crashes can induce certain edge cases in which broadly
held assumptions about system invariants don't hold.
2023-09-01 15:24:58 +02:00
Christian Schwarz
40ce520c07 remote_timeline_client: tests: run upload ops on the tokio::test runtime (#5177)
The `remote_timeline_client` tests use `#[tokio::test]` and rely on the
fact that the test runtime that is set up by this macro is
single-threaded.

In PR https://github.com/neondatabase/neon/pull/5164, we observed
interesting flakiness with the `upload_scheduling` test case:
it would observe the upload of the third layer (`layer_file_name_3`)
before we did `wait_completion`.

Under the single-threaded-runtime assumption, that wouldn't be possible,
because the test code doesn't await inbetween scheduling the upload
and calling `wait_completion`.

However, RemoteTimelineClient was actually using `BACKGROUND_RUNTIME`.
That means there was parallelism where the tests didn't expect it,
leading to flakiness such as execution of an UploadOp task before
the test calls `wait_completion`.

The most confusing scenario is code like this:

```
schedule_upload(A);
wait_completion.await; // B
schedule_upload(C);
wait_completion.await; // D
```

On a single-threaded executor, it is guaranteed that the upload of C
doesn't run before D, because we (the test) don't relinquish control
to the executor before D's `await` point.

However, RemoteTimelineClient actually scheduled onto the
BACKGROUND_RUNTIME, so, `A` could start running before `B` and
`C` could start running before `D`.

This would cause flaky tests when making assertions about the state
manipulated by the operations. The concrete issue that led to the discovery
of this bug was an assertion about `remote_fs_dir` state in #5164.
2023-09-01 16:24:04 +03:00
Alexander Bayandin
e9f2c64322 Wait for custom extensions build before deploy (#5170)
## Problem

Currently, the `deploy` job doesn't wait for the custom extension job
(in another repo) and can be started even with a failed extension build.
This PR adds another job that polls the status of the extension build job
and fails if the extension build fails.

## Summary of changes
- Add `wait-for-extensions-build` job, which waits for a custom
extension build in another repo.
2023-09-01 12:59:19 +01:00
John Spray
715077ab5b tests: broaden a log allow regex in test_ignored_tenant_stays_broken_without_metadata (#5168)
## Problem

- https://github.com/neondatabase/neon/issues/5167

## Summary of changes

Accept "will not become active" log line with _either_ Broken or
Stopping state, because we may hit it while in the process of doing the
`/ignore` (earlier in the test than the test expects to see the same
line with Broken)
2023-09-01 08:36:38 +01:00
John Spray
616e7046c7 s3_scrubber: import into the main neon repository (#5141)
## Problem

The S3 scrubber currently lives at
https://github.com/neondatabase/s3-scrubber

We don't have tests that use it, and it has copies of some data
structures that can get stale.

## Summary of changes

- Import the s3-scrubber as `s3_scrubber/`
- Replace copied_definitions/ in the scrubber with direct access to the
`utils` and `pageserver` crates
- Modify visibility of a few definitions in `pageserver` to allow the
scrubber to use them
- Update scrubber code for recent changes to `IndexPart`
- Update `KNOWN_VERSIONS` for IndexPart and move the definition into
index.rs so that it is easier to keep up to date

As a future refinement, it would be good to pull the remote persistence
types (like IndexPart) out of `pageserver` into a separate library so
that the scrubber doesn't have to link against the whole pageserver, and
so that it's clearer which types need to be public.

Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2023-08-31 19:01:39 +01:00
Conrad Ludgate
1b916a105a proxy: locked is not retriable (#5162)
## Problem

Management service returns Locked when quotas are exhausted. We cannot
retry on those

## Summary of changes

Makes Locked status unretriable
2023-08-31 15:50:15 +03:00
Conrad Ludgate
d11621d904 Proxy: proxy protocol v2 (#5028)
## Problem

We need to log the client IP, not the IP of the NLB.

## Summary of changes

Parse the proxy [protocol version
2](https://www.haproxy.org/download/1.8/doc/proxy-protocol.txt) if
possible
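
For reference, a hedged sketch of detecting a PROXY protocol v2 header on a fresh connection, based on the linked spec; the proxy's real parser also has to read the address block and TLVs:

```rust
// The 12-byte PROXY protocol v2 signature from the spec.
const PROXY_V2_SIGNATURE: [u8; 12] =
    [0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A];

/// Returns the total header length (16 bytes + payload) if `buf` starts with
/// a v2 header, or None so the caller can fall back to plain handling.
fn proxy_v2_header_len(buf: &[u8]) -> Option<usize> {
    if buf.len() < 16 || buf[..12] != PROXY_V2_SIGNATURE {
        return None;
    }
    // Byte 12: version (high nibble must be 2) and command nibble.
    if buf[12] >> 4 != 2 {
        return None;
    }
    // Bytes 14..16: big-endian length of the address + TLV section that follows.
    let len = u16::from_be_bytes([buf[14], buf[15]]) as usize;
    Some(16 + len)
}
```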
2023-08-31 14:30:25 +03:00
John Spray
43bb8bfdbb pageserver: fix flake in test_timeline_deletion_with_files_stuck_in_upload_queue (#5149)
## Problem

Test failing on a different ERROR log than it anticipated.

Closes: https://github.com/neondatabase/neon/issues/5148

## Summary of changes

Add the "could not flush frozen layer" error log to the permitted
errors.
2023-08-31 10:42:32 +01:00
John Spray
300a5aa05e pageserver: fix test v4_indexpart_is_parsed (#5157)
## Problem

Two recent PRs raced:
- https://github.com/neondatabase/neon/pull/5153
- https://github.com/neondatabase/neon/pull/5140

## Summary of changes

Add missing `generation` argument to IndexLayerMetadata construction
2023-08-31 10:40:46 +01:00
Nikita Kalyanov
b9c111962f pass JWT to management API (#5151)
support authentication with JWT from env for proxy calls to mgmt API
2023-08-31 12:23:51 +03:00
John Spray
83ae2bd82c pageserver: generation number support in keys and indices (#5140)
## Problem

To implement split brain protection, we need tenants and timelines to be
aware of their current generation, and use it when composing S3 keys.


## Summary of changes

- A `Generation` type is introduced in the `utils` crate -- it is in
this broadly-visible location because it will later be used from
`control_plane/` as well as `pageserver/`. Generations can be a number,
None, or Broken, to support legacy content (None), and Tenants in the
broken state (Broken).
- Tenant, Timeline, and RemoteTimelineClient all get a generation
attribute
- IndexPart's IndexLayerMetadata has a new `generation` attribute.
Legacy layers' metadata will deserialize to Generation::none().
- Remote paths are composed with a trailing generation suffix. If a
generation is equal to Generation::none() (as it currently always is),
then this suffix is an empty string.
- Functions for composing remote storage paths added in
remote_timeline_client: these avoid the way that we currently always
compose a local path and then strip the prefix, and avoid requiring a
PageserverConf reference on functions that want to create remote paths
(the conf is only needed for local paths). These are less DRY than the
old functions, but remote storage paths are a very rarely changing
thing, so it's better to write out our paths clearly in the functions
than to compose timeline paths from tenant paths, etc.
- Code paths that construct a Tenant take a `generation` argument in
anticipation that we will soon load generations on startup before
constructing Tenant.

Until the whole feature is done, we don't want any generation-ful keys
though: so initially we will carry this everywhere with the special
Generation::none() value.
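
A hedged sketch of that scheme (not the actual `utils::Generation` API; the
suffix format here is an assumption made for illustration):

```rust
// Sketch only: a generation is a number, None (legacy content), or Broken,
// and the key suffix is empty for Generation::none() so legacy paths keep working.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Generation {
    Broken,
    None,
    Valid(u32),
}

impl Generation {
    fn none() -> Self {
        Generation::None
    }

    // Suffix appended to remote object keys; the exact format is an assumption.
    fn suffix(&self) -> String {
        match self {
            Generation::Valid(n) => format!("-{n:08x}"),
            // Legacy (None) keys keep the un-suffixed path; Broken is treated the
            // same way here purely for the sake of the sketch.
            Generation::None | Generation::Broken => String::new(),
        }
    }
}

fn index_part_key(timeline_prefix: &str, generation: Generation) -> String {
    format!("{timeline_prefix}/index_part.json{}", generation.suffix())
}
```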

Closes: https://github.com/neondatabase/neon/issues/5135

Co-authored-by: Christian Schwarz <christian@neon.tech>
2023-08-31 09:19:34 +01:00
Alexey Kondratov
f2c21447ce [compute_ctl] Create check availability data during full configuration (#5084)
I moved this to the API handler in 589cf1ed2 to avoid delaying compute
start. Yet, we now skip full configuration and catalog updates on the
hottest path -- waking up a suspended compute -- and only do them at:

- first start
- start with applying new configuration
- start for availability check

so it doesn't really matter anymore.

The problem with creating the table and test record in the API handler
is that someone can fill up the timeline to its logical size limit. The
compute then gets suspended, an availability check is scheduled, and the
check fails.

If the table + test row are created at the very beginning, we reserve an
8 KB page for future checks, which should theoretically last almost forever.
For example, my ~1-year-old branch still has an 8 KB test table:
```sql
cloud_admin@postgres=# select pg_relation_size('health_check');
 pg_relation_size
------------------
             8192
(1 row)
```
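
A hedged sketch of the first-start reservation, assuming the `postgres`
crate; only the `health_check` table name comes from the example above, the
column names are hypothetical.

```rust
// Sketch only: create the table and its single test row once, at first start,
// so the 8 KB page is already reserved when later availability checks run.
fn prepare_availability_check(client: &mut postgres::Client) -> Result<(), postgres::Error> {
    client.batch_execute(
        "CREATE TABLE IF NOT EXISTS health_check (id int4 PRIMARY KEY, updated_at timestamptz);
         INSERT INTO health_check (id, updated_at) VALUES (1, now())
         ON CONFLICT (id) DO UPDATE SET updated_at = now();",
    )
}
```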

---------

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
2023-08-30 17:44:28 +02:00
Conrad Ludgate
93dcdb293a proxy: password hack hack (#5126)
## Problem

fixes #4881 

## Summary of changes
2023-08-30 16:20:27 +01:00
John Spray
a93274b389 pageserver: remove vestigial timeline_layers attribute (#5153)
## Problem

`timeline_layers` was write-only since
b95addddd5

We deployed the version that no longer requires it for deserializing, so
now we can stop including it when serializing.

## Summary of changes

Fully remove `timeline_layers`.
2023-08-30 16:14:04 +01:00
Anastasia Lubennikova
a7c0e4dcd0 Check if custiom extension is enabled.
This check was lost in the latest refactoring.

If the extension is not present in 'public_extensions' or 'custom_extensions', don't download it
2023-08-30 17:47:06 +03:00
Conrad Ludgate
3b81e0c86d chore: remove webpki (#5069)
## Problem

webpki is unmaintained

Closes https://github.com/neondatabase/neon/security/dependabot/33

## Summary of changes

Update all dependents of webpki.
2023-08-30 15:14:03 +01:00
Anastasia Lubennikova
e5a397cf96 Form archive_path for remote extensions on the fly 2023-08-30 13:56:51 +03:00
194 changed files with 11634 additions and 5863 deletions

View File

@@ -14,10 +14,12 @@
!pgxn/
!proxy/
!safekeeper/
!s3_scrubber/
!storage_broker/
!trace/
!vendor/postgres-v14/
!vendor/postgres-v15/
!vendor/postgres-v16/
!workspace_hack/
!neon_local/
!scripts/ninstall.sh

8
.github/actionlint.yml vendored Normal file
View File

@@ -0,0 +1,8 @@
self-hosted-runner:
labels:
- gen3
- large
- small
- us-east-2
config-variables:
- SLACK_UPCOMING_RELEASE_CHANNEL_ID

View File

@@ -70,6 +70,9 @@ runs:
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
prefix: latest
# The lack of compatibility snapshot (for example, for the new Postgres version)
# shouldn't fail the whole job. Only relevant test should fail.
skip-if-does-not-exist: true
- name: Checkout
if: inputs.needs_postgres_source == 'true'

31
.github/workflows/actionlint.yml vendored Normal file
View File

@@ -0,0 +1,31 @@
name: Lint GitHub Workflows
on:
push:
branches:
- main
- release
paths:
- '.github/workflows/*.ya?ml'
pull_request:
paths:
- '.github/workflows/*.ya?ml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
jobs:
actionlint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: reviewdog/action-actionlint@v1
env:
# SC2046 - Quote this to prevent word splitting. - https://www.shellcheck.net/wiki/SC2046
# SC2086 - Double quote to prevent globbing and word splitting. - https://www.shellcheck.net/wiki/SC2086
SHELLCHECK_OPTS: --exclude=SC2046,SC2086
with:
fail_on_error: true
filter_mode: nofilter
level: error

View File

@@ -2,7 +2,9 @@ name: Handle `approved-for-ci-run` label
# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
on:
pull_request:
pull_request_target:
branches:
- main
types:
# Default types that triggers a workflow ([1]):
# - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
@@ -17,39 +19,83 @@ on:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
BRANCH: "ci-run/pr-${{ github.event.pull_request.number }}"
permissions: write-all
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
jobs:
remove-label:
# Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
# The PR should be reviewed and labelled manually again.
runs-on: [ ubuntu-latest ]
if: |
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
runs-on: ubuntu-latest
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
create-branch:
# Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
runs-on: [ ubuntu-latest ]
create-or-update-pr-for-ci-run:
# Create local PR for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
if: |
github.event.action == 'labeled' &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
runs-on: ubuntu-latest
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
- uses: actions/checkout@v3
with:
ref: main
token: ${{ secrets.CI_ACCESS_TOKEN }}
- run: gh pr checkout "${PR_NUMBER}"
- run: git checkout -b "ci-run/pr-${PR_NUMBER}"
- run: git checkout -b "${BRANCH}"
- run: git push --force origin "ci-run/pr-${PR_NUMBER}"
- run: git push --force origin "${BRANCH}"
- name: Create a Pull Request for CI run (if required)
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
This Pull Request is created automatically to run the CI pipeline for #${PR_NUMBER}
Please do not alter or merge/close it.
Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
EOF
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${HEAD} --base main --json number --jq '.[].number')"
if [ -z "${ALREADY_CREATED}" ]; then
gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
--body-file "body.md" \
--head "${BRANCH}" \
--base "main" \
--draft
fi
cleanup:
# Close PRs and delete branches if the original PR is closed.
if: |
github.event.action == 'closed' &&
github.event.pull_request.head.repo.full_name != github.repository
runs-on: ubuntu-latest
steps:
- run: |
CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${HEAD} --json 'closed' --jq '.[].closed')"
if [ "${CLOSED}" == "false" ]; then
gh pr --repo "${GITHUB_REPOSITORY}" close "${BRANCH}" --delete-branch
fi

View File

@@ -117,6 +117,7 @@ jobs:
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
tpch-compare-matrix: ${{ steps.tpch-compare-matrix.outputs.matrix }}
steps:
- name: Generate matrix for pgbench benchmark
@@ -136,11 +137,11 @@ jobs:
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]')
fi
echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
- name: Generate matrix for OLAP benchmarks
id: olap-compare-matrix
@@ -152,11 +153,30 @@ jobs:
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres" },
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
{ "platform": "rds-aurora" }]')
fi
echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
- name: Generate matrix for TPC-H benchmarks
id: tpch-compare-matrix
run: |
matrix='{
"platform": [
"neon-captest-reuse"
],
"scale": [
"10"
]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
{ "platform": "rds-aurora", "scale": "10" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
pgbench-compare:
needs: [ generate-matrices ]
@@ -233,7 +253,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
psql ${CONNSTR} -c "SELECT version();"
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
psql ${CONNSTR} -c "${QUERY}"
- name: Benchmark init
uses: ./.github/actions/run-python-test-set
@@ -358,7 +382,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
psql ${CONNSTR} -c "SELECT version();"
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
psql ${CONNSTR} -c "${QUERY}"
- name: ClickBench benchmark
uses: ./.github/actions/run-python-test-set
@@ -372,6 +400,7 @@ jobs:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
TEST_OLAP_SCALE: 10
- name: Create Allure report
if: ${{ !cancelled() }}
@@ -398,7 +427,7 @@ jobs:
strategy:
fail-fast: false
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
matrix: ${{ fromJson(needs.generate-matrices.outputs.tpch-compare-matrix) }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
@@ -407,6 +436,7 @@ jobs:
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.platform }}
TEST_OLAP_SCALE: ${{ matrix.scale }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
@@ -428,18 +458,17 @@ jobs:
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Set up Connection String
id: set-up-connstr
- name: Get Connstring Secret Name
run: |
case "${PLATFORM}" in
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }}
ENV_PLATFORM=CAPTEST_TPCH
;;
rds-aurora)
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR }}
ENV_PLATFORM=RDS_AURORA_TPCH
;;
rds-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }}
ENV_PLATFORM=RDS_AURORA_TPCH
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
@@ -447,9 +476,21 @@ jobs:
;;
esac
CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR"
echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV
- name: Set up Connection String
id: set-up-connstr
run: |
CONNSTR=${{ secrets[env.CONNSTR_SECRET_NAME] }}
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
psql ${CONNSTR} -c "SELECT version();"
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
psql ${CONNSTR} -c "${QUERY}"
- name: Run TPC-H benchmark
uses: ./.github/actions/run-python-test-set
@@ -463,6 +504,7 @@ jobs:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
TEST_OLAP_SCALE: ${{ matrix.scale }}
- name: Create Allure report
if: ${{ !cancelled() }}
@@ -534,7 +576,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
psql ${CONNSTR} -c "SELECT version();"
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
psql ${CONNSTR} -c "${QUERY}"
- name: Run user examples
uses: ./.github/actions/run-python-test-set

View File

@@ -5,7 +5,6 @@ on:
branches:
- main
- release
- ci-run/pr-*
pull_request:
defaults:
@@ -24,7 +23,30 @@ env:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
jobs:
check-permissions:
runs-on: ubuntu-latest
steps:
- name: Disallow PRs from forks
if: |
github.event_name == 'pull_request' &&
github.event.pull_request.head.repo.full_name != github.repository
run: |
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
else
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
fi
echo >&2 "We don't run CI for PRs from forks"
echo >&2 "${MESSAGE}"
exit 1
tag:
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
outputs:
@@ -53,6 +75,7 @@ jobs:
id: build-tag
check-codestyle-python:
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -85,6 +108,7 @@ jobs:
run: poetry run mypy .
check-codestyle-rust:
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -151,6 +175,7 @@ jobs:
run: cargo deny check
build-neon:
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -187,7 +212,7 @@ jobs:
# Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
FAILED=false
for postgres in postgres-v14 postgres-v15; do
for postgres in postgres-v14 postgres-v15 postgres-v16; do
expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
actual=$(git rev-parse "HEAD:vendor/${postgres}")
if [ "${expected}" != "${actual}" ]; then
@@ -209,6 +234,10 @@ jobs:
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
- name: Set pg 16 revision for caching
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
# Set some environment variables used by all the steps.
#
# CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
@@ -229,10 +258,12 @@ jobs:
cov_prefix=""
CARGO_FLAGS="--locked --release"
fi
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
{
echo "cov_prefix=${cov_prefix}"
echo "CARGO_FEATURES=${CARGO_FEATURES}"
echo "CARGO_FLAGS=${CARGO_FLAGS}"
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
} >> $GITHUB_ENV
# Disabled for now
# Don't include the ~/.cargo/registry/src directory. It contains just
@@ -267,6 +298,13 @@ jobs:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: mold -run make postgres-v14 -j$(nproc)
@@ -275,6 +313,10 @@ jobs:
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: mold -run make postgres-v15 -j$(nproc)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: mold -run make postgres-v16 -j$(nproc)
- name: Build neon extensions
run: mold -run make neon-pg-ext -j$(nproc)
@@ -348,17 +390,17 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
needs: [ check-permissions, build-neon ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
# Default shared memory is 64mb
options: --init --shm-size=512mb
needs: [ build-neon ]
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
pg_version: [ v14, v15 ]
pg_version: [ v14, v15, v16 ]
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -386,12 +428,12 @@ jobs:
uses: ./.github/actions/save-coverage-data
benchmarks:
needs: [ check-permissions, build-neon ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
# Default shared memory is 64mb
options: --init --shm-size=512mb
needs: [ build-neon ]
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
strategy:
fail-fast: false
@@ -418,12 +460,13 @@ jobs:
# while coverage is currently collected for the debug ones
create-test-report:
needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests, benchmarks ]
if: ${{ !cancelled() }}
steps:
- uses: actions/checkout@v3
@@ -449,42 +492,40 @@ jobs:
reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
}
const coverage = {
coverageUrl: "${{ needs.coverage-report.outputs.coverage-html }}",
summaryJsonUrl: "${{ needs.coverage-report.outputs.coverage-json }}",
}
const script = require("./scripts/comment-test-report.js")
await script({
github,
context,
fetch,
report,
coverage,
})
coverage-report:
needs: [ check-permissions, regress-tests ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests ]
strategy:
fail-fast: false
matrix:
build_type: [ debug ]
outputs:
coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
# Disabled for now
# - name: Restore cargo deps cache
# id: cache_cargo
# uses: actions/cache@v3
# with:
# path: |
# ~/.cargo/registry/
# !~/.cargo/registry/src
# ~/.cargo/git/
# target/
# key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
fetch-depth: 0
- name: Get Neon artifact
uses: ./.github/actions/download
@@ -527,13 +568,45 @@ jobs:
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
- name: Build coverage report NEW
id: upload-coverage-report-new
env:
BUCKET: neon-github-public-dev
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
BASELINE="$(git merge-base HEAD origin/main)"
CURRENT="${COMMIT_SHA}"
cp /tmp/coverage/report/lcov.info ./${CURRENT}.info
GENHTML_ARGS="--ignore-errors path,unmapped,empty --synthesize-missing --demangle-cpp rustfilt --output-directory lcov-html ${CURRENT}.info"
# Use differential coverage if the baseline coverage exists.
# It can be missing if the coverage report wasn't uploaded yet or tests have failed on the BASELINE commit.
if aws s3 cp --only-show-errors s3://${BUCKET}/code-coverage/${BASELINE}/lcov.info ./${BASELINE}.info; then
git diff ${BASELINE} ${CURRENT} -- '*.rs' > baseline-current.diff
GENHTML_ARGS="--baseline-file ${BASELINE}.info --diff-file baseline-current.diff ${GENHTML_ARGS}"
fi
genhtml ${GENHTML_ARGS}
aws s3 cp --only-show-errors --recursive ./lcov-html/ s3://${BUCKET}/code-coverage/${COMMIT_SHA}/lcov
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/index.html
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
- uses: actions/github-script@v6
env:
REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
script: |
const { REPORT_URL, COMMIT_SHA } = process.env
const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
@@ -544,12 +617,21 @@ jobs:
context: 'Code coverage report',
})
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: `${COMMIT_SHA}`,
state: 'success',
target_url: `${REPORT_URL_NEW}`,
context: 'Code coverage report NEW',
})
trigger-e2e-tests:
needs: [ check-permissions, promote-images, tag ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
needs: [ promote-images, tag ]
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
@@ -590,8 +672,8 @@ jobs:
}"
neon-image:
needs: [ check-permissions, tag ]
runs-on: [ self-hosted, gen3, large ]
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
run:
@@ -638,7 +720,7 @@ jobs:
compute-tools-image:
runs-on: [ self-hosted, gen3, large ]
needs: [ tag ]
needs: [ check-permissions, tag ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
run:
@@ -683,17 +765,17 @@ jobs:
run: rm -rf ~/.ecr
compute-node-image:
needs: [ check-permissions, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: gcr.io/kaniko-project/executor:v1.9.2-debug
# Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
# Should be prevented by https://github.com/neondatabase/neon/issues/4281
options: --add-host=download.osgeo.org:140.211.15.30
needs: [ tag ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]
version: [ v14, v15, v16 ]
defaults:
run:
shell: sh -eu {0}
@@ -742,12 +824,12 @@ jobs:
run: rm -rf ~/.ecr
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
runs-on: [ self-hosted, gen3, large ]
needs: [ tag, compute-node-image ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]
version: [ v14, v15, v16 ]
defaults:
run:
shell: sh -eu {0}
@@ -784,7 +866,7 @@ jobs:
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
runs-on: [ self-hosted, gen3, small ]
steps:
@@ -827,8 +909,8 @@ jobs:
docker compose -f ./docker-compose/docker-compose.yml down
promote-images:
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
runs-on: [ self-hosted, gen3, small ]
needs: [ tag, test-images, vm-compute-node-image ]
container: golang:1.19-bullseye
# Don't add if-condition here.
# The job should always be run because we have dependent jobs that shouldn't be skipped
@@ -848,6 +930,7 @@ jobs:
run: |
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
- name: Add latest tag to images
if: |
@@ -860,6 +943,8 @@ jobs:
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
- name: Push images to production ECR
if: |
@@ -872,6 +957,8 @@ jobs:
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest
- name: Configure Docker Hub login
run: |
@@ -883,6 +970,7 @@ jobs:
run: |
crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
- name: Push latest tags to Docker Hub
if: |
@@ -895,21 +983,19 @@ jobs:
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
build-private-extensions:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
needs: [ tag ]
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]
runs-on: ubuntu-latest
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
COMMIT_SHA=${{ github.event.pull_request.head.sha || github.sha }}
REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"
curl -f -X POST \
@@ -939,11 +1025,50 @@ jobs:
}
}"
- name: Wait for extension build to finish
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
TIMEOUT=1800 # 30 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer
INTERVAL=15 # try each N seconds
last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context
for ((i=0; i <= TIMEOUT; i+=INTERVAL)); do
sleep $INTERVAL
# Get statuses for the latest commit in the PR / branch
gh api \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha || github.sha }}" > statuses.json
# Get the latest status for the "build-and-upload-extensions" context
last_status=$(jq --raw-output '[.[] | select(.context == "build-and-upload-extensions")] | sort_by(.created_at)[-1].state' statuses.json)
if [ "${last_status}" = "pending" ]; then
# Extension build is still in progress.
continue
elif [ "${last_status}" = "success" ]; then
# Extension build is successful.
exit 0
else
# Status is neither "pending" nor "success", exit the loop and fail the job.
break
fi
done
# Extension build failed, print `statuses.json` for debugging and fail the job.
jq '.' statuses.json
echo >&2 "Status of extension build is '${last_status}' != 'success'"
exit 1
deploy:
needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
needs: [ promote-images, tag, regress-tests ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
steps:
- name: Fix git ownership
run: |
@@ -981,20 +1106,35 @@ jobs:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
github.rest.git.createRef({
await github.rest.git.createRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
sha: context.sha,
})
- name: Create GitHub release
if: github.ref_name == 'release'
uses: actions/github-script@v6
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
await github.rest.repos.createRelease({
owner: context.repo.owner,
repo: context.repo.repo,
tag_name: "${{ needs.tag.outputs.build-tag }}",
generate_release_notes: true,
})
promote-compatibility-data:
needs: [ check-permissions, promote-images, tag, regress-tests ]
if: github.ref_name == 'release'
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
needs: [ promote-images, tag, regress-tests ]
if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
steps:
- name: Promote compatibility snapshot for the release
env:
@@ -1002,7 +1142,7 @@ jobs:
PREFIX: artifacts/latest
run: |
# Update compatibility snapshot for the release
for pg_version in v14 v15; do
for pg_version in v14 v15 v16; do
for build_type in debug release; do
OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst

View File

@@ -4,7 +4,6 @@ on:
push:
branches:
- main
- ci-run/pr-*
pull_request:
defaults:
@@ -39,7 +38,7 @@ jobs:
fetch-depth: 1
- name: Install macOS postgres dependencies
run: brew install flex bison openssl protobuf
run: brew install flex bison openssl protobuf icu4c pkg-config
- name: Set pg 14 revision for caching
id: pg_v14_rev
@@ -49,6 +48,10 @@ jobs:
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
- name: Set pg 16 revision for caching
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v3
@@ -63,6 +66,13 @@ jobs:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Set extra env for macOS
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
@@ -86,6 +96,10 @@ jobs:
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: make postgres-v15 -j$(nproc)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: make postgres-v16 -j$(nproc)
- name: Build neon extensions
run: make neon-pg-ext -j$(nproc)

29
.github/workflows/release-notify.yml vendored Normal file
View File

@@ -0,0 +1,29 @@
name: Notify Slack channel about upcoming release
concurrency:
group: ${{ github.workflow }}-${{ github.event.number }}
cancel-in-progress: true
on:
pull_request:
branches:
- release
types:
# Default types that triggers a workflow:
# - https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
- opened
- synchronize
- reopened
# Additional types that we want to handle:
- closed
jobs:
notify:
runs-on: [ ubuntu-latest ]
steps:
- uses: neondatabase/dev-actions/release-pr-notify@main
with:
slack-token: ${{ secrets.SLACK_BOT_TOKEN }}
slack-channel-id: ${{ vars.SLACK_UPCOMING_RELEASE_CHANNEL_ID || 'C05QQ9J1BRC' }} # if not set, then `#test-release-notifications`
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -2,16 +2,19 @@ name: Create Release Branch
on:
schedule:
- cron: '0 10 * * 2'
- cron: '0 7 * * 2'
workflow_dispatch:
jobs:
create_release_branch:
runs-on: [ubuntu-latest]
runs-on: [ ubuntu-latest ]
permissions:
contents: write # for `git push`
steps:
- name: Check out code
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
ref: main
@@ -26,9 +29,16 @@ jobs:
run: git push origin releases/${{ steps.date.outputs.date }}
- name: Create pull request into release
uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
head: releases/${{ steps.date.outputs.date }}
base: release
title: Release ${{ steps.date.outputs.date }}
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
## Release ${{ steps.date.outputs.date }}
**Please merge this PR using 'Create a merge commit'!**
EOF
gh pr create --title "Release ${{ steps.date.outputs.date }}" \
--body-file "body.md" \
--head "releases/${{ steps.date.outputs.date }}" \
--base "release"

4
.gitmodules vendored
View File

@@ -6,3 +6,7 @@
path = vendor/postgres-v15
url = https://github.com/neondatabase/postgres.git
branch = REL_15_STABLE_neon
[submodule "vendor/postgres-v16"]
path = vendor/postgres-v16
url = https://github.com/neondatabase/postgres.git
branch = REL_16_STABLE_neon

View File

@@ -27,3 +27,28 @@ your patch's fault. Help to fix the root cause if something else has
broken the CI, before pushing.
*Happy Hacking!*
# How to run a CI pipeline on Pull Requests from external contributors
_An instruction for maintainers_
## TL;DR:
- Review the PR
- If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then:
- Press the "Approve and run" button in GitHub UI
- Add the `approved-for-ci-run` label to the PR
Repeat all steps after any change to the PR.
- When the changes are ready to get merged — merge the original PR (not the internal one)
## Longer version:
GitHub Actions triggered by the `pull_request` event don't share repository secrets with the forks (for security reasons).
So, passing the CI pipeline on Pull Requests from external contributors is impossible.
We're using the following approach to make it work:
- After the review, assign the `approved-for-ci-run` label to the PR if changes look safe
- A GitHub Action will create an internal branch and a new PR with the same changes (for example, for a PR `#1234`, it'll be a branch `ci-run/pr-1234`)
- Because the PR is created from the internal branch, it is able to access repository secrets (that's why it's crucial to make sure that the PR doesn't contain any malicious code that could expose our secrets or intentionally harm the CI)
- The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)

503
Cargo.lock generated

File diff suppressed because it is too large

View File

@@ -1,4 +1,5 @@
[workspace]
resolver = "2"
members = [
"compute_tools",
"control_plane",
@@ -7,6 +8,7 @@ members = [
"proxy",
"safekeeper",
"storage_broker",
"s3_scrubber",
"workspace_hack",
"trace",
"libs/compute_api",
@@ -37,11 +39,11 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "0.55", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.27"
aws-smithy-http = "0.55"
aws-credential-types = "0.55"
aws-types = "0.55"
aws-config = { version = "0.56", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.29"
aws-smithy-http = "0.56"
aws-credential-types = "0.56"
aws-types = "0.56"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
@@ -105,12 +107,12 @@ reqwest-middleware = "0.2.0"
reqwest-retry = "0.2.2"
routerify = "3"
rpds = "0.13"
rustls = "0.20"
rustls = "0.21"
rustls-pemfile = "1"
rustls-split = "0.3"
scopeguard = "1.1"
sysinfo = "0.29.2"
sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "2.0"
@@ -125,11 +127,11 @@ sync_wrapper = "0.1.2"
tar = "0.4"
test-context = "0.1"
thiserror = "1.0"
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.9.0"
tokio-rustls = "0.23"
tokio-postgres-rustls = "0.10.0"
tokio-rustls = "0.24"
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7", features = ["io"] }
@@ -143,7 +145,7 @@ tracing-subscriber = { version = "0.3", default_features = false, features = ["s
url = "2.2"
uuid = { version = "1.2", features = ["v4", "serde"] }
walkdir = "2.3.2"
webpki-roots = "0.23"
webpki-roots = "0.25"
x509-parser = "0.15"
## TODO replace this with tracing
@@ -182,8 +184,8 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
## Build dependencies
criterion = "0.5.1"
rcgen = "0.10"
rstest = "0.17"
rcgen = "0.11"
rstest = "0.18"
tempfile = "3.4"
tonic-build = "0.9"

View File

@@ -12,6 +12,7 @@ WORKDIR /home/nonroot
COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
COPY --chown=nonroot pgxn pgxn
COPY --chown=nonroot Makefile Makefile
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
@@ -39,6 +40,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
COPY --chown=nonroot . .
# Show build caching stats to check if it was used in the end.
@@ -65,6 +67,7 @@ RUN set -e \
&& apt install -y \
libreadline-dev \
libseccomp-dev \
libicu67 \
openssl \
ca-certificates \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
@@ -81,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.

View File

@@ -74,8 +74,8 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar
ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
./autogen.sh && \
@@ -124,8 +124,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y ninja-build python3-dev libncurses5 binutils clang
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \
echo "1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 plv8.tar.gz" | sha256sum --check && \
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.8.tar.gz -O plv8.tar.gz && \
echo "92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 plv8.tar.gz" | sha256sum --check && \
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -172,8 +172,8 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz
cp -R /h3/usr / && \
rm -rf build
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \
echo "c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 h3-pg.tar.gz" | sha256sum --check && \
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -243,8 +243,8 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214
FROM build-deps AS hypopg-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \
echo "e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e hypopg.tar.gz" | sha256sum --check && \
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -307,8 +307,8 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta
FROM build-deps AS ip4r-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O ip4r.tar.gz && \
echo "78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 ip4r.tar.gz" | sha256sum --check && \
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -323,8 +323,8 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O i
FROM build-deps AS prefix-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \
echo "38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e prefix.tar.gz" | sha256sum --check && \
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -339,8 +339,8 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O pr
FROM build-deps AS hll-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \
echo "9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 hll.tar.gz" | sha256sum --check && \
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -355,8 +355,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar
FROM build-deps AS plpgsql-check-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \
echo "9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b plpgsql_check.tar.gz" | sha256sum --check && \
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \
echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -371,12 +371,21 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz
FROM build-deps AS timescaledb-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION
ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN apt-get update && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export TIMESCALEDB_VERSION=2.10.1 \
export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
;; \
*) \
echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
esac && \
apt-get update && \
apt-get install -y cmake && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
cd build && \
@@ -405,6 +414,10 @@ RUN case "${PG_VERSION}" in \
export PG_HINT_PLAN_VERSION=15_1_5_0 \
export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \
;; \
"v16") \
export PG_HINT_PLAN_VERSION=16_1_6_0 \
export PG_HINT_PLAN_CHECKSUM=ce6a8040c78012000f5da7240caf6a971401412f41d33f930f09291e6c304b99 \
;; \
*) \
echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
;; \
@@ -452,8 +465,8 @@ FROM build-deps AS pg-cron-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O pg_cron.tar.gz && \
echo "6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 pg_cron.tar.gz" | sha256sum --check && \
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -479,8 +492,8 @@ RUN apt-get update && \
libfreetype6-dev
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
cmake \
-D RDK_BUILD_CAIRO_SUPPORT=OFF \
@@ -551,8 +564,16 @@ FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PG_EMBEDDING_VERSION=0.3.5 \
export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
;; \
*) \
echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \
esac && \
wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -584,6 +605,10 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
# Layer "rust extensions"
# This layer is used to build `pgx` deps
#
# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
# dependency on all the rust extension that depend on it, too.
#
#########################################################################################
FROM build-deps AS rust-extensions-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -598,7 +623,17 @@ USER nonroot
WORKDIR /home/nonroot
ARG PG_VERSION
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \
;; \
esac && \
curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
rm rustup-init && \
@@ -615,10 +650,21 @@ USER root
#########################################################################################
FROM rust-extensions-build AS pg-jsonschema-pg-build
ARG PG_VERSION
# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \
;; \
esac && \
wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -633,12 +679,23 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421e
#########################################################################################
FROM rust-extensions-build AS pg-graphql-pg-build
ARG PG_VERSION
# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
# Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compliation errors in
# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
# same 1.1 version we've used before.
RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -656,9 +713,20 @@ RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367
#########################################################################################
FROM rust-extensions-build AS pg-tiktoken-pg-build
ARG PG_VERSION
# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
cargo pgx install --release && \
@@ -672,8 +740,19 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405
#########################################################################################
FROM rust-extensions-build AS pg-pgx-ulid-build
ARG PG_VERSION
RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -726,6 +805,20 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_rmgr \
-s install && \
case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16") \
echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/hnsw \

View File

@@ -29,6 +29,7 @@ else ifeq ($(UNAME_S),Darwin)
# It can be configured with OPENSSL_PREFIX variable
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
@@ -83,6 +84,8 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
# I'm not sure why it wouldn't work, but this is the only place (apart from
# the "build-all-versions" entry points) where direct mention of PostgreSQL
# versions is used.
.PHONY: postgres-configure-v16
postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
.PHONY: postgres-configure-v15
postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
.PHONY: postgres-configure-v14
@@ -118,6 +121,10 @@ postgres-clean-%:
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean
.PHONY: postgres-check-%
postgres-check-%: postgres-%
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check
.PHONY: neon-pg-ext-%
neon-pg-ext-%: postgres-%
+@echo "Compiling neon $*"
@@ -130,6 +137,11 @@ neon-pg-ext-%: postgres-%
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
+@echo "Compiling neon_rmgr $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install
+@echo "Compiling neon_test_utils $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
@@ -140,6 +152,13 @@ neon-pg-ext-%: postgres-%
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
# pg_embedding was temporarily released as hnsw from this repo, when we only
# supported PostgreSQL 14 and 15
neon-pg-ext-v14: neon-pg-ext-hnsw-v14
neon-pg-ext-v15: neon-pg-ext-hnsw-v15
neon-pg-ext-hnsw-%: postgres-headers-% postgres-%
+@echo "Compiling hnsw $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
@@ -167,28 +186,39 @@ neon-pg-ext-clean-%:
.PHONY: neon-pg-ext
neon-pg-ext: \
neon-pg-ext-v14 \
neon-pg-ext-v15
neon-pg-ext-v15 \
neon-pg-ext-v16
.PHONY: neon-pg-ext-clean
neon-pg-ext-clean: \
neon-pg-ext-clean-v14 \
neon-pg-ext-clean-v15
neon-pg-ext-clean-v15 \
neon-pg-ext-clean-v16
# shorthand to build all Postgres versions
.PHONY: postgres
postgres: \
postgres-v14 \
postgres-v15
postgres-v15 \
postgres-v16
.PHONY: postgres-headers
postgres-headers: \
postgres-headers-v14 \
postgres-headers-v15
postgres-headers-v15 \
postgres-headers-v16
.PHONY: postgres-clean
postgres-clean: \
postgres-clean-v14 \
postgres-clean-v15
postgres-clean-v15 \
postgres-clean-v16
.PHONY: postgres-check
postgres-check: \
postgres-check-v14 \
postgres-check-v15 \
postgres-check-v16
# This doesn't remove the effects of 'configure'.
.PHONY: clean

View File

@@ -29,18 +29,18 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
libcurl4-openssl-dev openssl python-poetry
libcurl4-openssl-dev openssl python-poetry lsof
```
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel libcurl-devel openssl poetry
protobuf-devel libcurl-devel openssl poetry lsof
```
* On Arch based systems, these packages are needed:
```bash
pacman -S base-devel readline zlib libseccomp openssl clang \
postgresql-libs cmake postgresql protobuf curl
postgresql-libs cmake postgresql protobuf curl lsof
```
Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
@@ -55,7 +55,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
1. Install XCode and dependencies
```
xcode-select --install
brew install protobuf openssl flex bison
brew install protobuf openssl flex bison icu4c pkg-config
# add openssl to PATH, required for ed25519 keys generation in neon_local
echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc

clippy.toml Normal file
View File

@@ -0,0 +1,5 @@
disallowed-methods = [
"tokio::task::block_in_place",
# Allow this for now, to deny it later once we stop using Handle::block_on completely
# "tokio::runtime::Handle::block_on",
]
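For context, a hypothetical snippet of the kind of call this configuration makes `cargo clippy` reject via the `clippy::disallowed_methods` lint (illustrative only, not taken from the repository):

```rust
// Hypothetical example: with the clippy.toml above, clippy's
// `disallowed_methods` lint flags this call site.
fn run_blocking_inside_runtime() {
    tokio::task::block_in_place(|| {
        // lint: use of a disallowed method `tokio::task::block_in_place`
        std::thread::sleep(std::time::Duration::from_millis(10));
    });
}
```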

View File

@@ -1,12 +1,39 @@
use anyhow::{anyhow, Result};
use anyhow::{anyhow, Ok, Result};
use postgres::Client;
use tokio_postgres::NoTls;
use tracing::{error, instrument};
use tracing::{error, instrument, warn};
use crate::compute::ComputeNode;
/// Create a special service table for availability checks
/// only if it does not exist already.
pub fn create_availability_check_data(client: &mut Client) -> Result<()> {
let query = "
DO $$
BEGIN
IF NOT EXISTS(
SELECT 1
FROM pg_catalog.pg_tables
WHERE tablename = 'health_check'
)
THEN
CREATE TABLE health_check (
id serial primary key,
updated_at timestamptz default now()
);
INSERT INTO health_check VALUES (1, now())
ON CONFLICT (id) DO UPDATE
SET updated_at = now();
END IF;
END
$$;";
client.execute(query, &[])?;
Ok(())
}
/// Update timestamp in a row in a special service table to check
/// that we can actually write some data in this particular timeline.
/// Create table if it's missing.
#[instrument(skip_all)]
pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
// Connect to the database.
@@ -24,21 +51,28 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
});
let query = "
CREATE TABLE IF NOT EXISTS health_check (
id serial primary key,
updated_at timestamptz default now()
);
INSERT INTO health_check VALUES (1, now())
ON CONFLICT (id) DO UPDATE
SET updated_at = now();";
let result = client.simple_query(query).await?;
if result.len() != 2 {
return Err(anyhow::format_err!(
"expected 2 query results, but got {}",
result.len()
));
match client.simple_query(query).await {
Result::Ok(result) => {
if result.len() != 1 {
return Err(anyhow::anyhow!(
"expected 1 query results, but got {}",
result.len()
));
}
}
Err(err) => {
if let Some(state) = err.code() {
if state == &tokio_postgres::error::SqlState::DISK_FULL {
warn!("Tenant disk is full");
return Ok(());
}
}
return Err(err.into());
}
}
Ok(())

View File

@@ -27,6 +27,7 @@ use utils::measured_stream::MeasuredReader;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use crate::checker::create_availability_check_data;
use crate::pg_helpers::*;
use crate::spec::*;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -696,6 +697,7 @@ impl ComputeNode {
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
handle_grants(spec, self.connstr.as_str())?;
handle_extensions(spec, &mut client)?;
create_availability_check_data(&mut client)?;
// 'Close' connection
drop(client);
@@ -1078,7 +1080,8 @@ LIMIT 100",
let mut download_tasks = Vec::new();
for library in &libs_vec {
let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
let (ext_name, ext_path) =
remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?;
download_tasks.push(self.download_extension(ext_name, ext_path));
}
let results = join_all(download_tasks).await;

View File

@@ -46,8 +46,6 @@ pub fn write_postgres_conf(
writeln!(file, "{}", conf)?;
}
write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;
// Add options for connecting to storage
writeln!(file, "# Neon storage settings")?;
if let Some(s) = &spec.pageserver_connstring {

View File

@@ -74,6 +74,7 @@ More specifically, here is an example ext_index.json
use anyhow::Context;
use anyhow::{self, Result};
use compute_api::spec::RemoteExtSpec;
use regex::Regex;
use remote_storage::*;
use serde_json;
use std::io::Read;
@@ -106,16 +107,71 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String {
pub fn get_pg_version(pgbin: &str) -> String {
// pg_config --version returns a (platform specific) human readable string
// such as "PostgreSQL 15.4". We parse this to v14/v15
// such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
let human_version = get_pg_config("--version", pgbin);
if human_version.contains("15") {
return "v15".to_string();
} else if human_version.contains("14") {
return "v14".to_string();
return parse_pg_version(&human_version).to_string();
}
fn parse_pg_version(human_version: &str) -> &str {
// Normal releases have version strings like "PostgreSQL 15.4". But there
// are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL
// 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version
// configure option, you can tack any string to the version number,
// e.g. "PostgreSQL 15.4foobar".
match Regex::new(r"^PostgreSQL (?<major>\d+).+")
.unwrap()
.captures(human_version)
{
Some(captures) if captures.len() == 2 => match &captures["major"] {
"14" => return "v14",
"15" => return "v15",
"16" => return "v16",
_ => {}
},
_ => {}
}
panic!("Unsuported postgres version {human_version}");
}
#[cfg(test)]
mod tests {
use super::parse_pg_version;
#[test]
fn test_parse_pg_version() {
assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
assert_eq!(
parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
"v15"
);
assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
assert_eq!(
parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
"v14"
);
assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
}
#[test]
#[should_panic]
fn test_parse_pg_unsupported_version() {
parse_pg_version("PostgreSQL 13.14");
}
#[test]
#[should_panic]
fn test_parse_pg_incorrect_version_format() {
parse_pg_version("PostgreSQL 14");
}
}
// download the archive for a given extension,
// unzip it, and place files in the appropriate locations (share/lib)
pub async fn download_extension(
@@ -180,7 +236,19 @@ pub async fn download_extension(
// Create extension control files from spec
pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
for ext_data in remote_extensions.extension_data.values() {
for (ext_name, ext_data) in remote_extensions.extension_data.iter() {
// Check if extension is present in public or custom.
// If not, then it is not allowed to be used by this compute.
if let Some(public_extensions) = &remote_extensions.public_extensions {
if !public_extensions.contains(ext_name) {
if let Some(custom_extensions) = &remote_extensions.custom_extensions {
if !custom_extensions.contains(ext_name) {
continue; // skip this extension, it is not allowed
}
}
}
}
for (control_name, control_content) in &ext_data.control_data {
let control_path = local_sharedir.join(control_name);
if !control_path.exists() {

View File

@@ -1,4 +1,6 @@
use std::convert::Infallible;
use std::net::IpAddr;
use std::net::Ipv6Addr;
use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
@@ -169,7 +171,12 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
};
remote_extensions.get_ext(&filename, is_library)
remote_extensions.get_ext(
&filename,
is_library,
&compute.build_tag,
&compute.pgversion,
)
};
match ext {
@@ -293,7 +300,9 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main]
async fn serve(port: u16, state: Arc<ComputeNode>) {
let addr = SocketAddr::from(([0, 0, 0, 0], port));
// this usually binds to both IPv4 and IPv6 on linux
// see e.g. https://github.com/rust-lang/rust/pull/34440
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
let make_service = make_service_fn(move |_conn| {
let state = state.clone();

View File

@@ -6,4 +6,4 @@ pub const DEFAULT_LOG_LEVEL: &str = "info";
// https://www.postgresql.org/docs/15/auth-password.html
//
// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\tall\t\tmd5";

View File

@@ -12,6 +12,7 @@ git-version.workspace = true
nix.workspace = true
once_cell.workspace = true
postgres.workspace = true
hex.workspace = true
hyper.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] }

View File

@@ -1,6 +1,7 @@
# Minimal neon environment with one safekeeper. This is equivalent to the built-in
# defaults that you get with no --config
[pageserver]
[[pageservers]]
id=1
listen_pg_addr = '127.0.0.1:64000'
listen_http_addr = '127.0.0.1:9898'
pg_auth_type = 'Trust'

View File

@@ -1,7 +1,7 @@
use crate::{background_process, local_env::LocalEnv};
use anyhow::anyhow;
use pageserver_api::control_api::HexTenantId;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::{path::PathBuf, process::Child};
use utils::id::{NodeId, TenantId};
@@ -13,9 +13,11 @@ pub struct AttachmentService {
const COMMAND: &str = "attachment_service";
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_id: HexTenantId,
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
pub pageserver_id: Option<NodeId>,
}
@@ -30,7 +32,7 @@ impl AttachmentService {
// Makes no sense to construct this if pageservers aren't going to use it: assume
// pageservers have control plane API set
let listen_url = env.pageserver.control_plane_api.clone().unwrap();
let listen_url = env.control_plane_api.clone().unwrap();
let listen = format!(
"{}:{}",
@@ -78,7 +80,6 @@ impl AttachmentService {
let url = self
.env
.pageserver
.control_plane_api
.clone()
.unwrap()
@@ -89,13 +90,13 @@ impl AttachmentService {
.expect("Failed to construct http client");
let request = AttachHookRequest {
tenant_id: HexTenantId::new(tenant_id),
tenant_id,
pageserver_id: Some(pageserver_id),
};
let response = client.post(url).json(&request).send()?;
if response.status() != StatusCode::OK {
return Err(anyhow!("Unexpected status {0}", response.status()));
return Err(anyhow!("Unexpected status {}", response.status()));
}
let response = response.json::<AttachHookResponse>()?;

View File

@@ -6,9 +6,9 @@
///
use anyhow::anyhow;
use clap::Parser;
use hex::FromHex;
use hyper::StatusCode;
use hyper::{Body, Request, Response};
use pageserver_api::control_api::*;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc};
@@ -25,15 +25,22 @@ use utils::{
tcp_listener,
};
use pageserver_api::control_api::{
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
ValidateResponseTenant,
};
use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse};
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(arg_required_else_help(true))]
struct Cli {
/// Host and port to listen on, like `127.0.0.1:1234`
#[arg(short, long)]
listen: String,
listen: std::net::SocketAddr,
/// Path to the .json file to store state (will be created if it doesn't exist)
#[arg(short, long)]
path: PathBuf,
}
@@ -54,13 +61,10 @@ where
S: serde::Serializer,
V: Clone + Serialize,
{
eprintln!("to_hex_map");
let transformed = input
.iter()
.map(|(k, v)| (HexTenantId::new(k.clone()), v.clone()));
let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
transformed
.collect::<HashMap<HexTenantId, V>>()
.collect::<HashMap<String, V>>()
.serialize(serializer)
}
@@ -69,10 +73,15 @@ where
D: serde::de::Deserializer<'de>,
V: Deserialize<'de>,
{
eprintln!("from_hex_map");
let hex_map = HashMap::<HexTenantId, V>::deserialize(deserializer)?;
Ok(hex_map.into_iter().map(|(k, v)| (k.take(), v)).collect())
let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
hex_map
.into_iter()
.map(|(k, v)| {
TenantId::from_hex(k)
.map(|k| (k, v))
.map_err(serde::de::Error::custom)
})
.collect()
}
// Top level state available to all HTTP handlers
@@ -102,17 +111,24 @@ impl PersistentState {
async fn load_or_new(path: &Path) -> Self {
match Self::load(path).await {
Ok(s) => s,
Err(e) => {
tracing::info!(
"Creating new state file at {0} (load returned {e})",
path.to_string_lossy()
);
Ok(s) => {
tracing::info!("Loaded state file at {}", path.display());
s
}
Err(e)
if e.downcast_ref::<std::io::Error>()
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
.unwrap_or(false) =>
{
tracing::info!("Will create state file at {}", path.display());
Self {
tenants: HashMap::new(),
path: path.to_owned(),
}
}
Err(e) => {
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display())
}
}
}
}
@@ -153,16 +169,13 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
if state.pageserver == Some(reattach_req.node_id) {
state.generation += 1;
response.tenants.push(ReAttachResponseTenant {
id: HexTenantId::new(t.clone()),
id: *t,
generation: state.generation,
});
}
}
locked
.save()
.await
.map_err(|e| ApiError::InternalServerError(e))?;
locked.save().await.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, response)
}
@@ -172,15 +185,14 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
let state = get_state(&req).inner.clone();
let locked = state.read().await;
let locked = get_state(&req).inner.read().await;
let mut response = ValidateResponse {
tenants: Vec::new(),
};
for req_tenant in validate_req.tenants {
if let Some(tenant_state) = locked.tenants.get(req_tenant.id.as_ref()) {
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_state.generation == req_tenant.gen;
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,
@@ -202,7 +214,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
let tenant_state = locked
.tenants
.entry(attach_req.tenant_id.take())
.entry(attach_req.tenant_id)
.or_insert_with(|| TenantState {
pageserver: attach_req.pageserver_id,
generation: 0,
@@ -213,10 +225,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
}
let generation = tenant_state.generation;
locked
.save()
.await
.map_err(|e| ApiError::InternalServerError(e))?;
locked.save().await.map_err(ApiError::InternalServerError)?;
json_response(
StatusCode::OK,
@@ -229,9 +238,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
endpoint::make_router()
.data(Arc::new(State::new(persistent_state)))
.post("/re-attach", |r| handle_re_attach(r))
.post("/validate", |r| handle_validate(r))
.post("/attach_hook", |r| handle_attach_hook(r))
.post("/re-attach", handle_re_attach)
.post("/validate", handle_validate)
.post("/attach_hook", handle_attach_hook)
}
#[tokio::main]
@@ -250,14 +259,14 @@ async fn main() -> anyhow::Result<()> {
let persistent_state = PersistentState::load_or_new(&args.path).await;
let http_listener = tcp_listener::bind(&args.listen)?;
let http_listener = tcp_listener::bind(args.listen)?;
let router = make_router(persistent_state)
.build()
.map_err(|err| anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
tracing::info!("Serving on {0}", args.listen.as_str());
tracing::info!("Serving on {0}", args.listen);
server.await?;
Ok(())

View File

@@ -50,16 +50,17 @@ fn default_conf() -> String {
format!(
r#"
# Default built-in configuration, defined in main.rs
control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
[broker]
listen_addr = '{DEFAULT_BROKER_ADDR}'
[pageserver]
[[pageservers]]
id = {DEFAULT_PAGESERVER_ID}
listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
pg_auth_type = '{trust_auth}'
http_auth_type = '{trust_auth}'
control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
[[safekeepers]]
id = {DEFAULT_SAFEKEEPER_ID}
@@ -258,7 +259,7 @@ fn get_timeline_infos(
env: &local_env::LocalEnv,
tenant_id: &TenantId,
) -> Result<HashMap<TimelineId, TimelineInfo>> {
Ok(PageServerNode::from_env(env)
Ok(get_default_pageserver(env)
.timeline_list(tenant_id)?
.into_iter()
.map(|timeline_info| (timeline_info.timeline_id, timeline_info))
@@ -319,17 +320,30 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
.context("Failed to initialize neon repository")?;
// Initialize pageserver, create initial tenant and timeline.
let pageserver = PageServerNode::from_env(&env);
pageserver
.initialize(&pageserver_config_overrides(init_match))
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {e:?}");
exit(1);
});
for ps_conf in &env.pageservers {
PageServerNode::from_env(&env, ps_conf)
.initialize(&pageserver_config_overrides(init_match))
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {e:?}");
exit(1);
});
}
Ok(env)
}
/// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
/// For typical interactive use, one would just run with a single pageserver. Scenarios with
/// tenant/timeline placement across multiple pageservers are managed by python test code rather
/// than this CLI.
fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
let ps_conf = env
.pageservers
.first()
.expect("Config is validated to contain at least one pageserver");
PageServerNode::from_env(env, ps_conf)
}
fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
init_match
.get_many::<String>("pageserver-config-override")
@@ -340,7 +354,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
}
fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
let pageserver = PageServerNode::from_env(env);
let pageserver = get_default_pageserver(env);
match tenant_match.subcommand() {
Some(("list", _)) => {
for t in pageserver.tenant_list()? {
@@ -354,13 +368,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
.unwrap_or_default();
// If tenant ID was not specified, generate one
let tenant_id = parse_tenant_id(create_match)?.unwrap_or(TenantId::generate());
let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
let generation = if env.pageserver.control_plane_api.is_some() {
let generation = if env.control_plane_api.is_some() {
// We must register the tenant with the attachment service, so
// that when the pageserver restarts, it will be re-attached.
let attachment_service = AttachmentService::from_env(env);
attachment_service.attach_hook(tenant_id, env.pageserver.id)?
attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
} else {
None
};
@@ -425,7 +439,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
}
fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
let pageserver = get_default_pageserver(env);
match timeline_match.subcommand() {
Some(("list", list_match)) => {
@@ -502,6 +516,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
None,
pg_version,
ComputeMode::Primary,
DEFAULT_PAGESERVER_ID,
)?;
println!("Done");
}
@@ -555,7 +570,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
Some(ep_subcommand_data) => ep_subcommand_data,
None => bail!("no endpoint subcommand provided"),
};
let mut cplane = ComputeControlPlane::load(env.clone())?;
// All subcommands take an optional --tenant-id option
@@ -652,6 +666,13 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
.copied()
.unwrap_or(false);
let pageserver_id =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
NodeId(id_str.parse().context("while parsing pageserver id")?)
} else {
DEFAULT_PAGESERVER_ID
};
let mode = match (lsn, hot_standby) {
(Some(lsn), false) => ComputeMode::Static(lsn),
(None, true) => ComputeMode::Replica,
@@ -667,6 +688,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
http_port,
pg_version,
mode,
pageserver_id,
)?;
}
"start" => {
@@ -676,6 +698,13 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
.get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
let pageserver_id =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
NodeId(id_str.parse().context("while parsing pageserver id")?)
} else {
DEFAULT_PAGESERVER_ID
};
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
// If --safekeepers argument is given, use only the listed safekeeper nodes.
@@ -695,7 +724,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
let endpoint = cplane.endpoints.get(endpoint_id.as_str());
let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
let ps_conf = env.get_pageserver_conf(pageserver_id)?;
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
Some(env.generate_auth_token(&claims)?)
@@ -762,6 +792,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
http_port,
pg_version,
mode,
pageserver_id,
)?;
ep.start(&auth_token, safekeepers, remote_ext_config)?;
}
@@ -786,48 +817,64 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
}
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
NodeId(id_str.parse().context("while parsing pageserver id")?)
} else {
DEFAULT_PAGESERVER_ID
};
Ok(PageServerNode::from_env(
env,
env.get_pageserver_conf(node_id)?,
))
}
match sub_match.subcommand() {
Some(("start", start_match)) => {
if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) {
Some(("start", subcommand_args)) => {
if let Err(e) = get_pageserver(env, subcommand_args)?
.start(&pageserver_config_overrides(subcommand_args))
{
eprintln!("pageserver start failed: {e}");
exit(1);
}
}
Some(("stop", stop_match)) => {
let immediate = stop_match
Some(("stop", subcommand_args)) => {
let immediate = subcommand_args
.get_one::<String>("stop-mode")
.map(|s| s.as_str())
== Some("immediate");
if let Err(e) = pageserver.stop(immediate) {
if let Err(e) = get_pageserver(env, subcommand_args)?.stop(immediate) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
}
Some(("restart", restart_match)) => {
Some(("restart", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
//TODO what shutdown strategy should we use here?
if let Err(e) = pageserver.stop(false) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) {
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
eprintln!("pageserver start failed: {e}");
exit(1);
}
}
Some(("status", _)) => match PageServerNode::from_env(env).check_status() {
Ok(_) => println!("Page server is up and running"),
Err(err) => {
eprintln!("Page server is not available: {}", err);
exit(1);
Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status() {
Ok(_) => println!("Page server is up and running"),
Err(err) => {
eprintln!("Page server is not available: {}", err);
exit(1);
}
}
},
}
Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
None => bail!("no pageserver subcommand provided"),
@@ -943,7 +990,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
broker::start_broker_process(env)?;
// Only start the attachment service if the pageserver is configured to need it
if env.pageserver.control_plane_api.is_some() {
if env.control_plane_api.is_some() {
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.start() {
eprintln!("attachment_service start failed: {:#}", e);
@@ -952,11 +999,13 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
}
}
let pageserver = PageServerNode::from_env(env);
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e);
try_stop_all(env, true);
exit(1);
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true);
exit(1);
}
}
for node in env.safekeepers.iter() {
@@ -980,8 +1029,6 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
}
fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
let pageserver = PageServerNode::from_env(env);
// Stop all endpoints
match ComputeControlPlane::load(env.clone()) {
Ok(cplane) => {
@@ -996,8 +1043,11 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
}
}
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e);
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver {} stop failed: {:#}", ps_conf.id, e);
}
}
for node in env.safekeepers.iter() {
@@ -1011,7 +1061,7 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
eprintln!("neon broker stop failed: {e:#}");
}
if env.pageserver.control_plane_api.is_some() {
if env.control_plane_api.is_some() {
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.stop(immediate) {
eprintln!("attachment service stop failed: {e:#}");
@@ -1031,6 +1081,16 @@ fn cli() -> Command {
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
// --id, when using a pageserver command
let pageserver_id_arg = Arg::new("pageserver-id")
.long("id")
.help("pageserver id")
.required(false);
// --pageserver-id when using a non-pageserver command
let endpoint_pageserver_id_arg = Arg::new("endpoint-pageserver-id")
.long("pageserver-id")
.required(false);
let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
.short('e')
.long("safekeeper-extra-opt")
@@ -1195,10 +1255,16 @@ fn cli() -> Command {
.arg_required_else_help(true)
.about("Manage pageserver")
.subcommand(Command::new("status"))
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
.arg(pageserver_id_arg.clone())
.subcommand(Command::new("start").about("Start local pageserver")
.arg(pageserver_id_arg.clone())
.arg(pageserver_config_args.clone()))
.subcommand(Command::new("stop").about("Stop local pageserver")
.arg(pageserver_id_arg.clone())
.arg(stop_mode_arg.clone()))
.subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
.subcommand(Command::new("restart").about("Restart local pageserver")
.arg(pageserver_id_arg.clone())
.arg(pageserver_config_args.clone()))
)
.subcommand(
Command::new("attachment_service")
@@ -1242,6 +1308,7 @@ fn cli() -> Command {
.arg(lsn_arg.clone())
.arg(pg_port_arg.clone())
.arg(http_port_arg.clone())
.arg(endpoint_pageserver_id_arg.clone())
.arg(
Arg::new("config-only")
.help("Don't do basebackup, create endpoint directory with only config files")
@@ -1259,6 +1326,7 @@ fn cli() -> Command {
.arg(lsn_arg)
.arg(pg_port_arg)
.arg(http_port_arg)
.arg(endpoint_pageserver_id_arg.clone())
.arg(pg_version_arg)
.arg(hot_standby_arg)
.arg(safekeepers_arg)

View File

@@ -70,6 +70,7 @@ pub struct EndpointConf {
http_port: u16,
pg_version: u32,
skip_pg_catalog_updates: bool,
pageserver_id: NodeId,
}
//
@@ -82,19 +83,16 @@ pub struct ComputeControlPlane {
pub endpoints: BTreeMap<String, Arc<Endpoint>>,
env: LocalEnv,
pageserver: Arc<PageServerNode>,
}
impl ComputeControlPlane {
// Load current endpoints from the endpoints/ subdirectories
pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
let pageserver = Arc::new(PageServerNode::from_env(&env));
let mut endpoints = BTreeMap::default();
for endpoint_dir in std::fs::read_dir(env.endpoints_path())
.with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
{
let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?;
endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
}
@@ -102,7 +100,6 @@ impl ComputeControlPlane {
base_port: 55431,
endpoints,
env,
pageserver,
})
}
@@ -125,20 +122,29 @@ impl ComputeControlPlane {
http_port: Option<u16>,
pg_version: u32,
mode: ComputeMode,
pageserver_id: NodeId,
) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver),
pageserver,
timeline_id,
mode,
tenant_id,
pg_version,
skip_pg_catalog_updates: false,
// We don't setup roles and databases in the spec locally, so we don't need to
// do catalog updates. Catalog updates also include check availability
// data creation. Yet, we have tests that check that size and db dump
// before and after start are the same. So, skip catalog updates,
// with this we basically test a case of waking up an idle compute, where
// we also skip catalog updates in the cloud.
skip_pg_catalog_updates: true,
});
ep.create_endpoint_dir()?;
@@ -152,7 +158,8 @@ impl ComputeControlPlane {
http_port,
pg_port,
pg_version,
skip_pg_catalog_updates: false,
skip_pg_catalog_updates: true,
pageserver_id,
})?,
)?;
std::fs::write(
@@ -187,18 +194,14 @@ pub struct Endpoint {
// These are not part of the endpoint as such, but the environment
// the endpoint runs in.
pub env: LocalEnv,
pageserver: Arc<PageServerNode>,
pageserver: PageServerNode,
// Optimizations
skip_pg_catalog_updates: bool,
}
impl Endpoint {
fn from_dir_entry(
entry: std::fs::DirEntry,
env: &LocalEnv,
pageserver: &Arc<PageServerNode>,
) -> Result<Endpoint> {
fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
if !entry.file_type()?.is_dir() {
anyhow::bail!(
"Endpoint::from_dir_entry failed: '{}' is not a directory",
@@ -214,12 +217,15 @@ impl Endpoint {
let conf: EndpointConf =
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);
Ok(Endpoint {
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
endpoint_id,
env: env.clone(),
pageserver: Arc::clone(pageserver),
pageserver,
timeline_id: conf.timeline_id,
mode: conf.mode,
tenant_id: conf.tenant_id,

View File

@@ -68,11 +68,17 @@ pub struct LocalEnv {
pub broker: NeonBroker,
pub pageserver: PageServerConf,
/// This Vec must always contain at least one pageserver
pub pageservers: Vec<PageServerConf>,
#[serde(default)]
pub safekeepers: Vec<SafekeeperConf>,
// Control plane location: if None, we will not run attachment_service. If set, this will
// be propagated into each pageserver's configuration.
#[serde(default)]
pub control_plane_api: Option<Url>,
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
#[serde(default)]
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
@@ -118,9 +124,6 @@ pub struct PageServerConf {
// auth type used for the PG and HTTP ports
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
// Control plane location
pub control_plane_api: Option<Url>,
}
impl Default for PageServerConf {
@@ -131,7 +134,6 @@ impl Default for PageServerConf {
listen_http_addr: String::new(),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
control_plane_api: None,
}
}
}
@@ -180,26 +182,18 @@ impl LocalEnv {
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
let path = self.pg_distrib_dir.clone();
#[allow(clippy::manual_range_patterns)]
match pg_version {
14 => Ok(path.join(format!("v{pg_version}"))),
15 => Ok(path.join(format!("v{pg_version}"))),
14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
}
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
}
pub fn pageserver_bin(&self) -> PathBuf {
@@ -222,15 +216,23 @@ impl LocalEnv {
self.base_data_dir.join("endpoints")
}
// TODO: move pageserver files into ./pageserver
pub fn pageserver_data_dir(&self) -> PathBuf {
self.base_data_dir.clone()
pub fn pageserver_data_dir(&self, pageserver_id: NodeId) -> PathBuf {
self.base_data_dir
.join(format!("pageserver_{pageserver_id}"))
}
pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf {
self.base_data_dir.join("safekeepers").join(data_dir_name)
}
pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
Ok(conf)
} else {
bail!("could not find pageserver {id}")
}
}
pub fn register_branch_mapping(
&mut self,
branch_name: String,
@@ -307,6 +309,10 @@ impl LocalEnv {
env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
}
if env.pageservers.is_empty() {
anyhow::bail!("Configuration must contain at least one pageserver");
}
env.base_data_dir = base_path();
Ok(env)
@@ -339,7 +345,7 @@ impl LocalEnv {
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
// to .neon/config. TODO: We lose any formatting and comments along the way, which is
// a bit sad.
let mut conf_content = r#"# This file describes a locale deployment of the page server
let mut conf_content = r#"# This file describes a local deployment of the page server
# and safekeeeper node. It is read by the 'neon_local' command-line
# utility.
"#
@@ -469,9 +475,9 @@ impl LocalEnv {
}
fn auth_keys_needed(&self) -> bool {
self.pageserver.pg_auth_type == AuthType::NeonJWT
|| self.pageserver.http_auth_type == AuthType::NeonJWT
|| self.safekeepers.iter().any(|sk| sk.auth_enabled)
self.pageservers.iter().any(|ps| {
ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
}) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
}
}

View File

@@ -27,6 +27,7 @@ use utils::{
lsn::Lsn,
};
use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv};
#[derive(Error, Debug)]
@@ -76,43 +77,40 @@ impl ResponseErrorMessageExt for Response {
#[derive(Debug)]
pub struct PageServerNode {
pub pg_connection_config: PgConnectionConfig,
pub conf: PageServerConf,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
}
impl PageServerNode {
pub fn from_env(env: &LocalEnv) -> PageServerNode {
let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr)
.expect("Unable to parse listen_pg_addr");
pub fn from_env(env: &LocalEnv, conf: &PageServerConf) -> PageServerNode {
let (host, port) =
parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let port = port.unwrap_or(5432);
Self {
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
conf: conf.clone(),
env: env.clone(),
http_client: Client::new(),
http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr),
http_base_url: format!("http://{}/v1", conf.listen_http_addr),
}
}
// pageserver conf overrides defined by neon_local configuration.
fn neon_local_overrides(&self) -> Vec<String> {
let id = format!("id={}", self.env.pageserver.id);
let id = format!("id={}", self.conf.id);
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
let pg_distrib_dir_param = format!(
"pg_distrib_dir='{}'",
self.env.pg_distrib_dir_raw().display()
);
let http_auth_type_param =
format!("http_auth_type='{}'", self.env.pageserver.http_auth_type);
let listen_http_addr_param = format!(
"listen_http_addr='{}'",
self.env.pageserver.listen_http_addr
);
let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
let pg_auth_type_param = format!("pg_auth_type='{}'", self.env.pageserver.pg_auth_type);
let listen_pg_addr_param =
format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
@@ -126,17 +124,18 @@ impl PageServerNode {
broker_endpoint_param,
];
if let Some(control_plane_api) = &self.env.pageserver.control_plane_api {
if let Some(control_plane_api) = &self.env.control_plane_api {
overrides.push(format!(
"control_plane_api='{}'",
control_plane_api.as_str()
));
}
if self.env.pageserver.http_auth_type != AuthType::Trust
|| self.env.pageserver.pg_auth_type != AuthType::Trust
if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
{
overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned());
// Keys are generated in the toplevel repo dir, pageservers' workdirs
// are one level below that, so refer to keys with ../
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
}
overrides
}
@@ -144,16 +143,12 @@ impl PageServerNode {
/// Initializes a pageserver node by creating its config with the overrides provided.
pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
self.pageserver_init(config_overrides).with_context(|| {
format!(
"Failed to run init for pageserver node {}",
self.env.pageserver.id,
)
})
self.pageserver_init(config_overrides)
.with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id,))
}
pub fn repo_path(&self) -> PathBuf {
self.env.pageserver_data_dir()
self.env.pageserver_data_dir(self.conf.id)
}
/// The pid file is created by the pageserver process, with its pid stored inside.
@@ -169,7 +164,7 @@ impl PageServerNode {
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
let datadir = self.repo_path();
let node_id = self.env.pageserver.id;
let node_id = self.conf.id;
println!(
"Initializing pageserver node {} at '{}' in {:?}",
node_id,
@@ -178,6 +173,10 @@ impl PageServerNode {
);
io::stdout().flush()?;
if !datadir.exists() {
std::fs::create_dir(&datadir)?;
}
let datadir_path_str = datadir.to_str().with_context(|| {
format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
})?;
@@ -208,7 +207,7 @@ impl PageServerNode {
let datadir = self.repo_path();
print!(
"Starting pageserver node {} at '{}' in {:?}",
self.env.pageserver.id,
self.conf.id,
self.pg_connection_config.raw_address(),
datadir
);
@@ -217,7 +216,7 @@ impl PageServerNode {
let datadir_path_str = datadir.to_str().with_context(|| {
format!(
"Cannot start pageserver node {} in path that has no string representation: {:?}",
self.env.pageserver.id, datadir,
self.conf.id, datadir,
)
})?;
let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
@@ -261,7 +260,7 @@ impl PageServerNode {
// FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
// needs a token, and how to generate that token, seems independent to whether
// the pageserver requires a token in incoming requests.
Ok(if self.env.pageserver.http_auth_type != AuthType::Trust {
Ok(if self.conf.http_auth_type != AuthType::Trust {
// Generate a token to connect from the pageserver to a safekeeper
let token = self
.env
@@ -286,7 +285,7 @@ impl PageServerNode {
pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
let mut config = self.pg_connection_config.clone();
if self.env.pageserver.pg_auth_type == AuthType::NeonJWT {
if self.conf.pg_auth_type == AuthType::NeonJWT {
let token = self
.env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
@@ -297,7 +296,7 @@ impl PageServerNode {
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
let mut builder = self.http_client.request(method, url);
if self.env.pageserver.http_auth_type == AuthType::NeonJWT {
if self.conf.http_auth_type == AuthType::NeonJWT {
let token = self
.env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;

View File

@@ -30,7 +30,7 @@ cleanup() {
echo "clean up containers if exists"
cleanup
for pg_version in 14 15; do
for pg_version in 14 15 16; do
echo "start containers (pg_version=$pg_version)."
PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d

View File

@@ -0,0 +1,281 @@
# Crash-Consistent Layer Map Updates By Leveraging `index_part.json`
* Created on: Aug 23, 2023
* Author: Christian Schwarz
## Summary
This RFC describes a simple scheme to make layer map updates crash consistent by leveraging the `index_part.json` in remote storage.
Without such a mechanism, crashes can induce certain edge cases in which broadly held assumptions about system invariants don't hold.
## Motivation
### Background
We can currently easily make complex, atomic updates to the layer map by means of an RwLock.
If we crash or restart pageserver, we reconstruct the layer map from:
1. local timeline directory contents
2. remote `index_part.json` contents.
The function that is responsible for this is called `Timeline::load_layer_map()`.
The reconciliation process's behavior is the following:
* local-only files will become part of the layer map as local-only layers and be rescheduled for upload
* For a layer that, going by its file name, is present both locally and in the remote `index_part.json`, but whose local file has a different size (future: checksum) than the remote file, we delete the local file and leave the remote file as a `RemoteLayer` in the layer map (sketched below).
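As an illustration, here is a minimal Rust sketch of that reconciliation, with made-up names and `name -> size` maps standing in for the real layer map and `IndexPart` types; the actual `Timeline::load_layer_map()` is considerably more involved:

```rust
use std::collections::HashMap;

// Hypothetical result type: the real code builds Layer/RemoteLayer objects.
enum ReconciledLayer {
    Local { name: String, size: u64 },  // usable local file; reschedule for upload if needed
    Remote { name: String, size: u64 }, // only the remote copy is trusted
}

fn reconcile(
    local: &HashMap<String, u64>,  // layer file name -> size in the local timeline dir
    remote: &HashMap<String, u64>, // layer file name -> size recorded in index_part.json
) -> Vec<ReconciledLayer> {
    let mut layers = Vec::new();
    for (name, &local_size) in local {
        match remote.get(name) {
            // Local-only file: keep it as a local layer and reschedule its upload.
            None => layers.push(ReconciledLayer::Local { name: name.clone(), size: local_size }),
            // Known remotely with the same size: the local copy is usable as-is.
            Some(&remote_size) if remote_size == local_size => {
                layers.push(ReconciledLayer::Local { name: name.clone(), size: local_size })
            }
            // Size mismatch: delete the local file, keep the remote one as a RemoteLayer.
            Some(&remote_size) => {
                layers.push(ReconciledLayer::Remote { name: name.clone(), size: remote_size })
            }
        }
    }
    // Remote-only layers stay in the map, available for on-demand download.
    for (name, &size) in remote {
        if !local.contains_key(name) {
            layers.push(ReconciledLayer::Remote { name: name.clone(), size });
        }
    }
    layers
}
```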
### The Problem
There are cases where we need to make an atomic update to the layer map that involves **more than one layer**.
The best example is compaction, where we need to insert the L1 layers generated from the L0 layers, and remove the L0 layers.
As stated above, making the update to the layer map in atomic way is trivial.
But, there is no system call API to make an atomic update to a directory that involves more than one file rename and deletion.
Currently, we issue the system calls one by one and hope we don't crash.
What happens if we crash and restart in the middle of that system call sequence?
We will reconstruct the layer map according to the reconciliation process, taking as input whatever transitory state the timeline directory ended up in.
We cannot roll back or complete the timeline directory update during which we crashed, because we keep no record of the changes we plan to make.
### Problem's Implications For Compaction
The implications of the above are primarily problematic for compaction.
Specifically, the part of it that compacts L0 layers into L1 layers.
Remember that compaction takes a set of L0 layers and reshuffles the delta records in them into L1 layer files.
Once the L1 layer files are written to disk, it atomically removes the L0 layers from the layer map and adds the L1 layers to the layer map.
It then deletes the L0 layers locally, and schedules an upload of the L1 layers and an updated index part.
If we crash before deleting L0s, but after writing out L1s, the next compaction after restart will re-digest the L0s and produce new L1s.
This means the compaction after restart will **overwrite** the previously written L1s.
Currently we also schedule an S3 upload of the overwritten L1.
If the compaction algorithm doesn't change between the two compaction runs, is deterministic, and uses the same set of L0s as input, then the second run will produce identical L1s and the overwrites will go unnoticed.
*However*:
1. the file size of the overwritten L1s may not be identical, and
2. the bit pattern of the overwritten L1s may not be identical, and,
3. in the future, we may want to make the compaction code non-deterministic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite
The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted).
For example, if an unresponsive node A becomes active again after control plane has relocated the tenant to a new node B, the node A may overwrite some L1s.
But node B based its world view on the version of node A's `index_part.json` from _before_ the overwrite.
That earlier `index_part.json` contained the file size of the pre-overwrite L1.
If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1.
Effectively, the data in the L1 has become inaccessible to node B.
If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same problem.
If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems.
In case of (1) and (2), where we know that the logical content of the layers is still the same, we can recover by manually patching the `index_part.json` of the new node to the overwritten L1's file size / checksum.
But if (3) ever happens, the logical content may be different, and, we could have truly lost data.
Given the above considerations, we should avoid making correctness of split-brain protection dependent on overwrites preserving _logical_ layer file contents.
**It is a much cleaner separation of concerns to require that layer files are truly immutable in S3, i.e., PUT once and then only DELETEd, never overwritten (overPUTted).**
## Design
Instead of reconciling a layer map from local timeline directory contents and remote index part, this RFC proposes to view the remote index part as authoritative during timeline load.
Local layer files will be recognized if they match what's listed in remote index part, and removed otherwise.
During **timeline load**, the only thing that matters is the remote index part content.
Essentially, timeline load becomes much like attach, except we don't need to prefix-list the remote timelines.
The local timeline dir's `metadata` file does not matter.
The layer files in the local timeline dir are seen as a nice-to-have cache of layer files that are in the remote index part.
Any layer files in the local timeline dir that aren't in the remote index part are removed during startup.
The `Timeline::load_layer_map()` no longer "merges" local timeline dir contents with the remote index part.
Instead, it treats the remote index part as the authoritative layer map.
If the local timeline dir contains a layer that is in the remote index part, that's nice, and we'll re-use it if the file size (and, in the future, checksum) matches what's stated in the index part.
If it doesn't match, we remove the file from the local timeline dir.
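Under the proposed scheme, the load step boils down to something like the following sketch (hypothetical signatures and `name -> size` maps; the real code works with `LayerFileName`, checksums, and the layer map itself):

```rust
use std::collections::HashMap;
use std::path::PathBuf;

// Remote index_part.json is authoritative: local files are only a cache of it.
fn load_layer_map(
    local_files: &HashMap<PathBuf, u64>, // local timeline dir: path -> file size
    index_part: &HashMap<String, u64>,   // remote index: layer name -> file size
) -> std::io::Result<Vec<String>> {
    for (path, &size) in local_files {
        let name = path
            .file_name()
            .map(|n| n.to_string_lossy().into_owned())
            .unwrap_or_default();
        match index_part.get(&name) {
            // Listed in the index part with a matching size: keep it as a cached copy.
            Some(&remote_size) if remote_size == size => {}
            // Not in the index part, or size mismatch: remove it during startup.
            _ => std::fs::remove_file(path)?,
        }
    }
    // The resulting layer map is exactly the set of layers in the index part;
    // anything not cached locally becomes an on-demand-downloadable RemoteLayer.
    Ok(index_part.keys().cloned().collect())
}
```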
After load, **at runtime**, nothing changes compared to what we did before this RFC.
The procedure for single- and multi-object changes is reproduced here for reference:
* For any new layers that the change adds:
* Write them to a temporary location.
* While holding layer map lock:
* Move them to the final location.
* Insert into layer map.
* Make the S3 changes.
We won't reproduce the remote timeline client method calls here because these are subject to change.
Instead, we reproduce the sequence of S3 changes that must result from a given single-/multi-object change (a sketch follows the notes below):
* PUT layer files inserted by the change.
* PUT an index part that has insertions and deletions of the change.
* DELETE the layer files that are deleted by the change.
Note that it is safe for the DELETE to be deferred arbitrarily.
* If it never happens, we leak the object, but, that's not a correctness concern.
* As of #4938, we don't schedule the remote timeline client operation for deletion immediately, but, only when we drop the `LayerInner`.
* With the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919), the deletions will be written to deletion queue for processing when it's safe to do so (see the RFC for details).
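The same ordering, written out as a hedged sketch with hypothetical storage-client names; the actual code schedules these operations through the remote timeline client's upload queue:

```rust
use anyhow::Result;

// Hypothetical stand-ins for the remote storage client and its payloads.
struct LayerFile;
struct IndexPart;

#[allow(async_fn_in_trait)]
trait RemoteStorage {
    async fn put_layer(&self, layer: &LayerFile) -> Result<()>;
    async fn put_index_part(&self, index: &IndexPart) -> Result<()>;
    async fn delete_layer(&self, layer: &LayerFile) -> Result<()>;
}

/// The required S3 ordering for one single-/multi-object change.
async fn apply_remote_change<S: RemoteStorage>(
    storage: &S,
    inserted: &[LayerFile],
    deleted: &[LayerFile],
    new_index: &IndexPart,
) -> Result<()> {
    // 1. PUT the new layer files. Until the index references them, a crash
    //    merely leaks them.
    for layer in inserted {
        storage.put_layer(layer).await?;
    }
    // 2. PUT the index part recording the insertions and deletions; this is
    //    the commit point of the change.
    storage.put_index_part(new_index).await?;
    // 3. DELETE the replaced layers. Safe to defer arbitrarily: skipping it
    //    only leaks objects, it never breaks correctness.
    for layer in deleted {
        storage.delete_layer(layer).await?;
    }
    Ok(())
}
```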
## How This Solves The Problem
If we crash before we've finished the S3 changes, then timeline load will reset layer map to the state that's in the S3 index part.
The S3 change sequence above is obviously crash-consistent.
If we crash before the index part PUT, then we leak the inserted layer files to S3.
If we crash after the index part PUT, we leak the to-be-DELETEd layer files to S3.
Leaking is fine: it's a pre-existing condition and not addressed in this RFC.
Multi-object changes that previously created and removed files in timeline dir are now atomic because the layer map updates are atomic and crash consistent:
* atomic layer map update at runtime, currently by using an RwLock in write mode
* atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic
* local timeline dir state:
* irrelevant for layer map content => irrelevant for atomic updates / crash consistency
* if we crash after index part PUT, local layer files will be used, so no on-demand downloads are needed for them
* if we crash before index part PUT, local layer files will be deleted
## Trade-Offs
### Fundamental
If we crash before finishing the index part PUT, we lose all the work that hasn't reached the S3 `index_part.json`:
* wal ingest: we lose not-yet-uploaded L0s; load on the **safekeepers** + work for pageserver
* compaction: we lose the entire compaction iteration work; need to re-do it again
* gc: no change to what we have today
If the work is still deemed necessary after restart, the restarted pageserver will re-do this work.
The amount of work to re-do is capped by how far the S3 state lags behind the local changes.
Assuming the upload queue allows for unlimited queue depth (which is what it does today), this means:
* on-demand downloads that were needed to do the work: are likely still present, not lost
* wal ingest: currently unbounded
* L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()`
* Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M.
* In practice, most L0s are tiny due to the 10-minute `DEFAULT_CHECKPOINT_TIMEOUT`.
* image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))`
* I have no intuition how expensive / long-running it is in reality.
* gc: `update_gc_info` work (not substantial, AFAIK)
To limit the amount of lost upload work, and ingest work, we can limit the upload queue depth (see suggestions in the next sub-section).
However, to limit the amount of lost CPU work, we would need a way to make the compaction/image-layer-generation algorithms interruptible & resumable.
We aren't there yet; the need for it is tracked by [#4580](https://github.com/neondatabase/neon/issues/4580).
However, this RFC is not constraining the design space either.
### Practical
#### Pageserver Restarts
Pageserver crashes are very rare; it would likely be acceptable to re-do the lost work in that case.
However, regular pageserver restarts happen frequently, e.g., during weekly deploys.
In general, pageserver restart faces the problem of tenants that "take too long" to shut down.
They are a problem because other tenants that shut down quickly are unavailable while we wait for the slow tenants to shut down.
We currently allot 10 seconds for graceful shutdown before we SIGKILL the pageserver process (as per the `pageserver.service` unit file).
A longer budget would expose tenants that are done early to a longer downtime.
A short budget would risk throwing away more work that'd have to be re-done after restart.
In the context of this RFC, killing the process would mean losing the work that hasn't made it to S3.
We can mitigate this problem as follows:
0. initially, by accepting that we need to do the work again
1. short-term, introducing measures to cap the amount of in-flight work:
- cap upload queue length, use backpressure to slow down compaction (see the sketch after this list)
- disabling compaction/image-layer-generation X minutes before `systemctl restart pageserver`
- introducing a read-only shutdown state for tenants that are fast to shut down;
that state would be equivalent to the state of a tenant in hot standby / readonly mode.
2. mid term, by not restarting pageserver in place, but using [*seamless tenant migration*](https://github.com/neondatabase/neon/pull/5029) to drain a pageserver's tenants before we restart it.
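For item 1, a minimal sketch of a bounded upload queue with backpressure, using a tokio bounded channel. `UploadOp` and the queue capacity are hypothetical; this is not the actual upload-queue code:

```rust
use tokio::sync::mpsc;

// Hypothetical upload operation, standing in for the remote timeline client's queue items.
struct UploadOp {
    layer_name: String,
}

#[tokio::main]
async fn main() {
    // Bounded queue: once 128 ops are pending, `send().await` blocks the
    // producer (e.g. compaction), which is the backpressure that caps how
    // much un-uploaded work can be lost on restart.
    let (tx, mut rx) = mpsc::channel::<UploadOp>(128);

    let worker = tokio::spawn(async move {
        while let Some(op) = rx.recv().await {
            // ... perform the S3 PUT for `op` here ...
            println!("uploaded {}", op.layer_name);
        }
    });

    // Producer side: compaction scheduling an upload; blocks if the queue is full.
    tx.send(UploadOp {
        layer_name: "example-delta-layer".to_string(),
    })
    .await
    .expect("upload worker exited");

    drop(tx); // close the queue so the worker drains and exits
    worker.await.unwrap();
}
```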
#### `disk_consistent_lsn` can go backwards
`disk_consistent_lsn` can go backwards across restarts if we crash before we've finished the index part PUT.
Nobody should care about it, because the only thing that matters is `remote_consistent_lsn`.
Compute certainly doesn't care about `disk_consistent_lsn`.
## Side-Effects Of This Design
* local `metadata` is basically reduced to a cache of which timelines exist for this tenant; i.e., we can avoid a `ListObjects` request for a tenant's timelines during tenant load.
## Limitations
Multi-object changes that span multiple timelines aren't covered by this RFC.
That's fine because we currently don't need them, as evidenced by the absence
of a Pageserver operation that holds multiple timelines' layer map lock at a time.
## Impacted components
Primarily pageservers.
Safekeepers will experience more load when we need to re-ingest WAL because we've thrown away work.
No changes to safekeepers are needed.
## Alternatives considered
### Alternative 1: WAL
We could have a local WAL for timeline dir changes, as proposed in https://github.com/neondatabase/neon/issues/4418 and partially implemented in https://github.com/neondatabase/neon/pull/4422.
The WAL would be used to
1. make multi-object changes atomic
2. replace `reconcile_with_remote()` reconciliation: scheduling of layer upload would be part of WAL replay.
The WAL is appealing in a local-first world, but, it's much more complex than the design described above:
* New on-disk state to get right.
* Forward- and backward-compatibility development costs in the future.
### Alternative 2: Flow Everything Through `index_part.json`
We could have gone to the other extreme and **only** update the layer map whenever we've PUT `index_part.json`.
I.e., layer map would always be the last-persisted S3 state.
That's axiomatically beautiful, not least because it fully separates the layer file production and consumption path (=> [layer file spreading proposal](https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=4)).
And it might make hot standbys / read-only pageservers less of a special case in the future.
But, I have some uncertainties with regard to WAL ingestion, because it needs to be able to do some reads for the logical size feedback to safekeepers.
And it's silly that we wouldn't be able to use the results of compaction or image layer generation before we're done with the upload.
Lastly, a temporarily clogged-up upload queue (e.g. S3 is down) shouldn't immediately render ingestion unavailable.
### Alternative 3: Sequence Numbers For Layers
Instead of what's proposed in this RFC, we could use unique numbers to identify layer files:
```
# before
tenants/$tenant/timelines/$timeline/$key_and_lsn_range
# after
tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range
```
To guarantee uniqueness, the unique number is a sequence number, stored in `index_part.json`.
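A hedged sketch of how such an id could be allocated from a counter kept alongside the other `index_part.json` contents; the field and helper names are made up for illustration:

```rust
// Hypothetical sketch: unique layer file ids allocated from a counter that is
// persisted as part of index_part.json.
struct IndexPart {
    next_layer_file_id: u64,
    // ... layer metadata, disk_consistent_lsn, etc.
}

fn new_layer_file_name(index: &mut IndexPart, key_and_lsn_range: &str) -> String {
    let id = index.next_layer_file_id;
    // The incremented counter is persisted together with the next index_part.json PUT.
    index.next_layer_file_id += 1;
    format!("{id:016x}-{key_and_lsn_range}")
}
```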
This alternative does not solve atomic layer map updates.
In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers.
In fact, this alternative makes it worse, because the data is now duplicated between the L1s written by the interrupted compaction run and the L1s written by the re-run after the crash.
We'd need to write a deduplication pass that checks if perfectly overlapping layers have identical contents.
However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC.
So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3).
But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact makes them much more acute.
The proposed design in this RFC addresses both.
So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top.
That way, we avoid a phase where the crash-during-compaction problem is acute.
## Related issues
- https://github.com/neondatabase/neon/issues/4749
- https://github.com/neondatabase/neon/issues/4418
- https://github.com/neondatabase/neon/pull/4422
- https://github.com/neondatabase/neon/issues/5077
- https://github.com/neondatabase/neon/issues/4088
- (re)resolutions:
- https://github.com/neondatabase/neon/pull/4696
- https://github.com/neondatabase/neon/pull/4094
- https://neondb.slack.com/archives/C033QLM5P7D/p1682519017949719
Note that the test case introduced in https://github.com/neondatabase/neon/pull/4696/files#diff-13114949d1deb49ae394405d4c49558adad91150ba8a34004133653a8a5aeb76 will produce L1s with the same logical content, but, as outlined in the last paragraph of the _Problem Statement_ section above, we don't want to make that assumption in order to fix the problem.
## Implementation Plan
1. Remove support for `remote_storage=None`, because we now rely on the existence of an index part.
- The nasty part here is to fix all the tests that fiddle with the local timeline directory.
Possibly they are just irrelevant with this change, but, each case will require inspection.
2. Implement the design above.
- Initially, ship without the mitigations for restart and accept we will do some work twice.
- Measure the impact and implement one of the mitigations.

View File

@@ -89,6 +89,8 @@ impl RemoteExtSpec {
&self,
ext_name: &str,
is_library: bool,
build_tag: &str,
pg_major_version: &str,
) -> anyhow::Result<(String, RemotePath)> {
let mut real_ext_name = ext_name;
if is_library {
@@ -104,11 +106,32 @@ impl RemoteExtSpec {
.ok_or(anyhow::anyhow!("library {} is not found", lib_raw_name))?;
}
// Check if extension is present in public or custom.
// If not, then it is not allowed to be used by this compute.
if let Some(public_extensions) = &self.public_extensions {
if !public_extensions.contains(&real_ext_name.to_string()) {
if let Some(custom_extensions) = &self.custom_extensions {
if !custom_extensions.contains(&real_ext_name.to_string()) {
return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
}
}
}
}
match self.extension_data.get(real_ext_name) {
Some(ext_data) => Ok((
real_ext_name.to_string(),
RemotePath::from_string(&ext_data.archive_path)?,
)),
Some(_ext_data) => {
// Construct the path to the extension archive
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
//
// Keep it in sync with path generation in
// https://github.com/neondatabase/build-custom-extensions/tree/main
let archive_path_str =
format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
Ok((
real_ext_name.to_string(),
RemotePath::from_string(&archive_path_str)?,
))
}
None => Err(anyhow::anyhow!(
"real_ext_name {} is not found",
real_ext_name

View File

@@ -12,7 +12,6 @@ const_format.workspace = true
anyhow.workspace = true
bytes.workspace = true
byteorder.workspace = true
hex.workspace = true
utils.workspace = true
postgres_ffi.workspace = true
enum-map.workspace = true

View File

@@ -1,63 +1,22 @@
/// Types in this file are for pageserver's upward-facing API calls to the control plane
use hex::FromHex;
//! Types in this file are for pageserver's upward-facing API calls to the control plane,
//! required for acquiring and validating tenant generation numbers.
//!
//! See docs/rfcs/025-generation-numbers.md
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{NodeId, TenantId};
/// TenantId's serialization is an array of u8, which is rather unfriendly
/// for outside callers who aren't working with the native Rust TenantId.
/// This class wraps it in serialization that is just the hex strict
/// representation.
#[derive(Eq, PartialEq, Clone, Hash)]
pub struct HexTenantId(TenantId);
impl HexTenantId {
pub fn new(t: TenantId) -> Self {
Self(t)
}
pub fn take(self) -> TenantId {
self.0
}
}
impl AsRef<TenantId> for HexTenantId {
fn as_ref(&self) -> &TenantId {
&self.0
}
}
impl Serialize for HexTenantId {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let hex = self.0.hex_encode();
serializer.collect_str(&hex)
}
}
impl<'de> Deserialize<'de> for HexTenantId {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let string = String::deserialize(deserializer)?;
TenantId::from_hex(string)
.map(|t| HexTenantId::new(t))
.map_err(|e| serde::de::Error::custom(format!("{e}")))
}
}
// Top level s
#[derive(Serialize, Deserialize)]
pub struct ReAttachRequest {
pub node_id: NodeId,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct ReAttachResponseTenant {
pub id: HexTenantId,
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId,
pub generation: u32,
}
@@ -66,9 +25,11 @@ pub struct ReAttachResponse {
pub tenants: Vec<ReAttachResponseTenant>,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct ValidateRequestTenant {
pub id: HexTenantId,
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId,
pub gen: u32,
}
@@ -82,8 +43,10 @@ pub struct ValidateResponse {
pub tenants: Vec<ValidateResponseTenant>,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct ValidateResponseTenant {
pub id: HexTenantId,
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId,
pub valid: bool,
}

View File

@@ -201,6 +201,15 @@ pub struct TenantCreateRequest {
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}
#[serde_as]
#[derive(Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLoadRequest {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub generation: Option<u32>,
}
impl std::ops::Deref for TenantCreateRequest {
type Target = TenantConfig;
@@ -372,6 +381,8 @@ pub struct TimelineInfo {
pub pg_version: u32,
pub state: TimelineState,
pub walreceiver_status: String,
}
#[derive(Debug, Clone, Serialize)]

View File

@@ -10,9 +10,11 @@ should be auto-generated too, but that's a TODO.
The PostgreSQL on-disk file format is not portable across different
CPU architectures and operating systems. It is also subject to change
in each major PostgreSQL version. Currently, this module supports
PostgreSQL v14 and v15: bindings and code that depends on them are version-specific.
This code is organized in modules: `postgres_ffi::v14` and `postgres_ffi::v15`
Version independend code is explicitly exported into shared `postgres_ffi`.
PostgreSQL v14, v15 and v16: bindings and code that depends on them are
version-specific.
This code is organized in modules `postgres_ffi::v14`, `postgres_ffi::v15` and
`postgres_ffi::v16`. Version independent code is explicitly exported into
shared `postgres_ffi`.
TODO: Currently, there is also some code that deals with WAL records

View File

@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
PathBuf::from("pg_install")
};
for pg_version in &["v14", "v15"] {
for pg_version in &["v14", "v15", "v16"] {
let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
if pg_install_dir_versioned.is_relative() {
let cwd = env::current_dir().context("Failed to get current_dir")?;
@@ -125,6 +125,7 @@ fn main() -> anyhow::Result<()> {
.allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
.allowlist_type("PageHeaderData")
.allowlist_type("DBState")
.allowlist_type("RelMapFile")
// Because structs are used for serialization, tell bindgen to emit
// explicit padding fields.
.explicit_padding(true)

View File

@@ -51,11 +51,59 @@ macro_rules! for_all_postgres_versions {
($macro:tt) => {
$macro!(v14);
$macro!(v15);
$macro!(v16);
};
}
for_all_postgres_versions! { postgres_ffi }
/// dispatch_pgversion
///
/// Run a code block in a context where the postgres_ffi bindings for a
/// specific (supported) PostgreSQL version are `use`-ed in scope under the pgv
/// identifier.
/// If the provided pg_version is not supported, we panic!(), unless the
/// optional third argument was provided (in which case that code will provide
/// the default handling instead).
///
/// Use like
///
/// dispatch_pgversion!(my_pgversion, { pgv::constants::XLOG_DBASE_CREATE })
/// dispatch_pgversion!(my_pgversion, pgv::constants::XLOG_DBASE_CREATE)
///
/// Other uses are for macro-internal purposes only and strictly unsupported.
///
#[macro_export]
macro_rules! dispatch_pgversion {
($version:expr, $code:expr) => {
dispatch_pgversion!($version, $code, panic!("Unknown PostgreSQL version {}", $version))
};
($version:expr, $code:expr, $invalid_pgver_handling:expr) => {
dispatch_pgversion!(
$version => $code,
default = $invalid_pgver_handling,
pgversions = [
14 : v14,
15 : v15,
16 : v16,
]
)
};
($pgversion:expr => $code:expr,
default = $default:expr,
pgversions = [$($sv:literal : $vsv:ident),+ $(,)?]) => {
match ($pgversion) {
$($sv => {
use $crate::$vsv as pgv;
$code
},)+
_ => {
$default
}
}
};
}
pub mod pg_constants;
pub mod relfile_utils;
@@ -90,13 +138,7 @@ pub use v14::xlog_utils::XLogFileName;
pub use v14::bindings::DBState_DB_SHUTDOWNED;
pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
match version {
14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0),
15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0),
_ => anyhow::bail!("Unknown version {}", version),
}
dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info)))
}
pub fn generate_wal_segment(
@@ -107,11 +149,11 @@ pub fn generate_wal_segment(
) -> Result<Bytes, SerializeError> {
assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE));
match pg_version {
14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
_ => Err(SerializeError::BadInput),
}
dispatch_pgversion!(
pg_version,
pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn),
Err(SerializeError::BadInput)
)
}
pub fn generate_pg_control(
@@ -120,11 +162,11 @@ pub fn generate_pg_control(
lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<(Bytes, u64)> {
match pg_version {
14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
_ => anyhow::bail!("Unknown version {}", pg_version),
}
dispatch_pgversion!(
pg_version,
pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
anyhow::bail!("Unknown version {}", pg_version)
)
}
// PG timeline is always 1, changing it doesn't have any useful meaning in Neon.
@@ -196,8 +238,6 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
}
pub mod waldecoder {
use crate::{v14, v15};
use bytes::{Buf, Bytes, BytesMut};
use std::num::NonZeroU32;
use thiserror::Error;
@@ -248,22 +288,17 @@ pub mod waldecoder {
}
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
match self.pg_version {
// This is a trick to support both versions simultaneously.
// See WalStreamDecoderHandler comments.
14 => {
use self::v14::waldecoder_handler::WalStreamDecoderHandler;
dispatch_pgversion!(
self.pg_version,
{
use pgv::waldecoder_handler::WalStreamDecoderHandler;
self.poll_decode_internal()
}
15 => {
use self::v15::waldecoder_handler::WalStreamDecoderHandler;
self.poll_decode_internal()
}
_ => Err(WalDecodeError {
},
Err(WalDecodeError {
msg: format!("Unknown version {}", self.pg_version),
lsn: self.lsn,
}),
}
})
)
}
}
}

View File

@@ -163,6 +163,20 @@ pub const RM_HEAP2_ID: u8 = 9;
pub const RM_HEAP_ID: u8 = 10;
pub const RM_LOGICALMSG_ID: u8 = 21;
// from neon_rmgr.h
pub const RM_NEON_ID: u8 = 134;
pub const XLOG_NEON_HEAP_INIT_PAGE: u8 = 0x80;
pub const XLOG_NEON_HEAP_INSERT: u8 = 0x00;
pub const XLOG_NEON_HEAP_DELETE: u8 = 0x10;
pub const XLOG_NEON_HEAP_UPDATE: u8 = 0x20;
pub const XLOG_NEON_HEAP_HOT_UPDATE: u8 = 0x30;
pub const XLOG_NEON_HEAP_LOCK: u8 = 0x40;
pub const XLOG_NEON_HEAP_MULTI_INSERT: u8 = 0x50;
pub const XLOG_NEON_HEAP_VISIBLE: u8 = 0x40;
// from xlogreader.h
pub const XLR_INFO_MASK: u8 = 0x0F;
pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;

View File

@@ -3,3 +3,8 @@ pub const XLOG_DBASE_DROP: u8 = 0x10;
pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
(bimg_info & BKPIMAGE_IS_COMPRESSED) != 0
}

View File

@@ -1,10 +1,18 @@
pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;
pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00;
pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
pub const XLOG_DBASE_DROP: u8 = 0x20;
pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
(bimg_info & ANY_COMPRESS_FLAG) != 0
}

View File

@@ -0,0 +1,18 @@
pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;
pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
pub const XLOG_DBASE_DROP: u8 = 0x20;
pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
(bimg_info & ANY_COMPRESS_FLAG) != 0
}

View File

@@ -49,9 +49,9 @@ impl Conf {
pub fn pg_distrib_dir(&self) -> anyhow::Result<PathBuf> {
let path = self.pg_distrib_dir.clone();
#[allow(clippy::manual_range_patterns)]
match self.pg_version {
14 => Ok(path.join(format!("v{}", self.pg_version))),
15 => Ok(path.join(format!("v{}", self.pg_version))),
14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))),
_ => bail!("Unsupported postgres version: {}", self.pg_version),
}
}
@@ -250,11 +250,18 @@ fn craft_internal<C: postgres::GenericClient>(
let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
let last_lsn = match last_lsn {
None => client.pg_current_wal_insert_lsn()?,
Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
Ordering::Less => bail!("Some records were inserted after the crafted WAL"),
Ordering::Equal => last_lsn,
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
},
Some(last_lsn) => {
let insert_lsn = client.pg_current_wal_insert_lsn()?;
match last_lsn.cmp(&insert_lsn) {
Ordering::Less => bail!(
"Some records were inserted after the crafted WAL: {} vs {}",
last_lsn,
insert_lsn
),
Ordering::Equal => last_lsn,
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
}
}
};
if !intermediate_lsns.starts_with(&[initial_lsn]) {
intermediate_lsns.insert(0, initial_lsn);
@@ -363,8 +370,9 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
);
ensure!(
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
"XLOG_SWITCH message ended not on page boundary: {}",
after_xlog_switch
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
after_xlog_switch,
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
);
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
}

View File

@@ -959,7 +959,7 @@ mod tests {
let make_params = |options| StartupMessageParams::new([("options", options)]);
let params = StartupMessageParams::new([]);
assert!(matches!(params.options_escaped(), None));
assert!(params.options_escaped().is_none());
let params = make_params("");
assert!(split_options(&params).is_empty());

View File

@@ -13,14 +13,13 @@ use std::{
collections::HashMap,
fmt::Debug,
num::{NonZeroU32, NonZeroUsize},
path::{Path, PathBuf, StripPrefixError},
path::{Path, PathBuf},
pin::Pin,
sync::Arc,
};
use anyhow::{bail, Context};
use serde::{Deserialize, Serialize};
use tokio::io;
use toml_edit::Item;
use tracing::info;
@@ -45,34 +44,12 @@ pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
// From the S3 spec
pub const MAX_KEYS_PER_DELETE: usize = 1000;
/// Path on the remote storage, relative to some inner prefix.
/// The prefix is an implementation detail, that allows representing local paths
/// as the remote ones, stripping the local storage prefix away.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RemotePath(PathBuf);
impl Serialize for RemotePath {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(self)
}
}
impl<'de> Deserialize<'de> for RemotePath {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let str = String::deserialize(deserializer)?;
Ok(Self(PathBuf::from(&str)))
}
}
impl std::fmt::Display for RemotePath {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0.display())
@@ -111,15 +88,6 @@ impl RemotePath {
pub fn extension(&self) -> Option<&str> {
self.0.extension()?.to_str()
}
/// Unwrap the PathBuf that RemotePath wraps
pub fn take(self) -> PathBuf {
self.0
}
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, StripPrefixError> {
self.0.strip_prefix(&p.0)
}
}
/// Storage (potentially remote) API to manage its state.
@@ -198,8 +166,6 @@ pub enum DownloadError {
BadInput(anyhow::Error),
/// The file was not found in the remote storage.
NotFound,
/// The client was shut down
Shutdown,
/// The file was found in the remote storage, but the download failed.
Other(anyhow::Error),
}
@@ -211,7 +177,6 @@ impl std::fmt::Display for DownloadError {
write!(f, "Failed to download a remote file due to user input: {e}")
}
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
DownloadError::Shutdown => write!(f, "Client shutting down"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
}
}
@@ -276,18 +241,6 @@ impl GenericRemoteStorage {
}
}
/// For small, simple downloads where caller doesn't want to handle the streaming: return the full body
pub async fn download_all(&self, from: &RemotePath) -> Result<Vec<u8>, DownloadError> {
let mut download = self.download(from).await?;
let mut bytes = Vec::new();
tokio::io::copy(&mut download.download_stream, &mut bytes)
.await
.with_context(|| format!("Failed to download body from {from}"))
.map_err(DownloadError::Other)?;
Ok(bytes)
}
pub async fn download_byte_range(
&self,
from: &RemotePath,

View File

@@ -155,18 +155,20 @@ impl RemoteStorage for LocalFs {
// the local filesystem we need a directory to start calling read_dir on.
let mut initial_dir = full_path.clone();
match fs::metadata(full_path.clone()).await {
Err(e) => {
// It's not a file that exists: strip the prefix back to the parent directory
if matches!(e.kind(), ErrorKind::NotFound) {
initial_dir.pop();
}
}
Ok(meta) => {
if !meta.is_dir() {
// It's not a directory: strip back to the parent
initial_dir.pop();
}
}
Err(e) if e.kind() == ErrorKind::NotFound => {
// It's not a file that exists: strip the prefix back to the parent directory
initial_dir.pop();
}
Err(e) => {
// Unexpected I/O error
anyhow::bail!(e)
}
}
// Note that PathBuf starts_with only considers full path segments, but

View File

@@ -22,7 +22,7 @@ use aws_sdk_s3::{
Client,
};
use aws_smithy_http::body::SdkBody;
use hyper::{Body, StatusCode};
use hyper::Body;
use scopeguard::ScopeGuard;
use tokio::{
io::{self, AsyncRead},
@@ -529,16 +529,7 @@ impl RemoteStorage for S3Bucket {
}
}
Err(e) => {
if let Some(r) = e.raw_response() {
if r.http().status() == StatusCode::NOT_FOUND {
// 404 is acceptable for deletions. AWS S3 does not return this, but
// some other implementations might (e.g. GCS XML API returns 404 on DeleteObject
// to a missing key)
continue;
} else {
return Err(anyhow::format_err!("DeleteObjects response error: {e}"));
}
}
return Err(e.into());
}
}
}
@@ -582,7 +573,7 @@ mod tests {
#[test]
fn relative_path() {
let all_paths = vec!["", "some/path", "some/path/"];
let all_paths = ["", "some/path", "some/path/"];
let all_paths: Vec<RemotePath> = all_paths
.iter()
.map(|x| RemotePath::new(Path::new(x)).expect("bad path"))

View File

@@ -9,11 +9,12 @@ PORT=$4
SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
rm -fr "$DATA_DIR"
env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
echo port="$PORT" >> "$DATA_DIR"/postgresql.conf
echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
declare -i WAL_SIZE=$REDO_POS+114
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done

View File

@@ -1,8 +1,13 @@
use std::fmt::Display;
use std::fmt::Debug;
use serde::{Deserialize, Serialize};
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
/// Tenant generations are used to provide split-brain safety and allow
/// multiple pageservers to attach the same tenant concurrently.
///
/// See docs/rfcs/025-generation-numbers.md for detail on how generation
/// numbers are used.
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
pub enum Generation {
// Generations with this magic value will not add a suffix to S3 keys, and will not
// be included in persisted index_part.json. This value is only to be used
@@ -48,6 +53,7 @@ impl Generation {
matches!(self, Self::None)
}
#[track_caller]
pub fn get_suffix(&self) -> String {
match self {
Self::Valid(v) => {
@@ -60,19 +66,27 @@ impl Generation {
}
}
pub fn previous(&self) -> Self {
if let Self::Valid(v) = self {
Self::new(v - 1)
} else {
Self::none()
}
/// `suffix` is the part after "-" in a key
///
/// Returns None if parsing was unsuccessful
pub fn parse_suffix(suffix: &str) -> Option<Generation> {
u32::from_str_radix(suffix, 16).map(Generation::new).ok()
}
pub fn into(self) -> Option<u32> {
if let Self::Valid(v) = self {
Some(v)
} else {
None
#[track_caller]
pub fn previous(&self) -> Generation {
match self {
Self::Valid(n) => {
if *n == 0 {
// Since a tenant may be upgraded from a pre-generations state, interpret the "previous" generation
// to 0 as being "no generation".
Self::None
} else {
Self::Valid(n - 1)
}
}
Self::None => Self::None,
Self::Broken => panic!("Attempted to use a broken generation"),
}
}
}
@@ -89,7 +103,7 @@ impl Serialize for Generation {
// that include an optional generation should convert None to an
// Option<Generation>::None
Err(serde::ser::Error::custom(
"Tried to serialize invalid generation",
"Tried to serialize invalid generation ({self})",
))
}
}
@@ -104,7 +118,10 @@ impl<'de> Deserialize<'de> for Generation {
}
}
impl Display for Generation {
// We intentionally do not implement Display for Generation, to reduce the
// risk of a bug where the generation is used in a format!() string directly
// instead of using get_suffix().
impl Debug for Generation {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Valid(v) => {

View File

@@ -24,9 +24,6 @@ pub enum ApiError {
#[error("Precondition failed: {0}")]
PreconditionFailed(Box<str>),
#[error("Shutting down")]
ShuttingDown,
#[error(transparent)]
InternalServerError(anyhow::Error),
}
@@ -55,10 +52,6 @@ impl ApiError {
self.to_string(),
StatusCode::PRECONDITION_FAILED,
),
ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
"Shutting down".to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,

View File

@@ -50,7 +50,7 @@ impl Id {
Id::from(tli_buf)
}
pub fn hex_encode(&self) -> String {
fn hex_encode(&self) -> String {
static HEX: &[u8] = b"0123456789abcdef";
let mut buf = vec![0u8; self.0.len() * 2];
@@ -133,10 +133,6 @@ macro_rules! id_newtype {
pub const fn from_array(b: [u8; 16]) -> Self {
$t(Id(b))
}
pub fn hex_encode(&self) -> String {
self.0.hex_encode()
}
}
impl FromStr for $t {
@@ -248,13 +244,13 @@ id_newtype!(TenantId);
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
/// See [`Id`] for alternative ways to serialize it.
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
pub struct ConnectionId(Id);
id_newtype!(ConnectionId);
// A pair uniquely identifying Neon instance.
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)]
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TenantTimelineId {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
@@ -277,36 +273,6 @@ impl TenantTimelineId {
}
}
impl Serialize for TenantTimelineId {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(self)
}
}
impl<'de> Deserialize<'de> for TenantTimelineId {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let str = String::deserialize(deserializer)?;
if let Some((tenant_part, timeline_part)) = str.split_once('/') {
Ok(Self {
tenant_id: TenantId(Id::from_hex(tenant_part).map_err(|e| {
serde::de::Error::custom(format!("Malformed tenant in TenantTimelineId: {e}"))
})?),
timeline_id: TimelineId(Id::from_hex(timeline_part).map_err(|e| {
serde::de::Error::custom(format!("Malformed timeline in TenantTimelineId {e}"))
})?),
})
} else {
Err(serde::de::Error::custom("Malformed TenantTimelineId"))
}
}
}
impl fmt::Display for TenantTimelineId {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}/{}", self.tenant_id, self.timeline_id)

View File

@@ -178,14 +178,17 @@ pub async fn ws_handler(
/// Starts the monitor. If startup fails or the monitor exits, an error will
/// be logged and our internal state will be reset to allow for new connections.
#[tracing::instrument(skip_all, fields(?args))]
#[tracing::instrument(skip_all)]
async fn start_monitor(
ws: WebSocket,
args: &Args,
kill: broadcast::Receiver<()>,
token: CancellationToken,
) {
info!("accepted new websocket connection -> starting monitor");
info!(
?args,
"accepted new websocket connection -> starting monitor"
);
let timeout = Duration::from_secs(4);
let monitor = tokio::time::timeout(
timeout,

View File

@@ -5,6 +5,7 @@
//! all functionality.
use std::sync::Arc;
use std::time::{Duration, Instant};
use std::{fmt::Debug, mem};
use anyhow::{bail, Context};
@@ -36,6 +37,8 @@ pub struct Runner {
/// by us vs the autoscaler-agent.
counter: usize,
last_upscale_request_at: Option<Instant>,
/// A signal to kill the main thread produced by `self.run()`. This is triggered
/// when the server receives a new connection. When the thread receives the
/// signal off this channel, it will gracefully shutdown.
@@ -99,6 +102,7 @@ impl Runner {
cgroup: None,
dispatcher,
counter: 1, // NB: must be odd, see the comment about the field for more.
last_upscale_request_at: None,
kill,
};
@@ -397,6 +401,20 @@ impl Runner {
if request.is_none() {
bail!("failed to listen for upscale event from cgroup")
}
// If it's been less than 1 second since the last time we requested upscaling,
// ignore the event, to avoid spamming the agent (otherwise, this can happen
// ~1k times per second).
if let Some(t) = self.last_upscale_request_at {
let elapsed = t.elapsed();
if elapsed < Duration::from_secs(1) {
info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring");
continue;
}
}
self.last_upscale_request_at = Some(Instant::now());
info!("cgroup asking for upscale; forwarding request");
self.counter += 2; // Increment, preserving parity (i.e. keep the
// counter odd). See the field comment for more.

View File

@@ -3,6 +3,7 @@
//! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.
use anyhow::Result;
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::ops::Range;
@@ -96,7 +97,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
let file = FileBlockReader::new(VirtualFile::open(path)?);
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
@@ -142,12 +143,12 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
let mut total_delta_layers = 0usize;
let mut total_image_layers = 0usize;
let mut total_excess_layers = 0usize;
for tenant in fs::read_dir(storage_path.join("tenants"))? {
for tenant in fs::read_dir(storage_path.join(TENANTS_SEGMENT_NAME))? {
let tenant = tenant?;
if !tenant.file_type()?.is_dir() {
continue;
}
for timeline in fs::read_dir(tenant.path().join("timelines"))? {
for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? {
let timeline = timeline?;
if !timeline.file_type()?.is_dir() {
continue;

View File

@@ -5,6 +5,7 @@ use clap::Subcommand;
use pageserver::tenant::block_io::BlockCursor;
use pageserver::tenant::disk_btree::DiskBtreeReader;
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use pageserver::{page_cache, virtual_file};
use pageserver::{
repository::{Key, KEY_SIZE},
@@ -47,7 +48,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
let path = path.as_ref();
virtual_file::init(10);
page_cache::init(100);
let file = FileBlockReader::new(VirtualFile::open(path)?);
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
@@ -68,7 +69,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
},
)
.await?;
let cursor = BlockCursor::new_fileblockreader_virtual(&file);
let cursor = BlockCursor::new_fileblockreader(&file);
for (k, v) in all {
let value = cursor.read_blob(v.pos()).await?;
println!("key:{} value_len:{}", k, value.len());
@@ -80,13 +81,13 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
match cmd {
LayerCmd::List { path } => {
for tenant in fs::read_dir(path.join("tenants"))? {
for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? {
let tenant = tenant?;
if !tenant.file_type()?.is_dir() {
continue;
}
println!("tenant {}", tenant.file_name().to_string_lossy());
for timeline in fs::read_dir(tenant.path().join("timelines"))? {
for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? {
let timeline = timeline?;
if !timeline.file_type()?.is_dir() {
continue;
@@ -101,9 +102,9 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
timeline,
} => {
let timeline_path = path
.join("tenants")
.join(TENANTS_SEGMENT_NAME)
.join(tenant)
.join("timelines")
.join(TIMELINES_SEGMENT_NAME)
.join(timeline);
let mut idx = 0;
for layer in fs::read_dir(timeline_path)? {

View File

@@ -25,6 +25,7 @@ use crate::context::RequestContext;
use crate::tenant::Timeline;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::dispatch_pgversion;
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
@@ -323,14 +324,25 @@ where
.timeline
.get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
.await?;
ensure!(img.len() == 512);
ensure!(
img.len()
== dispatch_pgversion!(
self.timeline.pg_version,
pgv::bindings::SIZEOF_RELMAPFILE
)
);
Some(img)
} else {
None
};
if spcnode == GLOBALTABLESPACE_OID {
let pg_version_str = self.timeline.pg_version.to_string();
let pg_version_str = match self.timeline.pg_version {
14 | 15 => self.timeline.pg_version.to_string(),
ver => format!("{ver}\x0A"),
};
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;
@@ -374,7 +386,10 @@ where
if let Some(img) = relmap_img {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
let pg_version_str = self.timeline.pg_version.to_string();
let pg_version_str = match self.timeline.pg_version {
14 | 15 => self.timeline.pg_version.to_string(),
ver => format!("{ver}\x0A"),
};
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;

View File

@@ -2,14 +2,12 @@
use std::env::{var, VarError};
use std::sync::Arc;
use std::time::Duration;
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::deletion_queue::{DeletionQueue, DeletionQueueError};
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
@@ -351,35 +349,6 @@ fn start_pageserver(
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
// Set up deletion queue
let deletion_queue_cancel = tokio_util::sync::CancellationToken::new();
let (deletion_queue, deletion_frontend, deletion_backend, deletion_executor) =
DeletionQueue::new(remote_storage.clone(), conf, deletion_queue_cancel.clone());
if let Some(mut deletion_frontend) = deletion_frontend {
BACKGROUND_RUNTIME.spawn(async move {
deletion_frontend
.background()
.instrument(info_span!(parent:None, "deletion frontend"))
.await
});
}
if let Some(mut deletion_backend) = deletion_backend {
BACKGROUND_RUNTIME.spawn(async move {
deletion_backend
.background()
.instrument(info_span!(parent: None, "deletion backend"))
.await
});
}
if let Some(mut deletion_executor) = deletion_executor {
BACKGROUND_RUNTIME.spawn(async move {
deletion_executor
.background()
.instrument(info_span!(parent: None, "deletion executor"))
.await
});
}
// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.
startup_checkpoint("initial", "Starting loading tenants");
@@ -417,9 +386,9 @@ fn start_pageserver(
TenantSharedResources {
broker_client: broker_client.clone(),
remote_storage: remote_storage.clone(),
deletion_queue_client: deletion_queue.new_client(),
},
order,
shutdown_pageserver.clone(),
))?;
BACKGROUND_RUNTIME.spawn({
@@ -508,17 +477,19 @@ fn start_pageserver(
{
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
let router = http::make_router(
conf,
launch_ts,
http_auth,
broker_client.clone(),
remote_storage,
deletion_queue.clone(),
disk_usage_eviction_state,
)?
.build()
.map_err(|err| anyhow!(err))?;
let router_state = Arc::new(
http::routes::State::new(
conf,
http_auth.clone(),
remote_storage,
broker_client.clone(),
disk_usage_eviction_state,
)
.context("Failed to initialize router state")?,
);
let router = http::make_router(router_state, launch_ts, http_auth.clone())?
.build()
.map_err(|err| anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?
.serve(service)
@@ -637,36 +608,6 @@ fn start_pageserver(
// The plan is to change that over time.
shutdown_pageserver.take();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
// Best effort to persist any outstanding deletions, to avoid leaking objects
let dq = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(async move {
match tokio::time::timeout(Duration::from_secs(5), dq.new_client().flush()).await {
Ok(flush_r) => {
match flush_r {
Ok(()) => {
info!("Deletion queue flushed successfully on shutdown")
}
Err(e) => {
match e {
DeletionQueueError::ShuttingDown => {
// This is not harmful for correctness, but is unexpected: the deletion
// queue's workers should stay alive as long as there are any client handles instantiated.
warn!("Deletion queue stopped prematurely");
}
}
}
}
}
Err(e) => {
warn!("Timed out flushing deletion queue on shutdown ({e})")
}
}
});
// Clean shutdown of deletion queue workers
deletion_queue_cancel.cancel();
unreachable!()
}
})

View File

@@ -32,7 +32,8 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{
TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
TIMELINES_SEGMENT_NAME,
};
use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
@@ -72,7 +73,7 @@ pub mod defaults {
/// Default built-in configuration file.
///
pub const DEFAULT_CONFIG_FILE: &str = formatcp!(
r###"
r#"
# Initial configuration file created by 'pageserver --init'
#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
@@ -117,7 +118,7 @@ pub mod defaults {
[remote_storage]
"###
"#
);
}
@@ -576,28 +577,7 @@ impl PageServerConf {
//
pub fn tenants_path(&self) -> PathBuf {
self.workdir.join("tenants")
}
pub fn deletion_prefix(&self) -> PathBuf {
self.workdir.join("deletion")
}
pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
// Encode a version in the filename, so that if we ever switch away from JSON we can
// increment this.
const VERSION: u8 = 1;
self.deletion_prefix()
.join(format!("{sequence:016x}-{VERSION:02x}.list"))
}
pub fn deletion_header_path(&self) -> PathBuf {
// Encode a version in the filename, so that if we ever switch away from JSON we can
// increment this.
const VERSION: u8 = 1;
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
self.workdir.join(TENANTS_SEGMENT_NAME)
}
pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
@@ -688,26 +668,18 @@ impl PageServerConf {
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
let path = self.pg_distrib_dir.clone();
#[allow(clippy::manual_range_patterns)]
match pg_version {
14 => Ok(path.join(format!("v{pg_version}"))),
15 => Ok(path.join(format!("v{pg_version}"))),
14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
}
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
}
/// Parse a configuration file (pageserver.toml) into a PageServerConf struct,

View File

@@ -0,0 +1,119 @@
use std::collections::HashMap;
use hyper::StatusCode;
use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{
backoff,
generation::Generation,
id::{NodeId, TenantId},
};
use crate::config::PageServerConf;
// Backoffs when control plane requests do not succeed: compromise between reducing load
// on control plane, and retrying frequently when we are blocked on a control plane
// response to make progress.
const BACKOFF_INCREMENT: f64 = 0.1;
const BACKOFF_MAX: f64 = 10.0;
/// The Pageserver's client for using the control plane API: this is a small subset
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
pub(crate) struct ControlPlaneClient {
http_client: reqwest::Client,
base_url: Url,
node_id: NodeId,
cancel: CancellationToken,
}
impl ControlPlaneClient {
/// A None return value indicates that the input `conf` object does not have control
/// plane API enabled.
pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
let mut url = match conf.control_plane_api.as_ref() {
Some(u) => u.clone(),
None => return None,
};
if let Ok(mut segs) = url.path_segments_mut() {
// This ensures that `url` ends with a slash if it doesn't already.
// That way, we can subsequently use join() to safely attach extra path elements.
segs.pop_if_empty().push("");
}
let client = reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client");
Some(Self {
http_client: client,
base_url: url,
node_id: conf.id,
cancel: cancel.clone(),
})
}
async fn try_re_attach(
&self,
url: Url,
request: &ReAttachRequest,
) -> anyhow::Result<ReAttachResponse> {
match self.http_client.post(url).json(request).send().await {
Err(e) => Err(anyhow::Error::from(e)),
Ok(r) => {
if r.status() == StatusCode::OK {
r.json::<ReAttachResponse>()
.await
.map_err(anyhow::Error::from)
} else {
Err(anyhow::anyhow!("Unexpected status {}", r.status()))
}
}
}
}
/// Block until we get a successful response
pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> {
let re_attach_path = self
.base_url
.join("re-attach")
.expect("Failed to build re-attach path");
let request = ReAttachRequest {
node_id: self.node_id,
};
let mut attempt = 0;
loop {
let result = self.try_re_attach(re_attach_path.clone(), &request).await;
match result {
Ok(res) => {
tracing::info!(
"Received re-attach response with {} tenants",
res.tenants.len()
);
return Ok(res
.tenants
.into_iter()
.map(|t| (t.id, Generation::new(t.generation)))
.collect::<HashMap<_, _>>());
}
Err(e) => {
tracing::error!("Error re-attaching tenants, retrying: {e:#}");
backoff::exponential_backoff(
attempt,
BACKOFF_INCREMENT,
BACKOFF_MAX,
&self.cancel,
)
.await;
if self.cancel.is_cancelled() {
return Err(anyhow::anyhow!("Shutting down"));
}
attempt += 1;
}
}
}
}
}

View File

@@ -1,850 +0,0 @@
mod backend;
mod executor;
mod frontend;
use std::collections::HashMap;
use std::path::PathBuf;
use crate::metrics::DELETION_QUEUE_SUBMITTED;
use crate::tenant::remote_timeline_client::remote_timeline_path;
use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize;
use serde::Serialize;
use serde_with::serde_as;
use thiserror::Error;
use tokio;
use tokio_util::sync::CancellationToken;
use tracing::{self, debug, error};
use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
pub(crate) use self::backend::BackendQueueWorker;
use self::executor::ExecutorWorker;
use self::frontend::DeletionOp;
pub(crate) use self::frontend::FrontendQueueWorker;
use backend::BackendQueueMessage;
use executor::ExecutorMessage;
use frontend::FrontendQueueMessage;
use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
// TODO: adminstrative "panic button" config property to disable all deletions
// TODO: configurable for how long to wait before executing deletions
/// We aggregate object deletions from many tenants in one place, for several reasons:
/// - Coalesce deletions into fewer DeleteObjects calls
/// - Enable Tenant/Timeline lifetimes to be shorter than the time it takes
/// to flush any outstanding deletions.
/// - Globally control throughput of deletions, as these are a low priority task: do
/// not compete with the same S3 clients/connections used for higher priority uploads.
/// - Future: enable validating that we may do deletions in a multi-attached scenario,
/// via generation numbers (see https://github.com/neondatabase/neon/pull/4919)
///
/// There are two kinds of deletion: deferred and immediate. A deferred deletion
/// may be intentionally delayed to protect passive readers of S3 data, and may
/// be subject to a generation number validation step. An immediate deletion is
/// ready to execute immediately, and is only queued up so that it can be coalesced
/// with other deletions in flight.
///
/// Deferred deletions pass through three steps:
/// - Frontend: accumulate deletion requests from Timelines, and batch them up into
/// DeletionLists, which are persisted to S3.
/// - Backend: accumulate deletion lists, and validate them en-masse prior to passing
/// the keys in the list onward for actual deletion
/// - Executor: accumulate object keys that the backend has validated for immediate
/// deletion, and execute them in batches of 1000 keys via DeleteObjects.
///
/// Non-deferred deletions, such as during timeline deletion, bypass the first
/// two stages and are passed straight into the Executor.
///
/// Internally, each stage is joined by a channel to the next. In S3, there is only
/// one queue (of DeletionLists), which is written by the frontend and consumed
/// by the backend.
#[derive(Clone)]
pub struct DeletionQueue {
client: DeletionQueueClient,
}
#[derive(Debug)]
struct FlushOp {
tx: tokio::sync::oneshot::Sender<()>,
}
impl FlushOp {
fn fire(self) {
if self.tx.send(()).is_err() {
// oneshot channel closed. This is legal: a client could be destroyed while waiting for a flush.
debug!("deletion queue flush from dropped client");
};
}
}
#[derive(Clone)]
pub struct DeletionQueueClient {
tx: tokio::sync::mpsc::Sender<FrontendQueueMessage>,
executor_tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
}
#[derive(Debug, Serialize, Deserialize)]
struct TenantDeletionList {
/// For each Timeline, a list of key fragments to append to the timeline remote path
/// when reconstructing a full key
timelines: HashMap<TimelineId, Vec<String>>,
/// The generation in which this deletion was emitted: note that this may not be the
/// same as the generation of any layers being deleted. The generation of the layer
/// has already been absorbed into the keys in `objects`
generation: Generation,
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
struct DeletionList {
/// Serialization version, for future use
version: u8,
/// Used for constructing a unique key for each deletion list we write out.
sequence: u64,
/// To avoid repeating tenant/timeline IDs in every key, we store keys in
/// nested HashMaps by TenantTimelineID. Each Tenant only appears once
/// with one unique generation ID: if someone tries to push a second generation
/// ID for the same tenant, we will start a new DeletionList.
tenants: HashMap<TenantId, TenantDeletionList>,
/// Avoid having to walk `tenants` to calculate size
size: usize,
}
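// Illustration of the key-fragment scheme above (the remote path layout shown here is
// an assumption for the example): a full remote key like
//   tenants/<tenant_id>/timelines/<timeline_id>/<layer file name><generation suffix>
// is stored under its tenant/timeline entry as just the trailing
//   <layer file name><generation suffix>
// fragment; `DeletionList::take_paths` re-joins it to the timeline's remote path
// before execution.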
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
struct DeletionHeader {
/// Serialization version, for future use
version: u8,
/// Enable determining the next sequence number even if there are no deletion lists present.
/// If there _are_ deletion lists present, then their sequence numbers take precedence over
/// this.
last_deleted_list_seq: u64,
// TODO: this is where we will track a 'clean' sequence number that indicates all deletion
// lists <= that sequence have had their generations validated with the control plane
// and are OK to execute.
}
impl DeletionHeader {
const VERSION_LATEST: u8 = 1;
fn new(last_deleted_list_seq: u64) -> Self {
Self {
version: Self::VERSION_LATEST,
last_deleted_list_seq,
}
}
}
impl DeletionList {
const VERSION_LATEST: u8 = 1;
fn new(sequence: u64) -> Self {
Self {
version: Self::VERSION_LATEST,
sequence,
tenants: HashMap::new(),
size: 0,
}
}
fn drain(&mut self) -> Self {
let mut tenants = HashMap::new();
std::mem::swap(&mut self.tenants, &mut tenants);
let other = Self {
version: Self::VERSION_LATEST,
sequence: self.sequence,
tenants,
size: self.size,
};
self.size = 0;
other
}
fn is_empty(&self) -> bool {
self.tenants.is_empty()
}
fn len(&self) -> usize {
self.size
}
/// Returns true if the push was accepted, false if the caller must start a new
/// deletion list.
fn push(
&mut self,
tenant: &TenantId,
timeline: &TimelineId,
generation: Generation,
objects: &mut Vec<RemotePath>,
) -> bool {
if objects.is_empty() {
// Avoid inserting an empty timeline entry: this preserves the property
// that if we have no keys, then self.tenants is empty (used in Self::is_empty)
return true;
}
let tenant_entry = self
.tenants
.entry(*tenant)
.or_insert_with(|| TenantDeletionList {
timelines: HashMap::new(),
generation,
});
if tenant_entry.generation != generation {
// Only one generation per tenant per list: signal to
// caller to start a new list.
return false;
}
let timeline_entry = tenant_entry
.timelines
.entry(*timeline)
.or_insert_with(Vec::new);
let timeline_remote_path = remote_timeline_path(tenant, timeline);
self.size += objects.len();
timeline_entry.extend(objects.drain(..).map(|p| {
p.strip_prefix(&timeline_remote_path)
.expect("Timeline paths always start with the timeline prefix")
.to_string_lossy()
.to_string()
}));
true
}
fn take_paths(self) -> Vec<RemotePath> {
let mut result = Vec::new();
for (tenant, tenant_deletions) in self.tenants.into_iter() {
for (timeline, timeline_layers) in tenant_deletions.timelines.into_iter() {
let timeline_remote_path = remote_timeline_path(&tenant, &timeline);
result.extend(
timeline_layers
.into_iter()
.map(|l| timeline_remote_path.join(&PathBuf::from(l))),
);
}
}
result
}
}
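// Sketch of how the frontend is expected to drive DeletionList (hypothetical
// tenant/timeline/generation values; `paths` and `more_paths` are Vec<RemotePath>):
//
//     let mut list = DeletionList::new(1);
//     assert!(list.push(&tenant_id, &timeline_id, Generation::new(1), &mut paths));
//     // A push for the same tenant under a different generation is refused: the
//     // caller must flush this list and retry the push into DeletionList::new(2).
//     assert!(!list.push(&tenant_id, &timeline_id, Generation::new(2), &mut more_paths));
//     // At execution time, full keys are reconstructed from the stored fragments:
//     let objects: Vec<RemotePath> = list.take_paths();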
#[derive(Error, Debug)]
pub enum DeletionQueueError {
#[error("Deletion queue unavailable during shutdown")]
ShuttingDown,
}
impl DeletionQueueClient {
async fn do_push(&self, msg: FrontendQueueMessage) -> Result<(), DeletionQueueError> {
match self.tx.send(msg).await {
Ok(_) => Ok(()),
Err(e) => {
// This shouldn't happen, we should shut down all tenants before
// we shut down the global delete queue. If we encounter a bug like this,
// we may leak objects as deletions won't be processed.
error!("Deletion queue closed while pushing, shutting down? ({e})");
Err(DeletionQueueError::ShuttingDown)
}
}
}
/// Submit a list of layers for deletion: this function returns before the deletion is
/// persistent, but the deletion may be executed at any time after the call: do not push
/// layers until you are sure they can be deleted safely (i.e. remote metadata no longer
/// references them).
pub(crate) async fn push_layers(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
generation: Generation,
layers: Vec<(LayerFileName, Generation)>,
) -> Result<(), DeletionQueueError> {
DELETION_QUEUE_SUBMITTED.inc_by(layers.len() as u64);
self.do_push(FrontendQueueMessage::Delete(DeletionOp {
tenant_id,
timeline_id,
layers,
generation,
objects: Vec::new(),
}))
.await
}
async fn do_flush(
&self,
msg: FrontendQueueMessage,
rx: tokio::sync::oneshot::Receiver<()>,
) -> Result<(), DeletionQueueError> {
self.do_push(msg).await?;
if rx.await.is_err() {
// This shouldn't happen if tenants are shut down before deletion queue. If we
// encounter a bug like this, then a flusher will incorrectly believe it has flushed
// when it hasn't, possibly leading to leaking objects.
error!("Deletion queue dropped flush op while client was still waiting");
Err(DeletionQueueError::ShuttingDown)
} else {
Ok(())
}
}
/// Wait until all previous deletions are persistent (either executed, or written to a DeletionList)
pub async fn flush(&self) -> Result<(), DeletionQueueError> {
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
self.do_flush(FrontendQueueMessage::Flush(FlushOp { tx }), rx)
.await
}
// Wait until all previous deletions are executed
pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
debug!("flush_execute: flushing to deletion lists...");
// Flush any buffered work to deletion lists
self.flush().await?;
// Flush execution of deletion lists
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
debug!("flush_execute: flushing execution...");
self.do_flush(FrontendQueueMessage::FlushExecute(FlushOp { tx }), rx)
.await?;
debug!("flush_execute: finished flushing execution...");
Ok(())
}
/// This interface bypasses the persistent deletion queue, and any validation
/// that this pageserver is still eligible to execute the deletions. It is for
/// use in timeline deletions, where the control plane is telling us we may
/// delete everything in the timeline.
///
/// DO NOT USE THIS FROM GC OR COMPACTION CODE. Use the regular `push_layers`.
pub(crate) async fn push_immediate(
&self,
objects: Vec<RemotePath>,
) -> Result<(), DeletionQueueError> {
self.executor_tx
.send(ExecutorMessage::Delete(objects))
.await
.map_err(|_| DeletionQueueError::ShuttingDown)
}
/// Companion to push_immediate. When this returns Ok, all prior objects sent
/// into push_immediate have been deleted from remote storage.
pub(crate) async fn flush_immediate(&self) -> Result<(), DeletionQueueError> {
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
self.executor_tx
.send(ExecutorMessage::Flush(FlushOp { tx }))
.await
.map_err(|_| DeletionQueueError::ShuttingDown)?;
rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
}
}
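// Sketch of the immediate path used by timeline deletion (it bypasses persistence and
// any generation validation; `objects` is a hypothetical Vec<RemotePath> of full keys):
//
//     client.push_immediate(objects).await?;
//     client.flush_immediate().await?; // everything pushed above is now deleted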
impl DeletionQueue {
pub fn new_client(&self) -> DeletionQueueClient {
self.client.clone()
}
/// Caller may use the returned object to construct clients with new_client.
/// Caller should tokio::spawn the background() members of the three worker objects returned:
/// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
///
/// If remote_storage is None, then the returned workers will also be None.
pub fn new(
remote_storage: Option<GenericRemoteStorage>,
conf: &'static PageServerConf,
cancel: CancellationToken,
) -> (
Self,
Option<FrontendQueueWorker>,
Option<BackendQueueWorker>,
Option<ExecutorWorker>,
) {
// Deep channel: it consumes deletions from all timelines and we do not want to block them
let (tx, rx) = tokio::sync::mpsc::channel(16384);
// Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
// Shallow channel: it carries lists of paths, and we expect the main queueing to
// happen in the backend (persistent), not in this queue.
let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16);
let remote_storage = match remote_storage {
None => {
return (
Self {
client: DeletionQueueClient { tx, executor_tx },
},
None,
None,
None,
)
}
Some(r) => r,
};
(
Self {
client: DeletionQueueClient {
tx,
executor_tx: executor_tx.clone(),
},
},
Some(FrontendQueueWorker::new(
conf,
rx,
backend_tx,
cancel.clone(),
)),
Some(BackendQueueWorker::new(
conf,
backend_rx,
executor_tx,
cancel.clone(),
)),
Some(ExecutorWorker::new(
remote_storage,
executor_rx,
cancel.clone(),
)),
)
}
}
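// Construction sketch, mirroring what the test harness below does (the caller picks
// the runtime and spans; the workers are None when remote storage is not configured):
//
//     let (queue, fe, be, ex) = DeletionQueue::new(Some(remote_storage), conf, cancel);
//     if let Some(mut w) = fe { tokio::spawn(async move { w.background().await }); }
//     if let Some(mut w) = be { tokio::spawn(async move { w.background().await }); }
//     if let Some(mut w) = ex { tokio::spawn(async move { drop(w.background().await) }); }
//     let client = queue.new_client();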
#[cfg(test)]
mod test {
use hex_literal::hex;
use std::{
io::ErrorKind,
path::{Path, PathBuf},
};
use tracing::info;
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
use tokio::{runtime::EnterGuard, task::JoinHandle};
use crate::tenant::{harness::TenantHarness, remote_timeline_client::remote_timeline_path};
use super::*;
pub const TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("11223344556677881122334455667788"));
struct TestSetup {
runtime: &'static tokio::runtime::Runtime,
_entered_runtime: EnterGuard<'static>,
harness: TenantHarness,
remote_fs_dir: PathBuf,
storage: GenericRemoteStorage,
deletion_queue: DeletionQueue,
fe_worker: JoinHandle<()>,
be_worker: JoinHandle<()>,
ex_worker: JoinHandle<()>,
}
impl TestSetup {
/// Simulate a pageserver restart by destroying and recreating the deletion queue
fn restart(&mut self) {
let (deletion_queue, fe_worker, be_worker, ex_worker) = DeletionQueue::new(
Some(self.storage.clone()),
self.harness.conf,
CancellationToken::new(),
);
self.deletion_queue = deletion_queue;
let mut fe_worker = fe_worker.unwrap();
let mut be_worker = be_worker.unwrap();
let mut ex_worker = ex_worker.unwrap();
let mut fe_worker = self
.runtime
.spawn(async move { fe_worker.background().await });
let mut be_worker = self
.runtime
.spawn(async move { be_worker.background().await });
let mut ex_worker = self.runtime.spawn(async move {
drop(ex_worker.background().await);
});
std::mem::swap(&mut self.fe_worker, &mut fe_worker);
std::mem::swap(&mut self.be_worker, &mut be_worker);
std::mem::swap(&mut self.ex_worker, &mut ex_worker);
// Join the old workers
self.runtime.block_on(fe_worker).unwrap();
self.runtime.block_on(be_worker).unwrap();
self.runtime.block_on(ex_worker).unwrap();
}
}
fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
let harness = TenantHarness::create(test_name)?;
// We do not load() the harness: we only need its config and remote_storage
// Set up a GenericRemoteStorage targeting a directory
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
std::fs::create_dir_all(remote_fs_dir)?;
let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
let storage_config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
)
.unwrap(),
max_sync_errors: std::num::NonZeroU32::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
)
.unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
let runtime = Box::leak(Box::new(
tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?,
));
let entered_runtime = runtime.enter();
let (deletion_queue, fe_worker, be_worker, ex_worker) = DeletionQueue::new(
Some(storage.clone()),
harness.conf,
CancellationToken::new(),
);
let mut fe_worker = fe_worker.unwrap();
let mut be_worker = be_worker.unwrap();
let mut ex_worker = ex_worker.unwrap();
let fe_worker_join = runtime.spawn(async move { fe_worker.background().await });
let be_worker_join = runtime.spawn(async move { be_worker.background().await });
let ex_worker_join = runtime.spawn(async move {
drop(ex_worker.background().await);
});
Ok(TestSetup {
runtime,
_entered_runtime: entered_runtime,
harness,
remote_fs_dir,
storage,
deletion_queue,
fe_worker: fe_worker_join,
be_worker: be_worker_join,
ex_worker: ex_worker_join,
})
}
// TODO: put this in a common location so that we can share with remote_timeline_client's tests
fn assert_remote_files(expected: &[&str], remote_path: &Path) {
let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect();
expected.sort();
let mut found: Vec<String> = Vec::new();
let dir = match std::fs::read_dir(remote_path) {
Ok(d) => d,
Err(e) => {
if e.kind() == ErrorKind::NotFound {
if expected.is_empty() {
// We are asserting prefix is empty: it is expected that the dir is missing
return;
} else {
assert_eq!(expected, Vec::<String>::new());
unreachable!();
}
} else {
panic!(
"Unexpected error listing {0}: {e}",
remote_path.to_string_lossy()
);
}
}
};
for entry in dir.flatten() {
let entry_name = entry.file_name();
let fname = entry_name.to_str().unwrap();
found.push(String::from(fname));
}
found.sort();
assert_eq!(expected, found);
}
fn assert_local_files(expected: &[&str], directory: &Path) {
let mut dir = match std::fs::read_dir(directory) {
Ok(d) => d,
Err(_) => {
assert_eq!(expected, &Vec::<String>::new());
return;
}
};
let mut found = Vec::new();
while let Some(dentry) = dir.next() {
let dentry = dentry.unwrap();
let file_name = dentry.file_name();
let file_name_str = file_name.to_string_lossy();
found.push(file_name_str.to_string());
}
found.sort();
assert_eq!(expected, found);
}
#[test]
fn deletion_queue_smoke() -> anyhow::Result<()> {
// Basic test that the deletion queue processes the deletions we pass into it
let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
let client = ctx.deletion_queue.new_client();
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let tenant_id = ctx.harness.tenant_id;
let content: Vec<u8> = "victim1 contents".into();
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
let deletion_prefix = ctx.harness.conf.deletion_prefix();
// Exercise the distinction between the generation of the layers
// we delete, and the generation of the running Tenant.
let layer_generation = Generation::new(0xdeadbeef);
let now_generation = Generation::new(0xfeedbeef);
let remote_layer_file_name_1 =
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
// Inject a victim file to remote storage
info!("Writing");
std::fs::create_dir_all(&remote_timeline_path)?;
std::fs::write(
remote_timeline_path.join(remote_layer_file_name_1.clone()),
content,
)?;
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
// File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
info!("Pushing");
ctx.runtime.block_on(client.push_layers(
tenant_id,
TIMELINE_ID,
now_generation,
[(layer_file_name_1.clone(), layer_generation)].to_vec(),
))?;
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
assert_local_files(&[], &deletion_prefix);
// File should still be there after we write a deletion list (we haven't pushed enough to execute anything)
info!("Flushing");
ctx.runtime.block_on(client.flush())?;
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
assert_local_files(&["0000000000000001-01.list"], &deletion_prefix);
// File should go away when we execute
info!("Flush-executing");
ctx.runtime.block_on(client.flush_execute())?;
assert_remote_files(&[], &remote_timeline_path);
assert_local_files(&["header-01"], &deletion_prefix);
// Flushing on an empty queue should succeed immediately, and not write any lists
info!("Flush-executing on empty");
ctx.runtime.block_on(client.flush_execute())?;
assert_local_files(&["header-01"], &deletion_prefix);
Ok(())
}
#[test]
fn deletion_queue_recovery() -> anyhow::Result<()> {
// Test that the deletion queue recovers persisted deletion lists after a restart
let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
let client = ctx.deletion_queue.new_client();
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let tenant_id = ctx.harness.tenant_id;
let content: Vec<u8> = "victim1 contents".into();
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
let deletion_prefix = ctx.harness.conf.deletion_prefix();
let layer_generation = Generation::new(0xdeadbeef);
let now_generation = Generation::new(0xfeedbeef);
let remote_layer_file_name_1 =
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
// Inject a file, delete it, and flush to a deletion list
std::fs::create_dir_all(&remote_timeline_path)?;
std::fs::write(
remote_timeline_path.join(remote_layer_file_name_1.clone()),
content,
)?;
ctx.runtime.block_on(client.push_layers(
tenant_id,
TIMELINE_ID,
now_generation,
[(layer_file_name_1.clone(), layer_generation)].to_vec(),
))?;
ctx.runtime.block_on(client.flush())?;
assert_local_files(&["0000000000000001-01.list"], &deletion_prefix);
// Restart the deletion queue
drop(client);
ctx.restart();
let client = ctx.deletion_queue.new_client();
// If we have recovered the deletion list properly, then executing after restart should purge it
info!("Flush-executing");
ctx.runtime.block_on(client.flush_execute())?;
assert_remote_files(&[], &remote_timeline_path);
assert_local_files(&["header-01"], &deletion_prefix);
Ok(())
}
}
/// A lightweight queue which can issue ordinary DeletionQueueClient objects, but doesn't do any persistence
/// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it.
#[cfg(test)]
pub mod mock {
use tracing::info;
use crate::tenant::remote_timeline_client::remote_layer_path;
use super::*;
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc,
};
pub struct MockDeletionQueue {
tx: tokio::sync::mpsc::Sender<FrontendQueueMessage>,
executor_tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
tx_pump: tokio::sync::mpsc::Sender<FlushOp>,
executed: Arc<AtomicUsize>,
}
impl MockDeletionQueue {
pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
let (tx, mut rx) = tokio::sync::mpsc::channel(16384);
let (tx_pump, mut rx_pump) = tokio::sync::mpsc::channel::<FlushOp>(1);
let (executor_tx, mut executor_rx) = tokio::sync::mpsc::channel(16384);
let executed = Arc::new(AtomicUsize::new(0));
let executed_bg = executed.clone();
tokio::spawn(async move {
let remote_storage = match &remote_storage {
Some(rs) => rs,
None => {
info!("No remote storage configured, deletion queue will not run");
return;
}
};
info!("Running mock deletion queue");
// Each time we are asked to pump, drain the queue of deletions
while let Some(flush_op) = rx_pump.recv().await {
info!("Executing all pending deletions");
// Drain the executor channel first: execute any immediate deletions
while let Ok(msg) = executor_rx.try_recv() {
match msg {
ExecutorMessage::Delete(objects) => {
for path in objects {
match remote_storage.delete(&path).await {
Ok(_) => {
debug!("Deleted {path}");
}
Err(e) => {
error!(
"Failed to delete {path}, leaking object! ({e})"
);
}
}
executed_bg.fetch_add(1, Ordering::Relaxed);
}
}
ExecutorMessage::Flush(flush_op) => {
flush_op.fire();
}
}
}
while let Ok(msg) = rx.try_recv() {
match msg {
FrontendQueueMessage::Delete(op) => {
let mut objects = op.objects;
for (layer, generation) in op.layers {
objects.push(remote_layer_path(
&op.tenant_id,
&op.timeline_id,
&layer,
generation,
));
}
for path in objects {
info!("Executing deletion {path}");
match remote_storage.delete(&path).await {
Ok(_) => {
debug!("Deleted {path}");
}
Err(e) => {
error!(
"Failed to delete {path}, leaking object! ({e})"
);
}
}
executed_bg.fetch_add(1, Ordering::Relaxed);
}
}
FrontendQueueMessage::Flush(op) => {
op.fire();
}
FrontendQueueMessage::FlushExecute(op) => {
// We have already executed all prior deletions because mock does them inline
op.fire();
}
}
info!("All pending deletions have been executed");
}
flush_op
.tx
.send(())
.expect("Test called flush but dropped before finishing");
}
});
Self {
tx,
tx_pump,
executor_tx,
executed,
}
}
pub fn get_executed(&self) -> usize {
self.executed.load(Ordering::Relaxed)
}
pub async fn pump(&self) {
let (tx, rx) = tokio::sync::oneshot::channel();
self.tx_pump
.send(FlushOp { tx })
.await
.expect("pump called after deletion queue loop stopped");
rx.await
.expect("Mock delete queue shutdown while waiting to pump");
}
pub(crate) fn new_client(&self) -> DeletionQueueClient {
DeletionQueueClient {
tx: self.tx.clone(),
executor_tx: self.executor_tx.clone(),
}
}
}
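// Usage sketch for tests (assumes a `storage: GenericRemoteStorage`, plus a
// tenant/timeline/generation and a layer name in scope; `pump` drives all queued
// deletions to completion):
//
//     let mock = MockDeletionQueue::new(Some(storage));
//     let client = mock.new_client();
//     client.push_layers(tenant_id, timeline_id, generation, vec![(layer_name, generation)]).await?;
//     mock.pump().await;
//     assert_eq!(mock.get_executed(), 1);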
}


@@ -1,300 +0,0 @@
use std::collections::HashMap;
use std::time::Duration;
use futures::future::TryFutureExt;
use pageserver_api::control_api::HexTenantId;
use pageserver_api::control_api::{ValidateRequest, ValidateRequestTenant, ValidateResponse};
use serde::de::DeserializeOwned;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use tracing::info;
use tracing::warn;
use utils::backoff;
use crate::config::PageServerConf;
use crate::metrics::DELETION_QUEUE_ERRORS;
use super::executor::ExecutorMessage;
use super::DeletionHeader;
use super::DeletionList;
use super::DeletionQueueError;
use super::FlushOp;
// After this length of time, execute deletions which are eligible to run,
// even if we haven't accumulated enough for a full-sized DeleteObjects
const EXECUTE_IDLE_DEADLINE: Duration = Duration::from_secs(60);
// If we have received this number of keys, proceed with attempting to execute
const AUTOFLUSH_KEY_COUNT: usize = 16384;
#[derive(Debug)]
pub(super) enum BackendQueueMessage {
Delete(DeletionList),
Flush(FlushOp),
}
pub struct BackendQueueWorker {
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<BackendQueueMessage>,
tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
// Accumulate some lists to execute in a batch.
// The purpose of this accumulation is to implement batched validation of
// attachment generations, when split-brain protection is implemented.
// (see https://github.com/neondatabase/neon/pull/4919)
pending_lists: Vec<DeletionList>,
// Sum of all the lengths of lists in pending_lists
pending_key_count: usize,
// DeletionLists we have fully executed, which may be deleted
// from remote storage.
executed_lists: Vec<DeletionList>,
cancel: CancellationToken,
}
#[derive(thiserror::Error, Debug)]
enum ValidateCallError {
#[error("shutdown")]
Shutdown,
#[error("remote: {0}")]
Remote(reqwest::Error),
}
async fn retry_http_forever<T>(
url: &url::Url,
request: ValidateRequest,
cancel: CancellationToken,
) -> Result<T, DeletionQueueError>
where
T: DeserializeOwned,
{
let client = reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client");
let response = match backoff::retry(
|| {
client
.post(url.clone())
.json(&request)
.send()
.map_err(ValidateCallError::Remote)
},
|_| false,
3,
u32::MAX,
"calling control plane generation validation API",
backoff::Cancel::new(cancel.clone(), || ValidateCallError::Shutdown),
)
.await
{
Err(ValidateCallError::Shutdown) => {
return Err(DeletionQueueError::ShuttingDown);
}
Err(ValidateCallError::Remote(_)) => {
panic!("We retry forever");
}
Ok(r) => r,
};
// TODO: handle non-200 response
// TODO: handle decode error
Ok(response.json::<T>().await.unwrap())
}
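// Shape of the exchange with the control plane, as implied by the request/response
// types used below (field names follow ValidateRequestTenant / ValidateResponse; the
// exact wire encoding of ids and generations is defined in pageserver_api):
//
//     POST {control_plane_api}/validate
//     request:  { "tenants": [ { "id": "<tenant id>", "gen": 5 } ] }
//     response: { "tenants": [ { "id": "<tenant id>", "valid": true } ] }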
impl BackendQueueWorker {
pub(super) fn new(
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<BackendQueueMessage>,
tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
cancel: CancellationToken,
) -> Self {
Self {
conf,
rx,
tx,
pending_lists: Vec::new(),
pending_key_count: 0,
executed_lists: Vec::new(),
cancel,
}
}
async fn cleanup_lists(&mut self) {
debug!(
"cleanup_lists: {0} executed lists, {1} pending lists",
self.executed_lists.len(),
self.pending_lists.len()
);
// Lists are always pushed into the queues + executed list in sequence order, so
// no sort is required: can find the highest sequence number by peeking at last element
let max_executed_seq = match self.executed_lists.last() {
Some(v) => v.sequence,
None => {
// No executed lists, nothing to clean up.
return;
}
};
// In case this is the last list, write a header out first so that
// we don't risk losing our knowledge of the sequence number (on replay, our
// next sequence number is the highest list seen + 1, or read from the header
// if there are no lists)
let header = DeletionHeader::new(max_executed_seq);
debug!("Writing header {:?}", header);
let header_bytes =
serde_json::to_vec(&header).expect("Failed to serialize deletion header");
let header_path = self.conf.deletion_header_path();
if let Err(e) = tokio::fs::write(&header_path, header_bytes).await {
warn!("Failed to upload deletion queue header: {e:#}");
DELETION_QUEUE_ERRORS
.with_label_values(&["put_header"])
.inc();
return;
}
while let Some(list) = self.executed_lists.pop() {
let list_path = self.conf.deletion_list_path(list.sequence);
if let Err(e) = tokio::fs::remove_file(&list_path).await {
// Unexpected: we should have permissions and nothing else should
// be touching these files
tracing::error!("Failed to delete {0}: {e:#}", list_path.display());
self.executed_lists.push(list);
break;
}
}
}
pub async fn validate_lists(&mut self) -> Result<(), DeletionQueueError> {
let control_plane_api = match &self.conf.control_plane_api {
None => {
// Generations are not switched on yet.
return Ok(());
}
Some(api) => api,
};
let validate_path = control_plane_api
.join("validate")
.expect("Failed to build validate path");
for list in &mut self.pending_lists {
let request = ValidateRequest {
tenants: list
.tenants
.iter()
.map(|(tid, tdl)| ValidateRequestTenant {
id: HexTenantId::new(*tid),
gen: tdl.generation.into().expect(
"Generation should always be valid for a Tenant doing deletions",
),
})
.collect(),
};
// Retry forever, we cannot make progress until we get a response
let response: ValidateResponse =
retry_http_forever(&validate_path, request, self.cancel.clone()).await?;
let tenants_valid: HashMap<_, _> = response
.tenants
.into_iter()
.map(|t| (t.id.take(), t.valid))
.collect();
// Filter the list based on whether the server responded valid: true.
// If a tenant is omitted in the response, it has been deleted, and we should
// proceed with deletion.
list.tenants.retain(|tenant_id, _tenant| {
let r = tenants_valid.get(tenant_id).copied().unwrap_or(true);
if !r {
warn!("Dropping stale deletions for tenant {tenant_id}, objects may be leaked");
}
r
});
}
Ok(())
}
pub async fn flush(&mut self) {
// Issue any required generation validation calls to the control plane
if let Err(DeletionQueueError::ShuttingDown) = self.validate_lists().await {
warn!("Shutting down");
return;
}
// Submit all keys from pending DeletionLists into the executor
for list in self.pending_lists.drain(..) {
let objects = list.take_paths();
if let Err(_e) = self.tx.send(ExecutorMessage::Delete(objects)).await {
warn!("Shutting down");
return;
};
}
// Flush the executor to ensure all the operations we just submitted have been executed
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
let flush_op = FlushOp { tx };
if let Err(_e) = self.tx.send(ExecutorMessage::Flush(flush_op)).await {
warn!("Shutting down");
return;
};
if rx.await.is_err() {
warn!("Shutting down");
return;
}
// After flush, we are assured that all contents of the pending lists
// are executed
self.pending_key_count = 0;
self.executed_lists.append(&mut self.pending_lists);
// Erase the lists we executed
self.cleanup_lists().await;
}
pub async fn background(&mut self) {
// TODO: if we would like to be able to defer deletions while a Layer still has
// refs (but it will be eligible for deletion after the process ends), then we may
// add an ephemeral part to BackendQueueMessage::Delete that tracks which keys
// in the deletion list may not be deleted yet, with guards to block on while
// we wait to proceed.
loop {
let msg = match tokio::time::timeout(EXECUTE_IDLE_DEADLINE, self.rx.recv()).await {
Ok(Some(m)) => m,
Ok(None) => {
// All queue senders closed
info!("Shutting down");
break;
}
Err(_) => {
// Timeout: we hit the deadline to execute whatever we have in hand. flush()
// returns immediately if no work is pending.
self.flush().await;
continue;
}
};
match msg {
BackendQueueMessage::Delete(list) => {
self.pending_key_count += list.len();
self.pending_lists.push(list);
if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
self.flush().await;
}
}
BackendQueueMessage::Flush(op) => {
self.flush().await;
op.fire();
}
}
}
}
}


@@ -1,143 +0,0 @@
use remote_storage::GenericRemoteStorage;
use remote_storage::RemotePath;
use remote_storage::MAX_KEYS_PER_DELETE;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use tracing::info;
use tracing::warn;
use crate::metrics::DELETION_QUEUE_ERRORS;
use crate::metrics::DELETION_QUEUE_EXECUTED;
use super::DeletionQueueError;
use super::FlushOp;
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
pub(super) enum ExecutorMessage {
Delete(Vec<RemotePath>),
Flush(FlushOp),
}
/// Non-persistent deletion queue, for coalescing multiple object deletes into
/// larger DeleteObjects requests.
pub struct ExecutorWorker {
// Accumulate up to 1000 keys for the next deletion operation
accumulator: Vec<RemotePath>,
rx: tokio::sync::mpsc::Receiver<ExecutorMessage>,
cancel: CancellationToken,
remote_storage: GenericRemoteStorage,
}
impl ExecutorWorker {
pub(super) fn new(
remote_storage: GenericRemoteStorage,
rx: tokio::sync::mpsc::Receiver<ExecutorMessage>,
cancel: CancellationToken,
) -> Self {
Self {
remote_storage,
rx,
cancel,
accumulator: Vec::new(),
}
}
/// Wrap the remote `delete_objects` with a failpoint
pub async fn remote_delete(&self) -> Result<(), anyhow::Error> {
fail::fail_point!("deletion-queue-before-execute", |_| {
info!("Skipping execution, failpoint set");
DELETION_QUEUE_ERRORS
.with_label_values(&["failpoint"])
.inc();
Err(anyhow::anyhow!("failpoint hit"))
});
self.remote_storage.delete_objects(&self.accumulator).await
}
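// In tests, the failpoint above can be armed to simulate execution failures, e.g.
// (a sketch using the `fail` crate; the "return" action makes the closure above fire):
//
//     fail::cfg("deletion-queue-before-execute", "return").unwrap();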
/// Block until everything in accumulator has been executed
pub async fn flush(&mut self) -> Result<(), DeletionQueueError> {
while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
match self.remote_delete().await {
Ok(()) => {
// Note: we assume that the remote storage layer returns Ok(()) if some
// or all of the deleted objects were already gone.
DELETION_QUEUE_EXECUTED.inc_by(self.accumulator.len() as u64);
info!(
"Executed deletion batch {}..{}",
self.accumulator
.first()
.expect("accumulator should be non-empty"),
self.accumulator
.last()
.expect("accumulator should be non-empty"),
);
self.accumulator.clear();
}
Err(e) => {
warn!("DeleteObjects request failed: {e:#}, will retry");
DELETION_QUEUE_ERRORS.with_label_values(&["execute"]).inc();
}
};
}
if self.cancel.is_cancelled() {
// Expose an error because we may not have actually flushed everything
Err(DeletionQueueError::ShuttingDown)
} else {
Ok(())
}
}
pub async fn background(&mut self) -> Result<(), DeletionQueueError> {
self.accumulator.reserve(MAX_KEYS_PER_DELETE);
loop {
if self.cancel.is_cancelled() {
return Err(DeletionQueueError::ShuttingDown);
}
let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
Ok(Some(m)) => m,
Ok(None) => {
// All queue senders closed
info!("Shutting down");
return Err(DeletionQueueError::ShuttingDown);
}
Err(_) => {
// Timeout: we hit the deadline to execute whatever we have in hand. flush()
// returns immediately if no work is pending.
self.flush().await?;
continue;
}
};
match msg {
ExecutorMessage::Delete(mut list) => {
while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
if self.accumulator.len() == MAX_KEYS_PER_DELETE {
self.flush().await?;
// A successful flush leaves the accumulator empty
assert_eq!(self.accumulator.len(), 0);
}
let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
let take_count = std::cmp::min(available_slots, list.len());
for path in list.drain(list.len() - take_count..) {
self.accumulator.push(path);
}
}
}
ExecutorMessage::Flush(flush_op) => {
// If flush() errors, we drop the flush_op and the caller will get
// an error recv()'ing their oneshot channel.
self.flush().await?;
flush_op.fire();
}
}
}
}
}
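// Worked example of the batching in `background` above, assuming the accumulator
// holds 400 keys, MAX_KEYS_PER_DELETE is 1000 (the DeleteObjects batch size noted in
// the queue's documentation), and a Delete message with 1800 paths arrives:
//   - take 600 paths   -> accumulator is full (1000), 1200 paths remain
//   - flush, take 1000 -> accumulator is full again, 200 paths remain
//   - flush, take 200  -> accumulator holds 200, the inner loop exits
// The trailing 200 keys are deleted on the next flush: either a later message fills
// the accumulator, or AUTOFLUSH_INTERVAL (10s) expires.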


@@ -1,376 +0,0 @@
use super::BackendQueueMessage;
use super::DeletionHeader;
use super::DeletionList;
use super::FlushOp;
use std::fs::create_dir_all;
use std::time::Duration;
use regex::Regex;
use remote_storage::RemotePath;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use tracing::info;
use tracing::warn;
use utils::generation::Generation;
use utils::id::TenantId;
use utils::id::TimelineId;
use crate::config::PageServerConf;
use crate::metrics::DELETION_QUEUE_ERRORS;
use crate::metrics::DELETION_QUEUE_SUBMITTED;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::storage_layer::LayerFileName;
// The number of keys in a DeletionList before we will proactively persist it
// (without reaching a flush deadline). This aims to produce deletion lists on the
// order of 1MB in size when we are under heavy delete load.
const DELETION_LIST_TARGET_SIZE: usize = 16384;
// Ordinarily, we only flush to DeletionList periodically, to bound the window during
// which we might leak objects from not flushing a DeletionList after
// the objects are already unlinked from timeline metadata.
const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
// If someone is waiting for a flush to DeletionList, only delay a little to accumulate
// more objects before doing the flush.
const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
#[derive(Debug)]
pub(super) struct DeletionOp {
pub(super) tenant_id: TenantId,
pub(super) timeline_id: TimelineId,
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
// have a config object handy to project it to a remote key, and need the consuming worker
// to do it for you.
pub(super) layers: Vec<(LayerFileName, Generation)>,
pub(super) objects: Vec<RemotePath>,
/// The _current_ generation of the Tenant attachment in which we are enqueuing
/// this deletion.
pub(super) generation: Generation,
}
#[derive(Debug)]
pub(super) enum FrontendQueueMessage {
Delete(DeletionOp),
// Wait until all prior deletions make it into a persistent DeletionList
Flush(FlushOp),
// Wait until all prior deletions have been executed (i.e. objects are actually deleted)
FlushExecute(FlushOp),
}
pub struct FrontendQueueWorker {
conf: &'static PageServerConf,
// Incoming frontend requests to delete some keys
rx: tokio::sync::mpsc::Receiver<FrontendQueueMessage>,
// Outbound requests to the backend to execute deletion lists we have composed.
tx: tokio::sync::mpsc::Sender<BackendQueueMessage>,
// The list we are currently building, contains a buffer of keys to delete
// and our next sequence number
pending: DeletionList,
// These FlushOps should fire the next time we flush
pending_flushes: Vec<FlushOp>,
// Worker loop is torn down when this fires.
cancel: CancellationToken,
}
impl FrontendQueueWorker {
pub(super) fn new(
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<FrontendQueueMessage>,
tx: tokio::sync::mpsc::Sender<BackendQueueMessage>,
cancel: CancellationToken,
) -> Self {
Self {
pending: DeletionList::new(1),
conf,
rx,
tx,
pending_flushes: Vec::new(),
cancel,
}
}
async fn upload_pending_list(&mut self) -> anyhow::Result<()> {
let path = self.conf.deletion_list_path(self.pending.sequence);
let bytes = serde_json::to_vec(&self.pending).expect("Failed to serialize deletion list");
tokio::fs::write(&path, &bytes).await?;
tokio::fs::File::open(&path).await?.sync_all().await?;
Ok(())
}
/// Try to flush `list` to persistent storage
///
/// This does not return errors, because on failure to flush we do not lose
/// any state: flushing will be retried implicitly on the next deadline
async fn flush(&mut self) {
if self.pending.is_empty() {
for f in self.pending_flushes.drain(..) {
f.fire();
}
return;
}
match self.upload_pending_list().await {
Ok(_) => {
info!(sequence = self.pending.sequence, "Stored deletion list");
for f in self.pending_flushes.drain(..) {
f.fire();
}
let onward_list = self.pending.drain();
// We have consumed out of pending: reset it for the next incoming deletions to accumulate there
self.pending = DeletionList::new(self.pending.sequence + 1);
if let Err(e) = self.tx.send(BackendQueueMessage::Delete(onward_list)).await {
// This is allowed to fail: it will only happen if the backend worker is shut down,
// so we can just drop this on the floor.
info!("Deletion list dropped, this is normal during shutdown ({e:#})");
}
}
Err(e) => {
DELETION_QUEUE_ERRORS.with_label_values(&["put_list"]).inc();
warn!(
sequence = self.pending.sequence,
"Failed to write deletion list to remote storage, will retry later ({e:#})"
);
}
}
}
async fn recover(&mut self) -> Result<(), anyhow::Error> {
// Load header: this is not required to be present, e.g. when a pageserver first runs
let header_path = self.conf.deletion_header_path();
// Synchronous, but we only do it once per process lifetime so it's tolerable
create_dir_all(&self.conf.deletion_prefix())?;
let header_bytes = match tokio::fs::read(&header_path).await {
Ok(h) => Ok(Some(h)),
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
debug!(
"Deletion header {0} not found, first start?",
header_path.display()
);
Ok(None)
} else {
Err(e)
}
}
}?;
if let Some(header_bytes) = header_bytes {
if let Some(header) = match serde_json::from_slice::<DeletionHeader>(&header_bytes) {
Ok(h) => Some(h),
Err(e) => {
warn!(
"Failed to deserialize deletion header, ignoring {0}: {e:#}",
header_path.display()
);
// This should never happen unless we make a mistake with our serialization.
// Ignoring a deletion header is not consequential for correctness because all deletions
// are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
None
}
} {
self.pending.sequence =
std::cmp::max(self.pending.sequence, header.last_deleted_list_seq + 1);
};
};
let mut dir = match tokio::fs::read_dir(&self.conf.deletion_prefix()).await {
Ok(d) => d,
Err(e) => {
warn!(
"Failed to open deletion list directory {0}: {e:#}",
header_path.display()
);
// Give up: if we can't read the deletion list directory, we probably can't
// write lists into it later, so the queue won't work.
return Err(e.into());
}
};
let list_name_pattern = Regex::new(r"([a-zA-Z0-9]{16})-([a-zA-Z0-9]{2})\.list").unwrap();
let mut seqs: Vec<u64> = Vec::new();
while let Some(dentry) = dir.next_entry().await? {
let file_name = dentry.file_name().to_owned();
let basename = file_name.to_string_lossy();
let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
m.get(1)
.expect("Non optional group should be present")
.as_str()
} else {
warn!("Unexpected key in deletion queue: {basename}");
continue;
};
let seq: u64 = match u64::from_str_radix(seq_part, 16) {
Ok(s) => s,
Err(e) => {
warn!("Malformed key '{basename}': {e}");
continue;
}
};
seqs.push(seq);
}
seqs.sort();
// Initialize the next sequence number in the frontend based on the maximum of the highest list we see,
// and the last list that was deleted according to the header. Combined with writing out the header
// prior to deletions, this guarantees no re-use of sequence numbers.
if let Some(max_list_seq) = seqs.last() {
self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
}
for s in seqs {
let list_path = self.conf.deletion_list_path(s);
let list_bytes = tokio::fs::read(&list_path).await?;
let deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
Ok(l) => l,
Err(e) => {
// Drop the list on the floor: any objects it referenced will be left behind
// for scrubbing to clean up. This should never happen unless we have a serialization bug.
warn!(sequence = s, "Failed to deserialize deletion list: {e}");
continue;
}
};
// We will drop out of recovery if this fails: it indicates that we are shutting down
// or the backend has panicked
DELETION_QUEUE_SUBMITTED.inc_by(deletion_list.len() as u64);
self.tx
.send(BackendQueueMessage::Delete(deletion_list))
.await?;
}
info!(next_sequence = self.pending.sequence, "Replay complete");
Ok(())
}
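// Recovery example, following the naming scheme above (16 hex digits of sequence,
// then the two-digit list version, e.g. "0000000000000004-01.list", plus an optional
// "header-01"): if the header records last_deleted_list_seq = 3 and lists with
// sequences 4 and 5 are on disk, both lists are re-submitted to the backend and the
// next list written by this worker uses sequence 6.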
/// This is the front-end ingest, where we bundle up deletion requests into DeletionList
/// and write them out, for later
pub async fn background(&mut self) {
info!("Started deletion frontend worker");
let mut recovered: bool = false;
while !self.cancel.is_cancelled() {
let timeout = if self.pending_flushes.is_empty() {
FRONTEND_DEFAULT_TIMEOUT
} else {
FRONTEND_FLUSHING_TIMEOUT
};
let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
Ok(Some(msg)) => msg,
Ok(None) => {
// Queue sender destroyed, shutting down
break;
}
Err(_) => {
// Hit deadline, flush.
self.flush().await;
continue;
}
};
// On first message, do recovery. This avoids unnecessary recovery very
// early in startup, and simplifies testing by avoiding a 404 reading the
// header on every first pageserver startup.
if !recovered {
// Before accepting any input from this pageserver lifetime, recover all deletion lists that are in S3
if let Err(e) = self.recover().await {
// This should only happen in truly unrecoverable cases, like the recovery finding that the backend
// queue receiver has been dropped.
info!("Deletion queue recover aborted, deletion queue will not proceed ({e})");
return;
} else {
recovered = true;
}
}
match msg {
FrontendQueueMessage::Delete(op) => {
debug!(
"Delete: ingesting {0} layers, {1} other objects",
op.layers.len(),
op.objects.len()
);
let mut layer_paths = Vec::new();
for (layer, generation) in op.layers {
layer_paths.push(remote_layer_path(
&op.tenant_id,
&op.timeline_id,
&layer,
generation,
));
}
layer_paths.extend(op.objects);
if !self.pending.push(
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,
) {
self.flush().await;
let retry = self.pending.push(
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,
);
if !retry {
// Unexpected: after we flush, we should have
// drained self.pending, so a conflict on
// generation numbers should be impossible.
tracing::error!(
"Failed to enqueue deletions, leaking objects. This is a bug."
);
}
}
}
FrontendQueueMessage::Flush(op) => {
if self.pending.is_empty() {
// Execute immediately
debug!("Flush: No pending objects, flushing immediately");
op.fire()
} else {
// Execute next time we flush
debug!("Flush: adding to pending flush list for next deadline flush");
self.pending_flushes.push(op);
}
}
FrontendQueueMessage::FlushExecute(op) => {
debug!("FlushExecute: passing through to backend");
// We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
if let Err(e) = self.tx.send(BackendQueueMessage::Flush(op)).await {
info!("Can't flush, shutting down ({e})");
// Caller will get error when their oneshot sender was dropped.
}
}
}
if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
self.flush().await;
}
}
info!("Deletion queue shut down.");
}
}


@@ -52,29 +52,6 @@ paths:
schema:
type: object
/v1/deletion_queue/flush:
parameters:
- name: execute
in: query
required: false
schema:
type: boolean
description:
If true, attempt to execute deletions. If false, just flush deletions to persistent deletion lists.
put:
description: Execute any deletions currently enqueued
security: []
responses:
"200":
description: |
Flush completed: if execute was true, then enqueued deletions have been completed. If execute was false,
then enqueued deletions have been persisted to deletion lists, and may have been completed.
content:
application/json:
schema:
type: object
/v1/tenant/{tenant_id}:
parameters:
- name: tenant_id


@@ -8,9 +8,10 @@ use anyhow::{anyhow, Context, Result};
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest};
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest,
};
use remote_storage::GenericRemoteStorage;
use storage_broker::BrokerClientChannel;
use tenant_size_model::{SizeResult, StorageModel};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -23,7 +24,6 @@ use super::models::{
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::{DeletionQueue, DeletionQueueError};
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
@@ -54,22 +54,20 @@ use utils::{
// Imports only used for testing APIs
use super::models::ConfigureFailpointsRequest;
struct State {
pub struct State {
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
allowlist_routes: Vec<Uri>,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue: DeletionQueue,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
}
impl State {
fn new(
pub fn new(
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue: DeletionQueue,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
) -> anyhow::Result<Self> {
@@ -83,7 +81,6 @@ impl State {
allowlist_routes,
remote_storage,
broker_client,
deletion_queue,
disk_usage_eviction_state,
})
}
@@ -288,6 +285,8 @@ async fn build_timeline_info_common(
let state = timeline.current_state();
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
let walreceiver_status = timeline.walreceiver_status();
let info = TimelineInfo {
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
@@ -308,6 +307,8 @@ async fn build_timeline_info_common(
pg_version: timeline.pg_version,
state,
walreceiver_status,
};
Ok(info)
}
@@ -489,20 +490,7 @@ async fn tenant_attach_handler(
let state = get_state(&request);
let generation = if state.conf.control_plane_api.is_some() {
// If we have been configured with a control plane URI, then generations are
// mandatory, as we will attempt to re-attach on startup.
maybe_body
.as_ref()
.map(|tar| tar.generation)
.flatten()
.map(|g| Generation::new(g))
.ok_or(ApiError::BadRequest(anyhow!(
"generation attribute missing"
)))?
} else {
Generation::none()
};
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
if let Some(remote_storage) = &state.remote_storage {
mgr::attach_tenant(
@@ -512,7 +500,6 @@ async fn tenant_attach_handler(
tenant_conf,
state.broker_client.clone(),
remote_storage.clone(),
&state.deletion_queue,
&ctx,
)
.instrument(info_span!("tenant_attach", %tenant_id))
@@ -561,7 +548,7 @@ async fn tenant_detach_handler(
}
async fn tenant_load_handler(
request: Request<Body>,
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
@@ -569,13 +556,20 @@ async fn tenant_load_handler(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let maybe_body: Option<TenantLoadRequest> = json_request_or_empty_body(&mut request).await?;
let state = get_state(&request);
// The /load request is only usable when control_plane_api is not set. Once it is set, callers
// should always use /attach instead.
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
mgr::load_tenant(
state.conf,
tenant_id,
generation,
state.broker_client.clone(),
state.remote_storage.clone(),
&state.deletion_queue,
&ctx,
)
.instrument(info_span!("load", %tenant_id))
@@ -875,6 +869,21 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
Ok(response)
}
/// Helper for requests that may take a generation, which is mandatory
/// when control_plane_api is set, but otherwise defaults to Generation::none()
fn get_request_generation(state: &State, req_gen: Option<u32>) -> Result<Generation, ApiError> {
if state.conf.control_plane_api.is_some() {
req_gen
.map(Generation::new)
.ok_or(ApiError::BadRequest(anyhow!(
"generation attribute missing"
)))
} else {
// Legacy mode: all tenants operate with no generation
Ok(Generation::none())
}
}
async fn tenant_create_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
@@ -891,16 +900,12 @@ async fn tenant_create_handler(
let tenant_conf =
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
// TODO: make generation mandatory here once control plane supports it.
let generation = request_data
.generation
.map(|g| Generation::new(g))
.unwrap_or(Generation::none());
let state = get_state(&request);
let generation = get_request_generation(state, request_data.generation)?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let state = get_state(&request);
let new_tenant = mgr::create_tenant(
state.conf,
tenant_conf,
@@ -908,7 +913,6 @@ async fn tenant_create_handler(
generation,
state.broker_client.clone(),
state.remote_storage.clone(),
&state.deletion_queue,
&ctx,
)
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
@@ -1149,48 +1153,6 @@ async fn always_panic_handler(
json_response(StatusCode::NO_CONTENT, ())
}
async fn deletion_queue_flush(
r: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&r);
if state.remote_storage.is_none() {
// Nothing to do if remote storage is disabled.
return json_response(StatusCode::OK, ());
}
let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
let queue_client = state.deletion_queue.new_client();
tokio::select! {
flush_result = async {
if execute {
queue_client.flush_execute().await
} else {
queue_client.flush().await
}
} => {
match flush_result {
Ok(())=> {
json_response(StatusCode::OK, ())
},
Err(e) => {
match e {
DeletionQueueError::ShuttingDown => {
Err(ApiError::ShuttingDown)
}
}
}
}
},
_ = cancel.cancelled() => {
Err(ApiError::ShuttingDown)
}
}
}
async fn disk_usage_eviction_run(
mut r: Request<Body>,
_cancel: CancellationToken,
@@ -1395,13 +1357,9 @@ where
}
pub fn make_router(
conf: &'static PageServerConf,
state: Arc<State>,
launch_ts: &'static LaunchTimestamp,
auth: Option<Arc<JwtAuth>>,
broker_client: BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue: DeletionQueue,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
let spec = include_bytes!("openapi_spec.yml");
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1425,17 +1383,7 @@ pub fn make_router(
);
Ok(router
.data(Arc::new(
State::new(
conf,
auth,
remote_storage,
deletion_queue,
broker_client,
disk_usage_eviction_state,
)
.context("Failed to initialize router state")?,
))
.data(state)
.get("/v1/status", |r| api_handler(r, status_handler))
.put("/v1/failpoints", |r| {
testing_api_handler("manage failpoints", r, failpoints_handler)
@@ -1515,9 +1463,6 @@ pub fn make_router(
.put("/v1/disk_usage_eviction/run", |r| {
api_handler(r, disk_usage_eviction_run)
})
.put("/v1/deletion_queue/flush", |r| {
api_handler(r, deletion_queue_flush)
})
.put("/v1/tenant/:tenant_id/break", |r| {
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
})


@@ -3,7 +3,7 @@ pub mod basebackup;
pub mod config;
pub mod consumption_metrics;
pub mod context;
pub mod deletion_queue;
mod control_plane_client;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;


@@ -537,7 +537,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
30.000, // 30000 ms
];
/// Tracks time taken by fs operations near VirtualFile.
/// VirtualFile fs operation variants.
///
/// Operations:
/// - open ([`std::fs::OpenOptions::open`])
@@ -548,15 +548,66 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
/// - seek (modify internal position or file length query)
/// - fsync ([`std::fs::File::sync_all`])
/// - metadata ([`std::fs::File::metadata`])
pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_io_operations_seconds",
"Time spent in IO operations",
&["operation"],
STORAGE_IO_TIME_BUCKETS.into()
)
.expect("failed to define a metric")
});
#[derive(
Debug, Clone, Copy, strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr,
)]
pub(crate) enum StorageIoOperation {
Open,
Close,
CloseByReplace,
Read,
Write,
Seek,
Fsync,
Metadata,
}
impl StorageIoOperation {
pub fn as_str(&self) -> &'static str {
match self {
StorageIoOperation::Open => "open",
StorageIoOperation::Close => "close",
StorageIoOperation::CloseByReplace => "close-by-replace",
StorageIoOperation::Read => "read",
StorageIoOperation::Write => "write",
StorageIoOperation::Seek => "seek",
StorageIoOperation::Fsync => "fsync",
StorageIoOperation::Metadata => "metadata",
}
}
}
/// Tracks time taken by fs operations near VirtualFile.
#[derive(Debug)]
pub(crate) struct StorageIoTime {
metrics: [Histogram; StorageIoOperation::COUNT],
}
impl StorageIoTime {
fn new() -> Self {
let storage_io_histogram_vec = register_histogram_vec!(
"pageserver_io_operations_seconds",
"Time spent in IO operations",
&["operation"],
STORAGE_IO_TIME_BUCKETS.into()
)
.expect("failed to define a metric");
let metrics = std::array::from_fn(|i| {
let op = StorageIoOperation::from_repr(i).unwrap();
let metric = storage_io_histogram_vec
.get_metric_with_label_values(&[op.as_str()])
.unwrap();
metric
});
Self { metrics }
}
pub(crate) fn get(&self, op: StorageIoOperation) -> &Histogram {
&self.metrics[op as usize]
}
}
pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);
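// Call-site sketch (assumes a std::time::Duration named `elapsed`, measured around
// the underlying fs call):
//
//     STORAGE_IO_TIME_METRIC
//         .get(StorageIoOperation::Write)
//         .observe(elapsed.as_secs_f64());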
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
@@ -795,31 +846,6 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
.expect("failed to define a metric")
});
pub(crate) static DELETION_QUEUE_SUBMITTED: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_deletion_queue_submitted_total",
"Number of objects submitted for deletion"
)
.expect("failed to define a metric")
});
pub(crate) static DELETION_QUEUE_EXECUTED: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_deletion_queue_executed_total",
"Number of objects deleted"
)
.expect("failed to define a metric")
});
pub(crate) static DELETION_QUEUE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_deletion_queue_errors_total",
"Incremented on retryable remote I/O errors writing deletion lists or executing deletions.",
&["op_kind"],
)
.expect("failed to define a metric")
});
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_remote_timeline_client_bytes_started",
@@ -1190,6 +1216,12 @@ impl TimelineMetrics {
),
}
}
pub fn record_new_file_metrics(&self, sz: u64) {
self.resident_physical_size_gauge.add(sz);
self.num_persistent_files_created.inc_by(1);
self.persistent_bytes_written.inc_by(sz);
}
}
impl Drop for TimelineMetrics {


@@ -799,8 +799,9 @@ impl PageCache {
fn new(num_pages: usize) -> Self {
assert!(num_pages > 0, "page cache size must be > 0");
// We use Box::leak here and into_boxed_slice to avoid leaking uninitialized
// memory that Vec's might contain.
// We could use Vec::leak here, but that potentially also leaks
// uninitialized reserved capacity. With into_boxed_slice and Box::leak
// this is avoided.
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;


@@ -469,7 +469,9 @@ impl PageServerHandler {
// Create empty timeline
info!("creating new timeline");
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?;
let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.await?;
// TODO mark timeline as not ready until it reaches end_lsn.
// We might have some wal to import as well, and we should prevent compute


@@ -32,9 +32,7 @@ use std::fmt::Debug;
use std::fmt::Display;
use std::fs;
use std::fs::File;
use std::fs::OpenOptions;
use std::io;
use std::io::Write;
use std::ops::Bound::Included;
use std::path::Path;
use std::path::PathBuf;
@@ -59,7 +57,6 @@ use self::timeline::EvictionTaskTenantState;
use self::timeline::TimelineResources;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT_ACTIVATION;
@@ -69,7 +66,7 @@ use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::metadata::load_metadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
pub use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::storage_layer::DeltaLayer;
use crate::tenant::storage_layer::ImageLayer;
@@ -116,12 +113,11 @@ pub mod block_io;
pub mod disk_btree;
pub(crate) mod ephemeral_file;
pub mod layer_map;
pub mod manifest;
mod span;
pub mod metadata;
mod par_fsync;
pub mod remote_timeline_client;
mod remote_timeline_client;
pub mod storage_layer;
pub mod config;
@@ -145,6 +141,9 @@ pub use crate::tenant::metadata::save_metadata;
// re-export for use in walreceiver
pub use crate::tenant::timeline::WalReceiverInfo;
/// The "tenants" part of `tenants/<tenant>/timelines...`
pub const TENANTS_SEGMENT_NAME: &str = "tenants";
/// Parts of the `.neon/tenants/<tenant_id>/timelines/<timeline_id>` directory prefix.
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
@@ -158,7 +157,6 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
pub struct TenantSharedResources {
pub broker_client: storage_broker::BrokerClientChannel,
pub remote_storage: Option<GenericRemoteStorage>,
pub deletion_queue_client: DeletionQueueClient,
}
///
@@ -182,7 +180,8 @@ pub struct Tenant {
tenant_id: TenantId,
// The remote storage generation, used to protect S3 objects from split-brain
/// The remote storage generation, used to protect S3 objects from split-brain.
/// Does not change over the lifetime of the [`Tenant`] object.
generation: Generation,
timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
@@ -196,10 +195,7 @@ pub struct Tenant {
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
// provides access to timeline data sitting in the remote storage
remote_storage: Option<GenericRemoteStorage>,
// Access to global deletion queue for when this tenant wants to schedule a deletion
deletion_queue_client: Option<DeletionQueueClient>,
pub(crate) remote_storage: Option<GenericRemoteStorage>,
/// Cached logical sizes updated on each [`Tenant::gather_size_inputs`].
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
@@ -411,7 +407,6 @@ impl Tenant {
remote_startup_data: Option<RemoteStartupData>,
local_metadata: Option<TimelineMetadata>,
ancestor: Option<Arc<Timeline>>,
first_save: bool,
init_order: Option<&InitializationOrder>,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
@@ -445,14 +440,9 @@ impl Tenant {
// Save the metadata file to local disk.
if !picked_local {
save_metadata(
self.conf,
&tenant_id,
&timeline_id,
up_to_date_metadata,
first_save,
)
.context("save_metadata")?;
save_metadata(self.conf, &tenant_id, &timeline_id, up_to_date_metadata)
.await
.context("save_metadata")?;
}
let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
@@ -536,7 +526,6 @@ impl Tenant {
broker_client: storage_broker::BrokerClientChannel,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
// TODO dedup with spawn_load
@@ -552,7 +541,6 @@ impl Tenant {
tenant_id,
generation,
Some(remote_storage.clone()),
Some(deletion_queue_client),
));
// Do all the hard work in the background
@@ -738,7 +726,6 @@ impl Tenant {
remote_metadata,
TimelineResources {
remote_client: Some(remote_client),
deletion_queue_client: self.deletion_queue_client.clone(),
},
ctx,
)
@@ -763,7 +750,6 @@ impl Tenant {
timeline_id,
&index_part.metadata,
Some(remote_timeline_client),
self.deletion_queue_client.clone(),
None,
)
.await
@@ -841,7 +827,6 @@ impl Tenant {
}),
local_metadata,
ancestor,
true,
None,
ctx,
)
@@ -866,7 +851,6 @@ impl Tenant {
tenant_id,
Generation::broken(),
None,
None,
))
}
@@ -901,7 +885,6 @@ impl Tenant {
let broker_client = resources.broker_client;
let remote_storage = resources.remote_storage;
let deletion_queue_client = resources.deletion_queue_client;
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
let tenant = Tenant::new(
@@ -912,7 +895,6 @@ impl Tenant {
tenant_id,
generation,
remote_storage.clone(),
Some(deletion_queue_client),
);
let tenant = Arc::new(tenant);
@@ -1320,7 +1302,6 @@ impl Tenant {
timeline_id,
&local_metadata,
Some(remote_client),
self.deletion_queue_client.clone(),
init_order,
)
.await
@@ -1370,7 +1351,6 @@ impl Tenant {
timeline_id,
&local_metadata,
None,
None,
init_order,
)
.await
@@ -1399,7 +1379,6 @@ impl Tenant {
remote_startup_data,
Some(local_metadata),
ancestor,
false,
init_order,
ctx,
)
@@ -1463,7 +1442,7 @@ impl Tenant {
/// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
/// minimum amount of keys required to get a writable timeline.
/// (Without it, `put` might fail due to `repartition` failing.)
pub fn create_empty_timeline(
pub async fn create_empty_timeline(
&self,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
@@ -1475,10 +1454,10 @@ impl Tenant {
"Cannot create empty timelines on inactive tenant"
);
let timelines = self.timelines.lock().unwrap();
let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?;
drop(timelines);
let timeline_uninit_mark = {
let timelines = self.timelines.lock().unwrap();
self.create_timeline_uninit_mark(new_timeline_id, &timelines)?
};
let new_metadata = TimelineMetadata::new(
// Initialize disk_consistent_lsn to 0. The caller must import some data to
// make it valid before calling finish_creation().
@@ -1497,6 +1476,7 @@ impl Tenant {
initdb_lsn,
None,
)
.await
}
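// Illustrative sketch, not part of the diff: the block scope around the uninit-mark
// creation above exists so that the std::sync::MutexGuard on `timelines` is dropped
// before the later `.await`; a guard held across an await point would block other
// tasks and make the future non-Send. A generic version of the pattern:
async fn scoped_guard_example(items: &std::sync::Mutex<Vec<u32>>) {
    let first = {
        let guard = items.lock().unwrap();
        guard.first().copied()
    }; // guard is dropped here, before the await below
    tokio::task::yield_now().await;
    let _ = first;
}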
/// Helper for unit tests to create an empty timeline.
@@ -1512,7 +1492,9 @@ impl Tenant {
pg_version: u32,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
let uninit_tl = self.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)?;
let uninit_tl = self
.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
.await?;
let tline = uninit_tl.raw_timeline().expect("we just created it");
assert_eq!(tline.get_last_record_lsn(), Lsn(0));
@@ -1530,6 +1512,15 @@ impl Tenant {
tline.maybe_spawn_flush_loop();
tline.freeze_and_flush().await.context("freeze_and_flush")?;
// Make sure the freeze_and_flush reaches remote storage.
tline
.remote_client
.as_ref()
.unwrap()
.wait_completion()
.await
.unwrap();
let tl = uninit_tl.finish_creation()?;
// The non-test code would call tl.activate() here.
tl.set_state(TimelineState::Active);
@@ -1706,65 +1697,6 @@ impl Tenant {
Ok(())
}
/// Flush all in-memory data to disk and remote storage, if any.
///
/// Used at graceful shutdown.
async fn freeze_and_flush_on_shutdown(&self) {
let mut js = tokio::task::JoinSet::new();
// execute on each timeline on the JoinSet, join after.
let per_timeline = |timeline_id: TimelineId, timeline: Arc<Timeline>| {
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
match timeline.freeze_and_flush().await {
Ok(()) => {}
Err(e) => {
warn!("failed to freeze and flush: {e:#}");
return;
}
}
let res = if let Some(client) = timeline.remote_client.as_ref() {
// if we did not wait for completion here, it might be that our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.wait_completion().await
} else {
Ok(())
};
if let Err(e) = res {
warn!("failed to await for frozen and flushed uploads: {e:#}");
}
}
.instrument(tracing::info_span!("freeze_and_flush_on_shutdown", %timeline_id))
};
{
let timelines = self.timelines.lock().unwrap();
timelines
.iter()
.map(|(id, tl)| (*id, Arc::clone(tl)))
.for_each(|(timeline_id, timeline)| {
js.spawn(per_timeline(timeline_id, timeline));
})
};
while let Some(res) = js.join_next().await {
match res {
Ok(()) => {}
Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
Err(je) if je.is_panic() => { /* logged already */ }
Err(je) => warn!("unexpected JoinError: {je:?}"),
}
}
}
pub fn current_state(&self) -> TenantState {
self.state.borrow().clone()
}
@@ -1895,19 +1827,22 @@ impl Tenant {
}
};
if freeze_and_flush {
// walreceiver has already begun to shut down with TenantState::Stopping, but we need to
// wait for it to stop.
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_id),
None,
)
.await;
// this will wait for uploads to complete; in the past, it was done outside tenant
// shutdown in pageserver::shutdown_pageserver.
self.freeze_and_flush_on_shutdown().await;
let mut js = tokio::task::JoinSet::new();
{
let timelines = self.timelines.lock().unwrap();
timelines.values().for_each(|timeline| {
let timeline = Arc::clone(timeline);
let span = Span::current();
js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
})
};
while let Some(res) = js.join_next().await {
match res {
Ok(()) => {}
Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
Err(je) if je.is_panic() => { /* logged already */ }
Err(je) => warn!("unexpected JoinError: {je:?}"),
}
}
// shutdown all tenant and timeline tasks: gc, compaction, page service
@@ -2315,16 +2250,7 @@ impl Tenant {
tenant_id: TenantId,
generation: Generation,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: Option<DeletionQueueClient>,
) -> Tenant {
#[cfg(not(test))]
match state {
TenantState::Broken { .. } => {}
_ => {
// Non-broken tenants must be constructed with a deletion queue
assert!(deletion_queue_client.is_some());
}
}
let (state, mut rx) = watch::channel(state);
tokio::spawn(async move {
@@ -2391,7 +2317,6 @@ impl Tenant {
gc_cs: tokio::sync::Mutex::new(()),
walredo_mgr,
remote_storage,
deletion_queue_client,
state,
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
@@ -2444,72 +2369,37 @@ impl Tenant {
Ok(tenant_conf)
}
pub(super) fn persist_tenant_config(
#[tracing::instrument(skip_all, fields(%tenant_id))]
pub(super) async fn persist_tenant_config(
tenant_id: &TenantId,
target_config_path: &Path,
tenant_conf: TenantConfOpt,
creating_tenant: bool,
) -> anyhow::Result<()> {
let _enter = info_span!("saving tenantconf").entered();
// imitate a try-block with a closure
let do_persist = |target_config_path: &Path| -> anyhow::Result<()> {
let target_config_parent = target_config_path.parent().with_context(|| {
format!(
"Config path does not have a parent: {}",
target_config_path.display()
)
})?;
info!("persisting tenantconf to {}", target_config_path.display());
info!("persisting tenantconf to {}", target_config_path.display());
let mut conf_content = r#"# This file contains a specific per-tenant's config.
let mut conf_content = r#"# This file contains a specific per-tenant's config.
# It is read in case of pageserver restart.
[tenant_config]
"#
.to_string();
.to_string();
// Convert the config to a toml file.
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
// Convert the config to a toml file.
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
let mut target_config_file = VirtualFile::open_with_options(
target_config_path,
OpenOptions::new()
.truncate(true) // This needed for overwriting with small config files
.write(true)
.create_new(creating_tenant)
// when creating a new tenant, first_save will be true and `.create(true)` will be
// ignored (per rust std docs).
//
// later when updating the config of created tenant, or persisting config for the
// first time for attached tenant, the `.create(true)` is used.
.create(true),
)?;
let conf_content = conf_content.as_bytes();
target_config_file
.write(conf_content.as_bytes())
.context("write toml bytes into file")
.and_then(|_| target_config_file.sync_all().context("fsync config file"))
.context("write config file")?;
// fsync the parent directory to ensure the directory entry is durable.
// before this was done conditionally on creating_tenant, but these management actions are rare
// enough to just fsync it always.
crashsafe::fsync(target_config_parent)?;
// XXX we're not fsyncing the parent dir, need to do that in case `creating_tenant`
Ok(())
};
// this function is called from creating the tenant and updating the tenant config, which
// would otherwise share this context, so keep it here in one place.
do_persist(target_config_path).with_context(|| {
format!(
"write tenant {tenant_id} config to {}",
target_config_path.display()
)
})
let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX);
VirtualFile::crashsafe_overwrite(target_config_path, &temp_path, conf_content)
.await
.with_context(|| {
format!(
"write tenant {tenant_id} config to {}",
target_config_path.display()
)
})?;
Ok(())
}
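// Illustrative sketch, not part of the diff: the rough shape of the config content
// assembled above, rendered with serde + toml_edit. `ExampleConf` is a hypothetical
// stand-in for TenantConfOpt, which has many more (optional) fields.
#[derive(serde::Serialize)]
struct ExampleConf {
    checkpoint_distance: u64,
}

fn render_tenant_config(conf: &ExampleConf) -> anyhow::Result<String> {
    let mut content = String::from(
        "# This file contains a specific per-tenant's config.\n\
         # It is read in case of pageserver restart.\n\n[tenant_config]\n",
    );
    // toml_edit serializes the struct fields as `key = value` lines, appended under
    // the [tenant_config] table header.
    content += &toml_edit::ser::to_string(conf)?;
    Ok(content)
}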
//
@@ -2820,13 +2710,15 @@ impl Tenant {
src_timeline.pg_version,
);
let uninitialized_timeline = self.prepare_new_timeline(
dst_id,
&metadata,
timeline_uninit_mark,
start_lsn + 1,
Some(Arc::clone(src_timeline)),
)?;
let uninitialized_timeline = self
.prepare_new_timeline(
dst_id,
&metadata,
timeline_uninit_mark,
start_lsn + 1,
Some(Arc::clone(src_timeline)),
)
.await?;
let new_timeline = uninitialized_timeline.finish_creation()?;
@@ -2904,13 +2796,15 @@ impl Tenant {
pgdata_lsn,
pg_version,
);
let raw_timeline = self.prepare_new_timeline(
timeline_id,
&new_metadata,
timeline_uninit_mark,
pgdata_lsn,
None,
)?;
let raw_timeline = self
.prepare_new_timeline(
timeline_id,
&new_metadata,
timeline_uninit_mark,
pgdata_lsn,
None,
)
.await?;
let tenant_id = raw_timeline.owning_tenant.tenant_id;
let unfinished_timeline = raw_timeline.raw_timeline()?;
@@ -2972,10 +2866,7 @@ impl Tenant {
None
};
TimelineResources {
remote_client,
deletion_queue_client: self.deletion_queue_client.clone(),
}
TimelineResources { remote_client }
}
/// Creates intermediate timeline structure and its files.
@@ -2984,7 +2875,7 @@ impl Tenant {
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
/// `finish_creation` to insert the Timeline into the timelines map and to remove the
/// uninit mark file.
fn prepare_new_timeline(
async fn prepare_new_timeline(
&self,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
@@ -3012,8 +2903,9 @@ impl Tenant {
timeline_struct.init_empty_layer_map(start_lsn);
if let Err(e) =
self.create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
if let Err(e) = self
.create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
.await
{
error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}");
cleanup_timeline_directory(uninit_mark);
@@ -3029,7 +2921,7 @@ impl Tenant {
))
}
fn create_timeline_files(
async fn create_timeline_files(
&self,
timeline_path: &Path,
new_timeline_id: &TimelineId,
@@ -3041,14 +2933,9 @@ impl Tenant {
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
});
save_metadata(
self.conf,
&self.tenant_id,
new_timeline_id,
new_metadata,
true,
)
.context("Failed to create timeline metadata")?;
save_metadata(self.conf, &self.tenant_id, new_timeline_id, new_metadata)
.await
.context("Failed to create timeline metadata")?;
Ok(())
}
@@ -3195,7 +3082,7 @@ pub(crate) enum CreateTenantFilesMode {
Attach,
}
pub(crate) fn create_tenant_files(
pub(crate) async fn create_tenant_files(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: &TenantId,
@@ -3231,7 +3118,8 @@ pub(crate) fn create_tenant_files(
mode,
&temporary_tenant_dir,
&target_tenant_directory,
);
)
.await;
if creation_result.is_err() {
error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data");
@@ -3249,7 +3137,7 @@ pub(crate) fn create_tenant_files(
Ok(target_tenant_directory)
}
fn try_create_target_tenant_dir(
async fn try_create_target_tenant_dir(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: &TenantId,
@@ -3288,7 +3176,7 @@ fn try_create_target_tenant_dir(
)
.with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;
Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf, true)?;
Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf).await?;
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
format!(
@@ -3493,6 +3381,8 @@ pub mod harness {
pub tenant_conf: TenantConf,
pub tenant_id: TenantId,
pub generation: Generation,
pub remote_storage: GenericRemoteStorage,
pub remote_fs_dir: PathBuf,
}
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
@@ -3530,30 +3420,39 @@ pub mod harness {
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
fs::create_dir_all(conf.timelines_path(&tenant_id))?;
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
let remote_fs_dir = conf.workdir.join("localfs");
std::fs::create_dir_all(&remote_fs_dir).unwrap();
let config = RemoteStorageConfig {
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
Ok(Self {
conf,
tenant_conf,
tenant_id,
generation: Generation::new(0xdeadbeef),
remote_storage,
remote_fs_dir,
})
}
pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
(
self.try_load(&ctx, None, None)
self.try_load(&ctx)
.await
.expect("failed to load test tenant"),
ctx,
)
}
pub async fn try_load(
&self,
ctx: &RequestContext,
remote_storage: Option<remote_storage::GenericRemoteStorage>,
deletion_queue_client: Option<DeletionQueueClient>,
) -> anyhow::Result<Arc<Tenant>> {
pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
let walredo_mgr = Arc::new(TestRedoManager);
let tenant = Arc::new(Tenant::new(
@@ -3563,8 +3462,7 @@ pub mod harness {
walredo_mgr,
self.tenant_id,
self.generation,
remote_storage,
deletion_queue_client,
Some(self.remote_storage.clone()),
));
tenant
.load(None, ctx)
@@ -3677,7 +3575,10 @@ mod tests {
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) {
match tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
{
Ok(_) => panic!("duplicate timeline creation should fail"),
Err(e) => assert_eq!(
e.to_string(),
@@ -4032,6 +3933,13 @@ mod tests {
.create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), true)
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
.await
.ok()
.unwrap();
}
let (tenant, _ctx) = harness.load().await;
@@ -4065,6 +3973,14 @@ mod tests {
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), true)
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
.await
.ok()
.unwrap();
}
// check that both of them are initially unloaded
@@ -4117,6 +4033,13 @@ mod tests {
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
drop(tline);
// so that all uploads finish & we can call harness.try_load() below again
tenant
.shutdown(Default::default(), true)
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
.await
.ok()
.unwrap();
drop(tenant);
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
@@ -4128,11 +4051,7 @@ mod tests {
metadata_bytes[8] ^= 1;
std::fs::write(metadata_path, metadata_bytes)?;
let err = harness
.try_load(&ctx, None, None)
.await
.err()
.expect("should fail");
let err = harness.try_load(&ctx).await.err().expect("should fail");
// get all the stack with all .context, not only the last one
let message = format!("{err:#}");
let expected = "failed to load metadata";
@@ -4517,8 +4436,9 @@ mod tests {
.await;
let initdb_lsn = Lsn(0x20);
let utline =
tenant.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)?;
let utline = tenant
.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = utline.raw_timeline().unwrap();
// Spawn flush loop now so that we can set the `expect_initdb_optimization`
@@ -4583,9 +4503,15 @@ mod tests {
let harness = TenantHarness::create(name)?;
{
let (tenant, ctx) = harness.load().await;
let tline =
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.await?;
// Keeps uninit mark in place
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown(false)
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
.await;
std::mem::forget(tline);
}

View File

@@ -13,6 +13,7 @@
//!
use crate::page_cache::PAGE_SZ;
use crate::tenant::block_io::BlockCursor;
use crate::virtual_file::VirtualFile;
use std::cmp::min;
use std::io::{Error, ErrorKind};
@@ -83,35 +84,24 @@ impl<'a> BlockCursor<'a> {
}
}
/// A wrapper of `VirtualFile` that allows users to write blobs.
///
/// Abstract trait for a data sink that you can write blobs to.
///
pub trait BlobWriter {
/// Write a blob of data. Returns the offset that it was written to,
/// which can be used to retrieve the data later.
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error>;
}
///
/// An implementation of BlobWriter to write blobs to anything that
/// implements std::io::Write.
///
pub struct WriteBlobWriter<W>
where
W: std::io::Write,
{
inner: W,
/// If a `BlobWriter` is dropped, the internal buffer will be
/// discarded. You need to call [`flush_buffer`](Self::flush_buffer)
/// manually before dropping.
pub struct BlobWriter<const BUFFERED: bool> {
inner: VirtualFile,
offset: u64,
/// A buffer to save on write calls, only used if BUFFERED=true
buf: Vec<u8>,
}
impl<W> WriteBlobWriter<W>
where
W: std::io::Write,
{
pub fn new(inner: W, start_offset: u64) -> Self {
WriteBlobWriter {
impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
pub fn new(inner: VirtualFile, start_offset: u64) -> Self {
Self {
inner,
offset: start_offset,
buf: Vec::with_capacity(Self::CAPACITY),
}
}
@@ -119,28 +109,79 @@ where
self.offset
}
/// Access the underlying Write object.
///
/// NOTE: WriteBlobWriter keeps track of the current write offset. If
/// you write something directly to the inner Write object, it makes the
/// internally tracked 'offset' go out of sync. So don't do that.
pub fn into_inner(self) -> W {
self.inner
}
}
const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
impl<W> BlobWriter for WriteBlobWriter<W>
where
W: std::io::Write,
{
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
#[inline(always)]
/// Writes the given buffer directly to the underlying `VirtualFile`.
/// You need to make sure that the internal buffer is empty, otherwise
/// data will be written in the wrong order.
async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> {
self.inner.write_all(src_buf).await?;
self.offset += src_buf.len() as u64;
Ok(())
}
#[inline(always)]
/// Flushes the internal buffer to the underlying `VirtualFile`.
pub async fn flush_buffer(&mut self) -> Result<(), Error> {
self.inner.write_all(&self.buf).await?;
self.buf.clear();
Ok(())
}
#[inline(always)]
/// Writes as much of `src_buf` into the internal buffer as fits
fn write_into_buffer(&mut self, src_buf: &[u8]) -> usize {
let remaining = Self::CAPACITY - self.buf.len();
let to_copy = src_buf.len().min(remaining);
self.buf.extend_from_slice(&src_buf[..to_copy]);
self.offset += to_copy as u64;
to_copy
}
/// Internal, possibly buffered, write function
async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> {
if !BUFFERED {
assert!(self.buf.is_empty());
self.write_all_unbuffered(src_buf).await?;
return Ok(());
}
let remaining = Self::CAPACITY - self.buf.len();
// First try to copy as much as we can into the buffer
if remaining > 0 {
let copied = self.write_into_buffer(src_buf);
src_buf = &src_buf[copied..];
}
// Then, if the buffer is full, flush it out
if self.buf.len() == Self::CAPACITY {
self.flush_buffer().await?;
}
// Finally, write the tail of src_buf:
// If it wholly fits into the buffer without
// completely filling it, then put it there.
// If not, write it out directly.
if !src_buf.is_empty() {
assert_eq!(self.buf.len(), 0);
if src_buf.len() < Self::CAPACITY {
let copied = self.write_into_buffer(src_buf);
// We just verified above that src_buf fits into our internal buffer.
assert_eq!(copied, src_buf.len());
} else {
self.write_all_unbuffered(src_buf).await?;
}
}
Ok(())
}
/// Write a blob of data. Returns the offset that it was written to,
/// which can be used to retrieve the data later.
pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
let offset = self.offset;
if srcbuf.len() < 128 {
// Short blob. Write a 1-byte length header
let len_buf = srcbuf.len() as u8;
self.inner.write_all(&[len_buf])?;
self.offset += 1;
self.write_all(&[len_buf]).await?;
} else {
// Write a 4-byte length header
if srcbuf.len() > 0x7fff_ffff {
@@ -151,11 +192,153 @@ where
}
let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
len_buf[0] |= 0x80;
self.inner.write_all(&len_buf)?;
self.offset += 4;
self.write_all(&len_buf).await?;
}
self.inner.write_all(srcbuf)?;
self.offset += srcbuf.len() as u64;
self.write_all(srcbuf).await?;
Ok(offset)
}
}
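// Illustrative sketch, not part of the diff: decoding the length header that write_blob
// above produces. A first byte with the top bit clear is a 1-byte length (< 128);
// otherwise the length is the lower 31 bits of a 4-byte big-endian header.
// Returns (blob_len, header_len), or None if the slice is too short.
fn decode_blob_len(buf: &[u8]) -> Option<(usize, usize)> {
    let first = *buf.first()?;
    if first & 0x80 == 0 {
        // Short blob: single-byte length header.
        Some((first as usize, 1))
    } else {
        // Long blob: 4-byte header with the top bit masked off.
        let raw: [u8; 4] = buf.get(..4)?.try_into().ok()?;
        let len = u32::from_be_bytes(raw) & 0x7fff_ffff;
        Some((len as usize, 4))
    }
}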
impl BlobWriter<true> {
/// Access the underlying `VirtualFile`.
///
/// This function flushes the internal buffer before giving access
/// to the underlying `VirtualFile`.
pub async fn into_inner(mut self) -> Result<VirtualFile, Error> {
self.flush_buffer().await?;
Ok(self.inner)
}
/// Access the underlying `VirtualFile`.
///
/// Unlike [`into_inner`](Self::into_inner), this doesn't flush
/// the internal buffer before giving access.
pub fn into_inner_no_flush(self) -> VirtualFile {
self.inner
}
}
impl BlobWriter<false> {
/// Access the underlying `VirtualFile`.
pub fn into_inner(self) -> VirtualFile {
self.inner
}
}
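// Illustrative usage sketch, not part of the diff: with BUFFERED=true the tail of the
// internal buffer must be flushed before the file is read back, either explicitly via
// flush_buffer() or by consuming the writer with into_inner(), which flushes first.
async fn finish_buffered_writer(wtr: BlobWriter<true>) -> Result<VirtualFile, std::io::Error> {
    wtr.into_inner().await
}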
#[cfg(test)]
mod tests {
use super::*;
use crate::tenant::block_io::BlockReaderRef;
use rand::{Rng, SeedableRng};
async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
let temp_dir = tempfile::tempdir()?;
let path = temp_dir.path().join("file");
// Write part (in block to drop the file)
let mut offsets = Vec::new();
{
let file = VirtualFile::create(&path).await?;
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
for blob in blobs.iter() {
let offs = wtr.write_blob(blob).await?;
offsets.push(offs);
}
// Write out one page worth of zeros so that we can
// read again with read_blk
let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?;
println!("Writing final blob at offs={offs}");
wtr.flush_buffer().await?;
}
let file = VirtualFile::open(&path).await?;
let rdr = BlockReaderRef::VirtualFile(&file);
let rdr = BlockCursor::new(rdr);
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
let blob_read = rdr.read_blob(*offset).await?;
assert_eq!(
blob, &blob_read,
"mismatch for idx={idx} at offset={offset}"
);
}
Ok(())
}
fn random_array(len: usize) -> Vec<u8> {
let mut rng = rand::thread_rng();
(0..len).map(|_| rng.gen()).collect::<_>()
}
#[tokio::test]
async fn test_one() -> Result<(), Error> {
let blobs = &[vec![12, 21, 22]];
round_trip_test::<false>(blobs).await?;
round_trip_test::<true>(blobs).await?;
Ok(())
}
#[tokio::test]
async fn test_hello_simple() -> Result<(), Error> {
let blobs = &[
vec![0, 1, 2, 3],
b"Hello, World!".to_vec(),
Vec::new(),
b"foobar".to_vec(),
];
round_trip_test::<false>(blobs).await?;
round_trip_test::<true>(blobs).await?;
Ok(())
}
#[tokio::test]
async fn test_really_big_array() -> Result<(), Error> {
let blobs = &[
b"test".to_vec(),
random_array(10 * PAGE_SZ),
b"foobar".to_vec(),
];
round_trip_test::<false>(blobs).await?;
round_trip_test::<true>(blobs).await?;
Ok(())
}
#[tokio::test]
async fn test_arrays_inc() -> Result<(), Error> {
let blobs = (0..PAGE_SZ / 8)
.map(|v| random_array(v * 16))
.collect::<Vec<_>>();
round_trip_test::<false>(&blobs).await?;
round_trip_test::<true>(&blobs).await?;
Ok(())
}
#[tokio::test]
async fn test_arrays_random_size() -> Result<(), Error> {
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
let blobs = (0..1024)
.map(|_| {
let mut sz: u16 = rng.gen();
// Make 50% of the arrays small
if rng.gen() {
sz &= 63;
}
random_array(sz.into())
})
.collect::<Vec<_>>();
round_trip_test::<false>(&blobs).await?;
round_trip_test::<true>(&blobs).await?;
Ok(())
}
#[tokio::test]
async fn test_arrays_page_boundary() -> Result<(), Error> {
let blobs = &[
random_array(PAGE_SZ - 4),
random_array(PAGE_SZ - 4),
random_array(PAGE_SZ - 4),
];
round_trip_test::<false>(blobs).await?;
round_trip_test::<true>(blobs).await?;
Ok(())
}
}

View File

@@ -7,9 +7,7 @@ use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
use crate::virtual_file::VirtualFile;
use bytes::Bytes;
use std::fs::File;
use std::ops::{Deref, DerefMut};
use std::os::unix::fs::FileExt;
/// This is implemented by anything that can read 8 kB (PAGE_SZ)
/// blocks, using the page cache
@@ -73,12 +71,13 @@ impl<'a> Deref for BlockLease<'a> {
///
/// Unlike a trait, this also allows the read function to be async.
pub(crate) enum BlockReaderRef<'a> {
FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
FileBlockReaderFile(&'a FileBlockReader<std::fs::File>),
FileBlockReader(&'a FileBlockReader),
EphemeralFile(&'a EphemeralFile),
Adapter(Adapter<&'a DeltaLayerInner>),
#[cfg(test)]
TestDisk(&'a super::disk_btree::tests::TestDisk),
#[cfg(test)]
VirtualFile(&'a VirtualFile),
}
impl<'a> BlockReaderRef<'a> {
@@ -86,12 +85,13 @@ impl<'a> BlockReaderRef<'a> {
async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
use BlockReaderRef::*;
match self {
FileBlockReaderVirtual(r) => r.read_blk(blknum).await,
FileBlockReaderFile(r) => r.read_blk(blknum).await,
FileBlockReader(r) => r.read_blk(blknum).await,
EphemeralFile(r) => r.read_blk(blknum).await,
Adapter(r) => r.read_blk(blknum).await,
#[cfg(test)]
TestDisk(r) => r.read_blk(blknum),
#[cfg(test)]
VirtualFile(r) => r.read_blk(blknum).await,
}
}
}
@@ -105,7 +105,7 @@ impl<'a> BlockReaderRef<'a> {
///
/// ```no_run
/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
/// # let reader: FileBlockReader<std::fs::File> = unimplemented!("stub");
/// # let reader: FileBlockReader = unimplemented!("stub");
/// let cursor = reader.block_cursor();
/// let buf = cursor.read_blk(1);
/// // do stuff with 'buf'
@@ -122,9 +122,9 @@ impl<'a> BlockCursor<'a> {
BlockCursor { reader }
}
// Needed by cli
pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader<VirtualFile>) -> Self {
pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
BlockCursor {
reader: BlockReaderRef::FileBlockReaderVirtual(reader),
reader: BlockReaderRef::FileBlockReader(reader),
}
}
@@ -143,27 +143,26 @@ impl<'a> BlockCursor<'a> {
///
/// The file is assumed to be immutable. This doesn't provide any functions
/// for modifying the file, nor for invalidating the cache if it is modified.
pub struct FileBlockReader<F> {
pub file: F,
pub struct FileBlockReader {
pub file: VirtualFile,
/// Unique ID of this file, used as key in the page cache.
file_id: page_cache::FileId,
}
impl<F> FileBlockReader<F>
where
F: FileExt,
{
pub fn new(file: F) -> Self {
impl FileBlockReader {
pub fn new(file: VirtualFile) -> Self {
let file_id = page_cache::next_file_id();
FileBlockReader { file_id, file }
}
/// Read a page from the underlying file into given buffer.
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
assert!(buf.len() == PAGE_SZ);
self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
self.file
.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
.await
}
/// Read a block.
///
@@ -185,7 +184,7 @@ where
ReadBufResult::Found(guard) => break Ok(guard.into()),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
self.fill_buffer(write_guard.deref_mut(), blknum)?;
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
write_guard.mark_valid();
// Swap for read lock
@@ -196,15 +195,9 @@ where
}
}
impl BlockReader for FileBlockReader<File> {
impl BlockReader for FileBlockReader {
fn block_cursor(&self) -> BlockCursor<'_> {
BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
}
}
impl BlockReader for FileBlockReader<VirtualFile> {
fn block_cursor(&self) -> BlockCursor<'_> {
BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
BlockCursor::new(BlockReaderRef::FileBlockReader(self))
}
}
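// Illustrative usage sketch, not part of the diff: FileBlockReader now wraps a
// VirtualFile directly and read_blk is async, so callers await it through a cursor.
async fn read_first_block(path: &std::path::Path) -> Result<(), std::io::Error> {
    let file = VirtualFile::open(path).await?;
    let reader = FileBlockReader::new(file);
    let cursor = reader.block_cursor();
    let _block = cursor.read_blk(0).await?;
    Ok(())
}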

View File

@@ -9,7 +9,6 @@ use std::cmp::min;
use std::fs::OpenOptions;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::os::unix::prelude::FileExt;
use std::path::PathBuf;
use std::sync::atomic::AtomicU64;
use tracing::*;
@@ -29,7 +28,7 @@ pub struct EphemeralFile {
}
impl EphemeralFile {
pub fn create(
pub async fn create(
conf: &PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -45,7 +44,8 @@ impl EphemeralFile {
let file = VirtualFile::open_with_options(
&filename,
OpenOptions::new().read(true).write(true).create(true),
)?;
)
.await?;
Ok(EphemeralFile {
page_cache_file_id: page_cache::next_file_id(),
@@ -88,7 +88,8 @@ impl EphemeralFile {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
.await?;
write_guard.mark_valid();
// Swap for read lock
@@ -128,10 +129,15 @@ impl EphemeralFile {
self.off += n;
src_remaining = &src_remaining[n..];
if self.off == PAGE_SZ {
match self.ephemeral_file.file.write_all_at(
&self.ephemeral_file.mutable_tail,
self.blknum as u64 * PAGE_SZ as u64,
) {
match self
.ephemeral_file
.file
.write_all_at(
&self.ephemeral_file.mutable_tail,
self.blknum as u64 * PAGE_SZ as u64,
)
.await
{
Ok(_) => {
// Pre-warm the page cache with what we just wrote.
// This isn't necessary for coherency/correctness, but it's how we've always done it.
@@ -281,7 +287,7 @@ mod tests {
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
let pos_foo = file.write_blob(b"foo").await?;
assert_eq!(

View File

@@ -1,325 +0,0 @@
//! This module contains the encoding and decoding of the local manifest file.
//!
//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
//! records the state of the storage engine. It contains a snapshot of the
//! state and all operations proceeding that snapshot. The file begins with a
//! header recording MANIFEST version number. After that, it contains a snapshot.
//! The snapshot is followed by a list of operations. Each operation is a list
//! of records. Each record is either an addition or a removal of a layer.
//!
//! With MANIFEST, we can:
//!
//! 1. recover state quickly by reading the file, potentially boosting the
//! startup speed.
//! 2. ensure all operations are atomic and avoid corruption, solving issues
//! like redundant image layers, and preparing us for future compaction
//! strategies.
//!
//! There is also a format for storing all layer files on S3, called
//! `index_part.json`. Compared with index_part, MANIFEST is a WAL which
//! records all operations as logs, and therefore we can easily replay the
//! operations when recovering from a crash, while ensuring those operations
//! are atomic upon restart.
//!
//! Currently, this is not used in the system. Future refactors will ensure
//! the storage state will be recorded in this file, and the system can be
//! recovered from this file. This is tracked in
//! <https://github.com/neondatabase/neon/issues/4418>
use std::io::{self, Read, Write};
use crate::virtual_file::VirtualFile;
use anyhow::Result;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use crc32c::crc32c;
use serde::{Deserialize, Serialize};
use tracing::log::warn;
use utils::lsn::Lsn;
use super::storage_layer::PersistentLayerDesc;
pub struct Manifest {
file: VirtualFile,
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub struct Snapshot {
pub layers: Vec<PersistentLayerDesc>,
}
/// serde by default encodes this as an externally tagged enum, and therefore it will be something
/// like `{ "AddLayer": { ... } }`.
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub enum Record {
AddLayer(PersistentLayerDesc),
RemoveLayer(PersistentLayerDesc),
}
/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
const MANIFEST_VERSION: u64 = 1;
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub struct ManifestHeader {
magic_number: u64,
version: u64,
}
const MANIFEST_HEADER_LEN: usize = 16;
impl ManifestHeader {
fn encode(&self) -> BytesMut {
let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
buf.put_u64(self.magic_number);
buf.put_u64(self.version);
buf
}
fn decode(mut buf: &[u8]) -> Self {
assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
Self {
magic_number: buf.get_u64(),
version: buf.get_u64(),
}
}
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub enum Operation {
/// A snapshot of the current state.
///
/// Lsn field represents the LSN that is persisted to disk for this snapshot.
Snapshot(Snapshot, Lsn),
/// An atomic operation that changes the state.
///
/// Lsn field represents the LSN that is persisted to disk after the operation is done.
/// This will only change when a new L0 layer is flushed to disk.
Operation(Vec<Record>, Lsn),
}
struct RecordHeader {
size: u32,
checksum: u32,
}
const RECORD_HEADER_LEN: usize = 8;
impl RecordHeader {
fn encode(&self) -> BytesMut {
let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN);
buf.put_u32(self.size);
buf.put_u32(self.checksum);
buf
}
fn decode(mut buf: &[u8]) -> Self {
assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
Self {
size: buf.get_u32(),
checksum: buf.get_u32(),
}
}
}
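// Illustrative sketch, not part of the removed file: the on-disk framing that
// append_data below produces for each record: an 8-byte RecordHeader (big-endian
// size and crc32c checksum) followed by the JSON-encoded operation bytes.
fn frame_record(data: &[u8]) -> BytesMut {
    let header = RecordHeader {
        size: data.len() as u32,
        checksum: crc32c(data),
    };
    let mut framed = header.encode();
    framed.extend_from_slice(data);
    framed
}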
#[derive(Debug, thiserror::Error)]
pub enum ManifestLoadError {
#[error("manifest header is corrupted")]
CorruptedManifestHeader,
#[error("unsupported manifest version: got {0}, expected {1}")]
UnsupportedVersion(u64, u64),
#[error("error when decoding record: {0}")]
DecodeRecord(serde_json::Error),
#[error("I/O error: {0}")]
Io(io::Error),
}
#[must_use = "Should check if the manifest is partially corrupted"]
pub struct ManifestPartiallyCorrupted(bool);
impl Manifest {
/// Create a new manifest by writing the manifest header and a snapshot record to the given file.
pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
let mut manifest = Self { file };
manifest.append_manifest_header(ManifestHeader {
magic_number: MANIFEST_MAGIC_NUMBER,
version: MANIFEST_VERSION,
})?;
manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
Ok(manifest)
}
/// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
/// the bool flag will be set to true and the caller is responsible for reconstructing a new manifest
/// and backing up the current one.
pub fn load(
mut file: VirtualFile,
) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
let mut buf = vec![];
file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
// Read manifest header
let mut buf = Bytes::from(buf);
if buf.remaining() < MANIFEST_HEADER_LEN {
return Err(ManifestLoadError::CorruptedManifestHeader);
}
let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
buf.advance(MANIFEST_HEADER_LEN);
if header.version != MANIFEST_VERSION {
return Err(ManifestLoadError::UnsupportedVersion(
header.version,
MANIFEST_VERSION,
));
}
// Read operations
let mut operations = Vec::new();
let corrupted = loop {
if buf.remaining() == 0 {
break false;
}
if buf.remaining() < RECORD_HEADER_LEN {
warn!("incomplete header when decoding manifest, could be corrupted");
break true;
}
let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
let size = size as usize;
buf.advance(RECORD_HEADER_LEN);
if buf.remaining() < size {
warn!("incomplete data when decoding manifest, could be corrupted");
break true;
}
let data = &buf[..size];
if crc32c(data) != checksum {
warn!("checksum mismatch when decoding manifest, could be corrupted");
break true;
}
// if the following decode fails, we cannot use the manifest or safely ignore any record.
operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
buf.advance(size);
};
Ok((
Self { file },
operations,
ManifestPartiallyCorrupted(corrupted),
))
}
fn append_data(&mut self, data: &[u8]) -> Result<()> {
if data.len() >= u32::MAX as usize {
panic!("data too large");
}
let header = RecordHeader {
size: data.len() as u32,
checksum: crc32c(data),
};
let header = header.encode();
self.file.write_all(&header)?;
self.file.write_all(data)?;
self.file.sync_all()?;
Ok(())
}
fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
let encoded = header.encode();
self.file.write_all(&encoded)?;
Ok(())
}
/// Add an operation to the manifest. The operation will be appended to the end of the file,
/// and the file will be fsynced.
pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
let encoded = Vec::from(serde_json::to_string(&operation)?);
self.append_data(&encoded)
}
}
#[cfg(test)]
mod tests {
use std::fs::OpenOptions;
use crate::repository::Key;
use super::*;
#[test]
fn test_read_manifest() {
let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
std::fs::create_dir_all(&testdir).unwrap();
let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
// Write a manifest with a snapshot and some operations
let snapshot = Snapshot {
layers: vec![layer1, layer2],
};
let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
manifest
.append_operation(Operation::Operation(
vec![Record::AddLayer(layer3.clone())],
Lsn::from(1),
))
.unwrap();
drop(manifest);
// Open the second time and write
let file = VirtualFile::open_with_options(
&testdir.join("MANIFEST"),
OpenOptions::new()
.read(true)
.write(true)
.create_new(false)
.truncate(false),
)
.unwrap();
let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 2);
assert_eq!(
&operations[0],
&Operation::Snapshot(snapshot.clone(), Lsn::from(0))
);
assert_eq!(
&operations[1],
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
);
manifest
.append_operation(Operation::Operation(
vec![
Record::RemoveLayer(layer3.clone()),
Record::AddLayer(layer4.clone()),
],
Lsn::from(2),
))
.unwrap();
drop(manifest);
// Open the third time and verify
let file = VirtualFile::open_with_options(
&testdir.join("MANIFEST"),
OpenOptions::new()
.read(true)
.write(true)
.create_new(false)
.truncate(false),
)
.unwrap();
let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 3);
assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
assert_eq!(
&operations[1],
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
);
assert_eq!(
&operations[2],
&Operation::Operation(
vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
Lsn::from(2)
)
);
}
}

View File

@@ -8,14 +8,13 @@
//!
//! [`remote_timeline_client`]: super::remote_timeline_client
use std::fs::{File, OpenOptions};
use std::io::{self, Write};
use std::io::{self};
use anyhow::{bail, ensure, Context};
use anyhow::{ensure, Context};
use serde::{de::Error, Deserialize, Serialize, Serializer};
use thiserror::Error;
use tracing::info_span;
use utils::bin_ser::SerializeError;
use utils::crashsafe::path_with_suffix_extension;
use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},
@@ -24,6 +23,7 @@ use utils::{
use crate::config::PageServerConf;
use crate::virtual_file::VirtualFile;
use crate::TEMP_FILE_SUFFIX;
/// Use special format number to enable backward compatibility.
const METADATA_FORMAT_VERSION: u16 = 4;
@@ -230,6 +230,23 @@ impl TimelineMetadata {
pub fn pg_version(&self) -> u32 {
self.body.pg_version
}
// Checksums make it awkward to build a valid instance by hand. This helper
// provides a TimelineMetadata with a valid checksum in its header.
#[cfg(test)]
pub fn example() -> Self {
let instance = Self::new(
"0/16960E8".parse::<Lsn>().unwrap(),
None,
None,
Lsn::from_hex("00000000").unwrap(),
Lsn::from_hex("00000000").unwrap(),
Lsn::from_hex("00000000").unwrap(),
0,
);
let bytes = instance.to_bytes().unwrap();
Self::from_bytes(&bytes).unwrap()
}
}
impl<'de> Deserialize<'de> for TimelineMetadata {
@@ -255,38 +272,19 @@ impl Serialize for TimelineMetadata {
}
/// Save timeline metadata to file
pub fn save_metadata(
#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))]
pub async fn save_metadata(
conf: &'static PageServerConf,
tenant_id: &TenantId,
timeline_id: &TimelineId,
data: &TimelineMetadata,
first_save: bool,
) -> anyhow::Result<()> {
let _enter = info_span!("saving metadata").entered();
let path = conf.metadata_path(tenant_id, timeline_id);
// use OpenOptions to ensure file presence is consistent with first_save
let mut file = VirtualFile::open_with_options(
&path,
OpenOptions::new().write(true).create_new(first_save),
)
.context("open_with_options")?;
let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
if file.write(&metadata_bytes)? != metadata_bytes.len() {
bail!("Could not write all the metadata bytes in a single call");
}
file.sync_all()?;
// fsync the parent directory to ensure the directory entry is durable
if first_save {
let timeline_dir = File::open(
path.parent()
.expect("Metadata should always have a parent dir"),
)?;
timeline_dir.sync_all()?;
}
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
.await
.context("write metadata")?;
Ok(())
}
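// Illustrative sketch, not part of the diff: the general temp-file-then-rename pattern
// that a crash-safe overwrite such as VirtualFile::crashsafe_overwrite is expected to
// follow (the exact implementation is not shown in this diff). Synchronous std version:
fn crashsafe_overwrite_sketch(
    final_path: &std::path::Path,
    tmp_path: &std::path::Path,
    content: &[u8],
) -> std::io::Result<()> {
    use std::io::Write;
    // 1. Write the full content to a temporary file and fsync it.
    let mut tmp = std::fs::File::create(tmp_path)?;
    tmp.write_all(content)?;
    tmp.sync_all()?;
    // 2. Atomically rename the temporary file over the final path.
    std::fs::rename(tmp_path, final_path)?;
    // 3. fsync the parent directory so the rename itself is durable.
    if let Some(parent) = final_path.parent() {
        std::fs::File::open(parent)?.sync_all()?;
    }
    Ok(())
}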

View File

@@ -1,19 +1,18 @@
//! This module acts as a switchboard to access different repositories managed by this
//! page server.
use hyper::StatusCode;
use pageserver_api::control_api::{HexTenantId, ReAttachRequest, ReAttachResponse};
use rand::{distributions::Alphanumeric, Rng};
use std::collections::{hash_map, HashMap};
use std::ffi::OsStr;
use std::path::Path;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
use tokio::fs;
use anyhow::Context;
use once_cell::sync::Lazy;
use tokio::sync::RwLock;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::*;
use remote_storage::GenericRemoteStorage;
@@ -21,13 +20,14 @@ use utils::crashsafe;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueue;
use crate::control_plane_client::ControlPlaneClient;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
use utils::crashsafe::path_with_suffix_extension;
use utils::fs_ext::PathExt;
use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
@@ -64,6 +64,39 @@ impl TenantsMap {
}
}
/// This is "safe" in that that it won't leave behind a partially deleted directory
/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
/// the contents.
///
/// This is pageserver-specific, as it relies on future processes after a crash to check
/// for TEMP_FILE_SUFFIX when loading things.
async fn safe_remove_tenant_dir_all(path: impl AsRef<Path>) -> std::io::Result<()> {
let tmp_path = safe_rename_tenant_dir(path).await?;
fs::remove_dir_all(tmp_path).await
}
async fn safe_rename_tenant_dir(path: impl AsRef<Path>) -> std::io::Result<PathBuf> {
let parent = path
.as_ref()
.parent()
// It is invalid to call this function with a relative path. Tenant directories
// should always have a parent.
.ok_or(std::io::Error::new(
std::io::ErrorKind::InvalidInput,
"Path must be absolute",
))?;
let rand_suffix = rand::thread_rng()
.sample_iter(&Alphanumeric)
.take(8)
.map(char::from)
.collect::<String>()
+ TEMP_FILE_SUFFIX;
let tmp_path = path_with_suffix_extension(&path, &rand_suffix);
fs::rename(&path, &tmp_path).await?;
fs::File::open(parent).await?.sync_all().await?;
Ok(tmp_path)
}
static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));
/// Initialize repositories with locally available timelines.
@@ -74,82 +107,19 @@ pub async fn init_tenant_mgr(
conf: &'static PageServerConf,
resources: TenantSharedResources,
init_order: InitializationOrder,
cancel: CancellationToken,
) -> anyhow::Result<()> {
// Scan local filesystem for attached tenants
let tenants_dir = conf.tenants_path();
let mut tenants = HashMap::new();
// If we are configured to use the control plane API, then it is the source of truth for what to attach
let tenant_generations = conf
.control_plane_api
.as_ref()
.map(|control_plane_api| async {
let client = reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client");
// FIXME: it's awkward that join() requires the base to have a trailing slash, makes
// it easy to get a config wrong
assert!(
control_plane_api.as_str().ends_with("/"),
"control plane API needs trailing slash"
);
let re_attach_path = control_plane_api
.join("re-attach")
.expect("Failed to build re-attach path");
let request = ReAttachRequest { node_id: conf.id };
// TODO: we should have been passed a cancellation token, and use it to end
// this loop gracefully
loop {
let response = match client
.post(re_attach_path.clone())
.json(&request)
.send()
.await
{
Err(e) => Err(anyhow::Error::from(e)),
Ok(r) => {
if r.status() == StatusCode::OK {
r.json::<ReAttachResponse>()
.await
.map_err(|e| anyhow::Error::from(e))
} else {
Err(anyhow::anyhow!("Unexpected status {}", r.status()))
}
}
};
match response {
Ok(res) => {
tracing::info!(
"Received re-attach response with {0} tenants",
res.tenants.len()
);
// TODO: do something with it
break res
.tenants
.into_iter()
.map(|t| (t.id, t.generation))
.collect::<HashMap<_, _>>();
}
Err(e) => {
tracing::error!("Error re-attaching tenants, retrying: {e:#}");
tokio::time::sleep(Duration::from_secs(1)).await;
}
}
}
});
let tenant_generations = match tenant_generations {
Some(g) => Some(g.await),
None => {
info!("Control plane API not configured, tenant generations are disabled");
None
}
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
Some(client.re_attach().await?)
} else {
info!("Control plane API not configured, tenant generations are disabled");
None
};
let mut dir_entries = fs::read_dir(&tenants_dir)
@@ -168,6 +138,8 @@ pub async fn init_tenant_mgr(
"Found temporary tenant directory, removing: {}",
tenant_dir_path.display()
);
// No need to use safe_remove_tenant_dir_all because this is already
// a temporary path
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
error!(
"Failed to remove temporary directory '{}': {:?}",
@@ -208,7 +180,7 @@ pub async fn init_tenant_mgr(
Ok(id) => id,
Err(_) => {
warn!(
"Invalid tenant path (garbage in our repo directory?): {0}",
"Invalid tenant path (garbage in our repo directory?): {}",
tenant_dir_path.display()
);
continue;
@@ -218,11 +190,11 @@ pub async fn init_tenant_mgr(
let generation = if let Some(generations) = &tenant_generations {
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
if let Some(gen) = generations.get(&HexTenantId::new(tenant_id)) {
Generation::new(*gen)
if let Some(gen) = generations.get(&tenant_id) {
*gen
} else {
info!("Detaching tenant {0}, control plane omitted it in re-attach response", tenant_id);
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response");
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
error!(
"Failed to remove detached tenant directory '{}': {:?}",
tenant_dir_path.display(),
@@ -235,7 +207,7 @@ pub async fn init_tenant_mgr(
// Legacy mode: no generation information, any tenant present
// on local disk may activate
info!(
"Starting tenant {0} in legacy mode, no generation",
"Starting tenant {} in legacy mode, no generation",
tenant_dir_path.display()
);
Generation::none()
@@ -279,6 +251,7 @@ pub async fn init_tenant_mgr(
Ok(())
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn schedule_local_tenant_processing(
conf: &'static PageServerConf,
tenant_id: TenantId,
@@ -320,7 +293,6 @@ pub(crate) fn schedule_local_tenant_processing(
resources.broker_client,
tenants,
remote_storage,
resources.deletion_queue_client,
ctx,
) {
Ok(tenant) => tenant,
@@ -468,21 +440,19 @@ pub async fn create_tenant(
generation: Generation,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue: &DeletionQueue,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, TenantMapInsertError> {
tenant_map_insert(tenant_id, || {
tenant_map_insert(tenant_id, || async {
// We're holding the tenants lock in write mode while doing local IO.
// If this section ever becomes contentious, introduce a new `TenantState::Creating`
// and do the work in that state.
let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create)?;
let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
let tenant_resources = TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client: deletion_queue.new_client(),
};
let created_tenant =
schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
@@ -516,7 +486,8 @@ pub async fn set_new_tenant_config(
let tenant = get_tenant(tenant_id, true).await?;
let tenant_config_path = conf.tenant_config_path(&tenant_id);
Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf, false)
Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf)
.await
.map_err(SetNewTenantConfigError::Persist)?;
tenant.set_new_tenant_config(new_tenant_conf);
Ok(())
@@ -532,6 +503,8 @@ pub enum GetTenantError {
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
///
/// This method is cancel-safe.
pub async fn get_tenant(
tenant_id: TenantId,
active_only: bool,
@@ -591,7 +564,24 @@ pub async fn detach_tenant(
tenant_id: TenantId,
detach_ignored: bool,
) -> Result<(), TenantStateError> {
detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await
let tmp_path = detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await?;
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
let task_tenant_id = None;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
task_tenant_id,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
Ok(())
}
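The hunk above switches detach to a rename-then-background-delete: the tenant directory is renamed out of the way synchronously, and the actual removal happens in a spawned task that is not tied to the tenant's in-memory lifetime. A minimal standalone sketch of that pattern, using plain `tokio` instead of `task_mgr` and `safe_rename_tenant_dir` (the `detach_dir` name, the `___deleted` suffix, and the tokio/anyhow dependencies are assumptions for illustration):

```rust
use std::path::PathBuf;

// Rename first (cheap, atomic on the same filesystem), then delete in the background.
async fn detach_dir(dir: PathBuf) -> anyhow::Result<()> {
    let tmp = dir.with_extension("___deleted");
    tokio::fs::rename(&dir, &tmp).await?;
    tokio::spawn(async move {
        // The caller does not wait for this; failures are only logged.
        if let Err(e) = tokio::fs::remove_dir_all(&tmp).await {
            eprintln!("background removal of {tmp:?} failed: {e}");
        }
    });
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    tokio::fs::create_dir_all("example_tenant_dir").await?;
    detach_dir(PathBuf::from("example_tenant_dir")).await?;
    Ok(())
}
```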
async fn detach_tenant0(
@@ -599,20 +589,16 @@ async fn detach_tenant0(
tenants: &tokio::sync::RwLock<TenantsMap>,
tenant_id: TenantId,
detach_ignored: bool,
) -> Result<(), TenantStateError> {
let local_files_cleanup_operation = |tenant_id_to_clean| async move {
) -> Result<PathBuf, TenantStateError> {
let tenant_dir_rename_operation = |tenant_id_to_clean| async move {
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
fs::remove_dir_all(&local_tenant_directory)
safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| {
format!("local tenant directory {local_tenant_directory:?} removal")
})?;
Ok(())
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
};
let removal_result =
remove_tenant_from_memory(tenants, tenant_id, local_files_cleanup_operation(tenant_id))
.await;
remove_tenant_from_memory(tenants, tenant_id, tenant_dir_rename_operation(tenant_id)).await;
// Ignored tenants are not present in memory and will bail the removal from memory operation.
// Before returning the error, check for the ignored-tenant case: we only need to clean up its local files then.
@@ -620,10 +606,10 @@ async fn detach_tenant0(
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
if tenant_ignore_mark.exists() {
info!("Detaching an ignored tenant");
local_files_cleanup_operation(tenant_id)
let tmp_path = tenant_dir_rename_operation(tenant_id)
.await
.with_context(|| format!("Ignored tenant {tenant_id} local files cleanup"))?;
return Ok(());
.with_context(|| format!("Ignored tenant {tenant_id} local directory rename"))?;
return Ok(tmp_path);
}
}
@@ -633,12 +619,12 @@ async fn detach_tenant0(
pub async fn load_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
generation: Generation,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue: &DeletionQueue,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || {
tenant_map_insert(tenant_id, || async {
let tenant_path = conf.tenant_path(&tenant_id);
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
if tenant_ignore_mark.exists() {
@@ -649,11 +635,8 @@ pub async fn load_tenant(
let resources = TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client: deletion_queue.new_client(),
};
// TODO: remove the `/load` API once generation support is complete:
// it becomes equivalent to attaching.
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, Generation::none(), resources, None, &TENANTS, ctx)
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx)
.with_context(|| {
format!("Failed to schedule tenant processing in path {tenant_path:?}")
})?;
@@ -721,11 +704,10 @@ pub async fn attach_tenant(
tenant_conf: TenantConfOpt,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: GenericRemoteStorage,
deletion_queue: &DeletionQueue,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || {
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
tenant_map_insert(tenant_id, || async {
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
@@ -739,7 +721,6 @@ pub async fn attach_tenant(
let resources = TenantSharedResources {
broker_client,
remote_storage: Some(remote_storage),
deletion_queue_client: deletion_queue.new_client(),
};
let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
@@ -774,12 +755,13 @@ pub enum TenantMapInsertError {
///
/// NB: the closure should return quickly because the current implementation of the tenants map
/// serializes access through an `RwLock`.
async fn tenant_map_insert<F>(
async fn tenant_map_insert<F, R>(
tenant_id: TenantId,
insert_fn: F,
) -> Result<Arc<Tenant>, TenantMapInsertError>
where
F: FnOnce() -> anyhow::Result<Arc<Tenant>>,
F: FnOnce() -> R,
R: std::future::Future<Output = anyhow::Result<Arc<Tenant>>>,
{
let mut guard = TENANTS.write().await;
let m = match &mut *guard {
@@ -792,7 +774,7 @@ where
tenant_id,
e.get().current_state(),
)),
hash_map::Entry::Vacant(v) => match insert_fn() {
hash_map::Entry::Vacant(v) => match insert_fn().await {
Ok(tenant) => {
v.insert(tenant.clone());
Ok(tenant)
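The signature change above (from `FnOnce() -> anyhow::Result<...>` to `FnOnce() -> R` with `R: Future`) lets `tenant_map_insert` accept async closures and `.await` them while it holds the tenants lock. A minimal sketch of that bound in isolation, with a hypothetical `with_slot` function and an integer payload standing in for `Arc<Tenant>` (tokio and anyhow assumed as dependencies):

```rust
use std::future::Future;

// Accept an async closure and await it; the real function also holds the
// tenants RwLock and checks the map for an existing entry here.
async fn with_slot<F, R>(insert_fn: F) -> anyhow::Result<u64>
where
    F: FnOnce() -> R,
    R: Future<Output = anyhow::Result<u64>>,
{
    insert_fn().await
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Callers now write `|| async { ... }` instead of a plain closure.
    let v = with_slot(|| async { anyhow::Ok(42u64) }).await?;
    println!("inserted: {v}");
    Ok(())
}
```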

View File

@@ -4,10 +4,9 @@ use std::{
sync::atomic::{AtomicUsize, Ordering},
};
use crate::virtual_file::VirtualFile;
fn fsync_path(path: &Path) -> io::Result<()> {
let file = VirtualFile::open(path)?;
// TODO use VirtualFile::fsync_all once we fully go async.
let file = std::fs::File::open(path)?;
file.sync_all()
}

View File

@@ -56,11 +56,9 @@
//! # Consistency
//!
//! To have a consistent remote structure, it's important that uploads and
//! deletions are performed in the right order. For example:
//! - the index file contains a list of layer files, so it must not be uploaded
//! until all the layer files that are in its list have been successfully uploaded.
//! - objects must be removed from the index before being deleted, and that updated
//! index must be written to remote storage before deleting the objects from remote storage.
//! deletions are performed in the right order. For example, the index file
//! contains a list of layer files, so it must not be uploaded until all the
//! layer files that are in its list have been successfully uploaded.
//!
//! The contract between client and its user is that the user is responsible of
//! scheduling operations in an order that keeps the remote consistent as
@@ -72,12 +70,10 @@
//! correct order, and the client will parallelize the operations in a way that
//! is safe.
//!
//! The caller should be careful with deletion, though:
//! - they should not delete local files that have been scheduled for upload but
//! not yet finished uploading. Otherwise the upload will fail. To wait for an
//! upload to finish, use the 'wait_completion' function (more on that later.)
//! - they should not do remote deletions via DeletionQueue without waiting for
//! the latest metadata to upload via RemoteTimelineClient.
//! The caller should be careful with deletion, though. They should not delete
//! local files that have been scheduled for upload but not yet finished uploading.
//! Otherwise the upload will fail. To wait for an upload to finish, use
//! the 'wait_completion' function (more on that later.)
//!
//! All of this relies on the following invariants:
//!
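A toy model of the ordering contract described in this doc comment, with hypothetical operation names; the real client parallelizes operations safely, but the scheduling order (layers first, then the index that lists them, then local deletion only after `wait_completion`) is the part being illustrated:

```rust
use std::collections::VecDeque;

#[derive(Debug)]
enum Op {
    UploadLayer(&'static str),
    UploadIndex(Vec<&'static str>),
    DeleteLocal(&'static str),
}

fn main() {
    let mut queue = VecDeque::new();
    // 1. Layer uploads are scheduled first...
    queue.push_back(Op::UploadLayer("layer_a"));
    queue.push_back(Op::UploadLayer("layer_b"));
    // 2. ...then the index that references them.
    queue.push_back(Op::UploadIndex(vec!["layer_a", "layer_b"]));
    // Drain everything; this stands in for wait_completion().
    while let Some(op) = queue.pop_front() {
        println!("executed {op:?}");
    }
    // 3. Only now is it safe to delete the local copy of an uploaded layer.
    queue.push_back(Op::DeleteLocal("layer_a"));
    while let Some(op) = queue.pop_front() {
        println!("executed {op:?}");
    }
}
```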
@@ -204,11 +200,12 @@
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
mod delete;
mod download;
pub mod index;
mod upload;
use anyhow::{bail, Context};
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
// re-export these
pub use download::{is_temp_download_file, list_remote_timelines};
@@ -229,7 +226,6 @@ use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -238,6 +234,8 @@ use crate::metrics::{
use crate::task_mgr::shutdown_token;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::upload_queue::Delete;
use crate::tenant::TIMELINES_SEGMENT_NAME;
use crate::{
config::PageServerConf,
task_mgr,
@@ -247,7 +245,6 @@ use crate::{
tenant::upload_queue::{
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
},
tenant::TIMELINES_SEGMENT_NAME,
};
use utils::id::{TenantId, TimelineId};
@@ -345,7 +342,12 @@ impl RemoteTimelineClient {
) -> RemoteTimelineClient {
RemoteTimelineClient {
conf,
runtime: BACKGROUND_RUNTIME.handle().to_owned(),
runtime: if cfg!(test) {
// remote_timeline_client.rs tests rely on current-thread runtime
tokio::runtime::Handle::current()
} else {
BACKGROUND_RUNTIME.handle().clone()
},
tenant_id,
timeline_id,
generation,
@@ -457,7 +459,6 @@ impl RemoteTimelineClient {
);
let index_part = download::download_index_part(
self.conf,
&self.storage_impl,
&self.tenant_id,
&self.timeline_id,
@@ -640,36 +641,44 @@ impl RemoteTimelineClient {
/// deletion won't actually be performed until any previously scheduled
/// upload operations, and the index file upload, have completed
/// successfully.
pub async fn schedule_layer_file_deletion(
pub fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: &[LayerFileName],
deletion_queue_client: &DeletionQueueClient,
) -> anyhow::Result<()> {
// Synchronous update of upload queues under mutex
let with_generations = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need to update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone();
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need to update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone();
// Update the remote index file, removing the to-be-deleted files from the index,
// before deleting the actual files.
//
// Once we start removing files from upload_queue.latest_files, there's
// no going back! Otherwise, some of the files would already be removed
// from latest_files, but not yet scheduled for deletion. Use a closure
// to syntactically forbid ? or bail! calls here.
let no_bail_here = || {
// Decorate our list of names with each name's generation, dropping
// names that are unexpectedly missing from our metadata.
let with_generations: Vec<_> = names
.into_iter()
.iter()
.filter_map(|name| {
// Remove from latest_files, learning the file's remote generation in the process
let meta = upload_queue.latest_files.remove(name);
if let Some(meta) = meta {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
Some((name.clone(), meta.generation))
Some((name, meta.generation))
} else {
// This is unexpected: latest_files is meant to be kept up to
// date. We can't delete the layer if we have forgotten what
// generation it was in.
warn!("Deleting layer {name} not found in latest_files list");
// This can only happen if we forgot to schedule the file upload
// before scheduling the delete. Log it because it is a rare/strange
// situation, and in case something is misbehaving, we'd like to know which
// layers experienced this.
info!(
"Deleting layer {name} not found in latest_files list, never uploaded?"
);
None
}
})
@@ -679,27 +688,23 @@ impl RemoteTimelineClient {
self.schedule_index_upload(upload_queue, metadata);
}
with_generations
// schedule the actual deletions
for (name, generation) in with_generations {
let op = UploadOp::Delete(Delete {
file_kind: RemoteOpFileKind::Layer,
layer_file_name: name.clone(),
scheduled_from_timeline_delete: false,
generation,
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!("scheduled layer file deletion {name}");
}
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
};
// Barrier: we must ensure all prior uploads and index writes have landed in S3
// before emitting deletions.
if let Err(e) = self.wait_completion().await {
// This can only fail if upload queue is shut down: if this happens, we do
// not emit any deletions. In this condition (remote client is shut down
// during compaction or GC) we may leak some objects.
bail!("Cannot complete layer file deletions during shutdown ({e})");
}
// Enqueue deletions
deletion_queue_client
.push_layers(
self.tenant_id,
self.timeline_id,
self.generation,
with_generations,
)
.await?;
no_bail_here();
Ok(())
}
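The `no_bail_here` closure above is a compile-time guard: once entries start being removed from `latest_files` there is no going back, so the critical section lives in a closure that returns `()`, which turns any `?` or `bail!` into a compile error. A standalone sketch of the trick, with illustrative names:

```rust
// Once we start mutating the list there must be no early return.
fn remove_doomed(latest_files: &mut Vec<String>) {
    let no_bail_here = || {
        // A `?` or `bail!` here would not compile: the closure returns (), not Result.
        latest_files.retain(|name| name != "doomed-layer");
    };
    no_bail_here();
}

fn main() {
    let mut files = vec!["layer-a".to_string(), "doomed-layer".to_string()];
    remove_doomed(&mut files);
    assert_eq!(files, vec!["layer-a".to_string()]);
    println!("{files:?}");
}
```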
@@ -825,13 +830,12 @@ impl RemoteTimelineClient {
/// Prerequisites: UploadQueue should be in the stopped state and deleted_at should be successfully set.
/// The function deletes layer files one by one, then lists the prefix to see if we leaked anything,
/// deletes the leaked files if any, and proceeds with deletion of the index file at the end.
pub(crate) async fn delete_all(
self: &Arc<Self>,
deletion_queue: &DeletionQueueClient,
) -> anyhow::Result<()> {
pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_and_timeline_id();
let layers: Vec<_> = {
let (mut receiver, deletions_queued) = {
let mut deletions_queued = 0;
let mut locked = self.upload_queue.lock().unwrap();
let stopped = locked.stopped_mut()?;
@@ -843,30 +847,42 @@ impl RemoteTimelineClient {
stopped
.upload_queue_for_deletion
.latest_files
.drain()
.map(|kv| (kv.0, kv.1.generation))
.collect()
.queued_operations
.reserve(stopped.upload_queue_for_deletion.latest_files.len());
// schedule the actual deletions
for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
let op = UploadOp::Delete(Delete {
file_kind: RemoteOpFileKind::Layer,
layer_file_name: name.clone(),
scheduled_from_timeline_delete: true,
generation: meta.generation,
});
self.calls_unfinished_metric_begin(&op);
stopped
.upload_queue_for_deletion
.queued_operations
.push_back(op);
info!("scheduled layer file deletion {name}");
deletions_queued += 1;
}
self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
(
self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
deletions_queued,
)
};
let layer_deletion_count = layers.len();
let layer_paths = layers
.into_iter()
.map(|(layer, generation)| {
remote_layer_path(&self.tenant_id, &self.timeline_id, &layer, generation)
})
.collect();
deletion_queue.push_immediate(layer_paths).await?;
receiver.changed().await.context("upload queue shut down")?;
// Do not delete the index part yet, it is needed for a possible retry. If we removed it first
// and the retry arrived at a different pageserver, there would be no trace of it in remote storage.
let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
// taking the burden of listing all the layers that we already know we should delete.
deletion_queue.flush_immediate().await?;
let remaining = backoff::retry(
|| async {
self.storage_impl
@@ -894,9 +910,17 @@ impl RemoteTimelineClient {
})
.collect();
let not_referenced_count = remaining.len();
if !remaining.is_empty() {
deletion_queue.push_immediate(remaining).await?;
backoff::retry(
|| async { self.storage_impl.delete_objects(&remaining).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_objects",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
)
.await
.context("delete_objects")?;
}
fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -907,14 +931,18 @@ impl RemoteTimelineClient {
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
debug!("enqueuing index part deletion");
deletion_queue
.push_immediate([index_file_path].to_vec())
.await?;
debug!("deleting index part");
// Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
deletion_queue.flush_immediate().await?;
backoff::retry(
|| async { self.storage_impl.delete(&index_file_path).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_index",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
)
.await
.context("delete_index")?;
fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(
@@ -922,7 +950,7 @@ impl RemoteTimelineClient {
))?
});
info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
Ok(())
}
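The `schedule_barrier` plus `receiver.changed().await` pair above is a watch-channel barrier: the deletion path enqueues a barrier operation and then waits until the queue worker has processed everything ahead of it. A minimal sketch of that mechanism on its own, assuming tokio as a dependency and with an illustrative worker closure:

```rust
use tokio::sync::watch;

#[tokio::main]
async fn main() {
    // Scheduling a Barrier op hands the waiter a watch receiver...
    let (tx, mut rx) = watch::channel(());
    let worker = tokio::spawn(async move {
        // ...the worker drains the queued deletions, then marks the barrier as passed.
        tx.send_replace(());
    });
    // The waiter blocks here until the barrier value changes; an error means the
    // queue shut down before the barrier was reached.
    rx.changed().await.expect("upload queue shut down");
    worker.await.unwrap();
    println!("all queued deletions executed");
}
```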
@@ -945,6 +973,10 @@ impl RemoteTimelineClient {
// have finished.
upload_queue.inprogress_tasks.is_empty()
}
UploadOp::Delete(_) => {
// Wait for preceding uploads to finish. Concurrent deletions are OK, though.
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
}
UploadOp::Barrier(_) => upload_queue.inprogress_tasks.is_empty(),
};
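The new `Delete` arm in `can_run` above allows a deletion to start only when every in-flight task is itself a deletion, i.e. uploads must drain first while deletions may overlap with each other. The predicate reduces to a simple count comparison (a sketch, not pageserver code):

```rust
fn delete_can_run(num_inprogress_deletions: usize, inprogress_total: usize) -> bool {
    // All in-flight tasks are deletions (or there are none at all).
    num_inprogress_deletions == inprogress_total
}

fn main() {
    assert!(delete_can_run(0, 0)); // idle queue
    assert!(delete_can_run(2, 2)); // only deletions in flight: OK to add another
    assert!(!delete_can_run(1, 3)); // uploads still running: hold the deletion back
    println!("ok");
}
```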
@@ -972,6 +1004,9 @@ impl RemoteTimelineClient {
UploadOp::UploadMetadata(_, _) => {
upload_queue.num_inprogress_metadata_uploads += 1;
}
UploadOp::Delete(_) => {
upload_queue.num_inprogress_deletions += 1;
}
UploadOp::Barrier(sender) => {
sender.send_replace(());
continue;
@@ -1105,6 +1140,21 @@ impl RemoteTimelineClient {
}
res
}
UploadOp::Delete(delete) => {
let path = &self
.conf
.timeline_path(&self.tenant_id, &self.timeline_id)
.join(delete.layer_file_name.file_name());
delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
delete.file_kind,
RemoteOpKind::Delete,
Arc::clone(&self.metrics),
)
.await
}
UploadOp::Barrier(_) => {
// unreachable. Barrier operations are handled synchronously in
// launch_queued_tasks
@@ -1164,7 +1214,15 @@ impl RemoteTimelineClient {
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
let upload_queue = match upload_queue_guard.deref_mut() {
UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
UploadQueue::Stopped(_) => { None }
UploadQueue::Stopped(stopped) => {
// Special care is needed for deletions: if it was a regular deletion (not scheduled from timeline deletion),
// then stop() already took care of it, so we just return.
// For deletions that come from delete_all we still want to maintain metrics, launch the following tasks, etc.
match &task.op {
UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
_ => None
}
},
UploadQueue::Initialized(qi) => { Some(qi) }
};
@@ -1186,6 +1244,9 @@ impl RemoteTimelineClient {
upload_queue.num_inprogress_metadata_uploads -= 1;
upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
}
UploadOp::Delete(_) => {
upload_queue.num_inprogress_deletions -= 1;
}
UploadOp::Barrier(_) => unreachable!(),
};
@@ -1217,6 +1278,13 @@ impl RemoteTimelineClient {
reason: "metadata uploads are tiny",
},
),
UploadOp::Delete(delete) => (
delete.file_kind,
RemoteOpKind::Delete,
DontTrackSize {
reason: "should we track deletes? positive or negative sign?",
},
),
UploadOp::Barrier(_) => {
// we do not account these
return None;
@@ -1276,6 +1344,7 @@ impl RemoteTimelineClient {
last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::default(),
queued_operations: VecDeque::default(),
};
@@ -1296,7 +1365,9 @@ impl RemoteTimelineClient {
// consistency check
assert_eq!(
qi.num_inprogress_layer_uploads + qi.num_inprogress_metadata_uploads,
qi.num_inprogress_layer_uploads
+ qi.num_inprogress_metadata_uploads
+ qi.num_inprogress_deletions,
qi.inprogress_tasks.len()
);
@@ -1334,13 +1405,13 @@ pub fn remote_layer_path(
tenant_id: &TenantId,
timeline_id: &TimelineId,
layer_file_name: &LayerFileName,
generation: Generation,
layer_meta: &LayerFileMetadata,
) -> RemotePath {
// Generation-aware key format
let path = format!(
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
layer_file_name.file_name(),
generation.get_suffix()
layer_meta.generation.get_suffix()
);
RemotePath::from_string(&path).expect("Failed to construct path")
@@ -1359,6 +1430,30 @@ pub fn remote_index_path(
.expect("Failed to construct path")
}
/// Given the key of an index, parse out the generation part of the name
pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
let file_name = match path.get_path().file_name() {
Some(f) => f,
None => {
// Unexpected: we should be seeing index_part.json paths only
tracing::warn!("Malformed index key {}", path);
return None;
}
};
let file_name_str = match file_name.to_str() {
Some(s) => s,
None => {
tracing::warn!("Malformed index key {:?}", path);
return None;
}
};
match file_name_str.split_once('-') {
Some((_, gen_suffix)) => Generation::parse_suffix(gen_suffix),
None => None,
}
}
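`parse_remote_index_path` recovers the generation from a suffixed object name such as `index_part.json-<hex>`. A self-contained illustration of the same parsing rule, using a plain `u32` in place of `Generation::parse_suffix` and assuming an eight-digit hex suffix:

```rust
fn parse_generation_suffix(file_name: &str) -> Option<u32> {
    // Everything after the first '-' is treated as a hex generation suffix.
    let (_, gen_suffix) = file_name.split_once('-')?;
    u32::from_str_radix(gen_suffix, 16).ok()
}

fn main() {
    assert_eq!(parse_generation_suffix("index_part.json-000000ab"), Some(0xab));
    // A legacy, suffix-less index has no generation.
    assert_eq!(parse_generation_suffix("index_part.json"), None);
    println!("ok");
}
```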
/// Files on the remote storage are stored with paths, relative to the workdir.
/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path.
///
@@ -1366,25 +1461,21 @@ pub fn remote_index_path(
pub fn remote_path(
conf: &PageServerConf,
local_path: &Path,
generation: Option<Generation>,
generation: Generation,
) -> anyhow::Result<RemotePath> {
let stripped = local_path
.strip_prefix(&conf.workdir)
.context("Failed to strip workdir prefix")?;
let suffixed = if let Some(generation) = generation {
format!(
"{0}{1}",
stripped.to_string_lossy(),
generation.get_suffix()
)
} else {
stripped.to_string_lossy().to_string()
};
let suffixed = format!(
"{0}{1}",
stripped.to_string_lossy(),
generation.get_suffix()
);
RemotePath::new(&PathBuf::from(suffixed)).with_context(|| {
format!(
"Failed to resolve remote part of path {:?} for base {:?}",
"to resolve remote part of path {:?} for base {:?}",
local_path, conf.workdir
)
})
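With the `Option<Generation>` parameter gone, `remote_path` now always strips the workdir prefix and appends the generation suffix (a no-generation `Generation` simply contributes an empty suffix). A rough sketch of the construction, assuming a `-{:08x}` suffix format and hypothetical helper names:

```rust
use std::path::Path;

fn remote_key(workdir: &Path, local_path: &Path, generation: u32) -> Option<String> {
    // Strip the workdir prefix, then append the generation suffix.
    let stripped = local_path.strip_prefix(workdir).ok()?;
    Some(format!("{}-{:08x}", stripped.to_string_lossy(), generation))
}

fn main() {
    let key = remote_key(
        Path::new("/data"),
        Path::new("/data/tenants/t1/timelines/tl1/layer_x"),
        5,
    );
    // Some("tenants/t1/timelines/tl1/layer_x-00000005")
    println!("{key:?}");
}
```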
@@ -1395,18 +1486,14 @@ mod tests {
use super::*;
use crate::{
context::RequestContext,
deletion_queue::mock::MockDeletionQueue,
tenant::{
harness::{TenantHarness, TIMELINE_ID},
Generation, Tenant, Timeline,
},
DEFAULT_PG_VERSION,
};
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
use std::{
collections::HashSet,
path::{Path, PathBuf},
};
use std::{collections::HashSet, path::Path};
use utils::lsn::Lsn;
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1463,9 +1550,6 @@ mod tests {
tenant: Arc<Tenant>,
timeline: Arc<Timeline>,
tenant_ctx: RequestContext,
remote_fs_dir: PathBuf,
client: Arc<RemoteTimelineClient>,
deletion_queue: MockDeletionQueue,
}
impl TestSetup {
@@ -1475,57 +1559,44 @@ mod tests {
let harness = TenantHarness::create(test_name)?;
let (tenant, ctx) = harness.load().await;
// create an empty timeline directory
let timeline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
std::fs::create_dir_all(remote_fs_dir)?;
let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
let storage_config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
)
.unwrap(),
max_sync_errors: std::num::NonZeroU32::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
)
.unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let generation = Generation::new(0xdeadbeef);
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
let client = Arc::new(RemoteTimelineClient {
conf: harness.conf,
runtime: tokio::runtime::Handle::current(),
tenant_id: harness.tenant_id,
timeline_id: TIMELINE_ID,
generation,
storage_impl: storage.clone(),
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&harness.tenant_id,
&TIMELINE_ID,
)),
});
let deletion_queue = MockDeletionQueue::new(Some(storage));
Ok(Self {
harness,
tenant,
timeline,
tenant_ctx: ctx,
remote_fs_dir,
client,
deletion_queue,
})
}
/// Construct a RemoteTimelineClient in an arbitrary generation
fn build_client(&self, generation: Generation) -> Arc<RemoteTimelineClient> {
Arc::new(RemoteTimelineClient {
conf: self.harness.conf,
runtime: tokio::runtime::Handle::current(),
tenant_id: self.harness.tenant_id,
timeline_id: TIMELINE_ID,
generation,
storage_impl: self.harness.remote_storage.clone(),
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&self.harness.tenant_id,
&TIMELINE_ID,
)),
})
}
/// A tracing::Span that satisfies remote_timeline_client methods that assert tenant_id
/// and timeline_id are present.
fn span(&self) -> tracing::Span {
tracing::info_span!(
"test",
tenant_id = %self.harness.tenant_id,
timeline_id = %TIMELINE_ID
)
}
}
// Test scheduling
@@ -1545,30 +1616,44 @@ mod tests {
// Schedule another deletion. Check that it's launched immediately.
// Schedule index upload. Check that it's queued
let test_setup = TestSetup::new("upload_scheduling").await.unwrap();
let span = test_setup.span();
let _guard = span.enter();
let TestSetup {
harness,
tenant: _tenant,
timeline: _timeline,
timeline,
tenant_ctx: _tenant_ctx,
remote_fs_dir,
client,
deletion_queue,
} = TestSetup::new("upload_scheduling").await.unwrap();
} = test_setup;
let client = timeline.remote_client.as_ref().unwrap();
// Download back the index.json, and check that the list of files is correct
let initial_index_part = match client.download_index_file().await.unwrap() {
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
};
let initial_layers = initial_index_part
.layer_metadata
.keys()
.map(|f| f.to_owned())
.collect::<HashSet<LayerFileName>>();
let initial_layer = {
assert!(initial_layers.len() == 1);
initial_layers.into_iter().next().unwrap()
};
let timeline_path = harness.timeline_path(&TIMELINE_ID);
println!("workdir: {}", harness.conf.workdir.display());
let remote_timeline_dir =
remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
let remote_timeline_dir = harness
.remote_fs_dir
.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
println!("remote_timeline_dir: {}", remote_timeline_dir.display());
let metadata = dummy_metadata(Lsn(0x10));
client
.init_upload_queue_for_empty_remote(&metadata)
.unwrap();
let generation = Generation::new(0xdeadbeef);
let generation = harness.generation;
// Create a couple of dummy files, schedule upload for them
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
@@ -1649,6 +1734,7 @@ mod tests {
.map(|f| f.to_owned())
.collect(),
&[
&initial_layer.file_name(),
&layer_file_name_1.file_name(),
&layer_file_name_2.file_name(),
],
@@ -1662,68 +1748,37 @@ mod tests {
&LayerFileMetadata::new(content_3.len() as u64, generation),
)
.unwrap();
{
let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap();
assert_eq!(upload_queue.queued_operations.len(), 0);
assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
}
assert_remote_files(
&[
&layer_file_name_1.file_name(),
&layer_file_name_2.file_name(),
"index_part.json",
],
&remote_timeline_dir,
generation,
);
client
.schedule_layer_file_deletion(
&[layer_file_name_1.clone()],
&deletion_queue.new_client(),
)
.await
.schedule_layer_file_deletion(&[layer_file_name_1.clone()])
.unwrap();
{
let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap();
// Deletion schedules upload of the index file via RemoteTimelineClient, and
// deletion of layer files via DeletionQueue. The uploads have all been flushed
// because schedule_layer_file_deletion does a wait_completion before pushing
// to the deletion_queue
assert_eq!(upload_queue.queued_operations.len(), 0);
assert_eq!(upload_queue.inprogress_tasks.len(), 0);
assert_eq!(upload_queue.num_inprogress_layer_uploads, 0);
assert_eq!(
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
0
);
// Deletion schedules upload of the index file, and the file deletion itself
assert!(upload_queue.queued_operations.len() == 2);
assert!(upload_queue.inprogress_tasks.len() == 1);
assert!(upload_queue.num_inprogress_layer_uploads == 1);
assert!(upload_queue.num_inprogress_deletions == 0);
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
}
assert_remote_files(
&[
&initial_layer.file_name(),
&layer_file_name_1.file_name(),
&layer_file_name_2.file_name(),
&layer_file_name_3.file_name(),
"index_part.json",
],
&remote_timeline_dir,
generation,
);
// Finish uploads and deletions
// Finish them
client.wait_completion().await.unwrap();
deletion_queue.pump().await;
// 1 layer was deleted
assert_eq!(deletion_queue.get_executed(), 1);
assert_remote_files(
&[
&initial_layer.file_name(),
&layer_file_name_2.file_name(),
&layer_file_name_3.file_name(),
"index_part.json",
@@ -1740,16 +1795,10 @@ mod tests {
let TestSetup {
harness,
tenant: _tenant,
timeline: _timeline,
client,
timeline,
..
} = TestSetup::new("metrics").await.unwrap();
let metadata = dummy_metadata(Lsn(0x10));
client
.init_upload_queue_for_empty_remote(&metadata)
.unwrap();
let client = timeline.remote_client.as_ref().unwrap();
let timeline_path = harness.timeline_path(&TIMELINE_ID);
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
@@ -1760,11 +1809,20 @@ mod tests {
)
.unwrap();
#[derive(Debug, PartialEq)]
#[derive(Debug, PartialEq, Clone, Copy)]
struct BytesStartedFinished {
started: Option<usize>,
finished: Option<usize>,
}
impl std::ops::Add for BytesStartedFinished {
type Output = Self;
fn add(self, rhs: Self) -> Self::Output {
Self {
started: self.started.map(|v| v + rhs.started.unwrap_or(0)),
finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)),
}
}
}
let get_bytes_started_stopped = || {
let started = client
.metrics
@@ -1781,66 +1839,140 @@ mod tests {
};
// Test
tracing::info!("now doing actual test");
let generation = Generation::new(0xdeadbeef);
let init = get_bytes_started_stopped();
let actual_a = get_bytes_started_stopped();
client
.schedule_layer_file_upload(
&layer_file_name_1,
&LayerFileMetadata::new(content_1.len() as u64, generation),
&LayerFileMetadata::new(content_1.len() as u64, harness.generation),
)
.unwrap();
let pre = get_bytes_started_stopped();
let actual_b = get_bytes_started_stopped();
client.wait_completion().await.unwrap();
let post = get_bytes_started_stopped();
let actual_c = get_bytes_started_stopped();
// Validate
assert_eq!(
init,
BytesStartedFinished {
started: None,
finished: None
}
);
assert_eq!(
pre,
BytesStartedFinished {
let expected_b = actual_a
+ BytesStartedFinished {
started: Some(content_1.len()),
// assert that the _finished metric is created eagerly so that subtractions work on first sample
finished: Some(0),
}
);
assert_eq!(
post,
BytesStartedFinished {
};
assert_eq!(actual_b, expected_b);
let expected_c = actual_a
+ BytesStartedFinished {
started: Some(content_1.len()),
finished: Some(content_1.len())
}
);
finished: Some(content_1.len()),
};
assert_eq!(actual_c, expected_c);
}
// #[tokio::test]
// async fn index_part_download() {
// let TestSetup {
// harness,
// tenant: _tenant,
// timeline: _timeline,
// client,
// ..
// } = TestSetup::new("index_part_download").await.unwrap();
async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
// An empty IndexPart, just sufficient to ensure deserialization will succeed
let example_metadata = TimelineMetadata::example();
let example_index_part = IndexPart::new(
HashMap::new(),
example_metadata.disk_consistent_lsn(),
example_metadata,
);
// let example_index_part = IndexPart {
// version: 3,
// timeline_layers: HashSet::new(),
// layer_metadata:
let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();
// }
let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID);
let remote_timeline_dir = test_state.harness.remote_fs_dir.join(
timeline_path
.strip_prefix(&test_state.harness.conf.workdir)
.unwrap(),
);
// }
std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
let index_path = test_state.harness.remote_fs_dir.join(
remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(),
);
eprintln!("Writing {}", index_path.display());
std::fs::write(&index_path, index_part_bytes).unwrap();
example_index_part
}
/// Assert that when a RemoteTimelineclient in generation `get_generation` fetches its
/// index, the IndexPart returned is equal to `expected`
async fn assert_got_index_part(
test_state: &TestSetup,
get_generation: Generation,
expected: &IndexPart,
) {
let client = test_state.build_client(get_generation);
let download_r = client
.download_index_file()
.await
.expect("download should always succeed");
assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
match download_r {
MaybeDeletedIndexPart::IndexPart(index_part) => {
assert_eq!(&index_part, expected);
}
MaybeDeletedIndexPart::Deleted(_index_part) => panic!("Test doesn't set deleted_at"),
}
}
#[tokio::test]
async fn index_part_download_simple() -> anyhow::Result<()> {
let test_state = TestSetup::new("index_part_download_simple").await.unwrap();
let span = test_state.span();
let _guard = span.enter();
// Simple case: we are in generation N, load the index from generation N - 1
let generation_n = 5;
let injected = inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected).await;
Ok(())
}
#[tokio::test]
async fn index_part_download_ordering() -> anyhow::Result<()> {
let test_state = TestSetup::new("index_part_download_ordering")
.await
.unwrap();
let span = test_state.span();
let _guard = span.enter();
// A generation-less IndexPart exists in the bucket, we should find it
let generation_n = 5;
let injected_none = inject_index_part(&test_state, Generation::none()).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_none).await;
// If a more recent-than-none generation exists, we should prefer to load that
let injected_1 = inject_index_part(&test_state, Generation::new(1)).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
// If a more-recent-than-me generation exists, we should ignore it.
let _injected_10 = inject_index_part(&test_state, Generation::new(10)).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
// If a directly previous generation exists, _and_ an index exists in my own
// generation, I should prefer my own generation.
let _injected_prev =
inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
let injected_current = inject_index_part(&test_state, Generation::new(generation_n)).await;
assert_got_index_part(
&test_state,
Generation::new(generation_n),
&injected_current,
)
.await;
Ok(())
}
}

View File

@@ -0,0 +1,34 @@
//! Helper functions to delete files from remote storage with a RemoteStorage
use anyhow::Context;
use std::path::Path;
use tracing::debug;
use remote_storage::GenericRemoteStorage;
use crate::{
config::PageServerConf,
tenant::{remote_timeline_client::remote_path, Generation},
};
pub(super) async fn delete_layer<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
local_layer_path: &'a Path,
generation: Generation,
) -> anyhow::Result<()> {
fail::fail_point!("before-delete-layer", |_| {
anyhow::bail!("failpoint before-delete-layer")
});
debug!("Deleting layer from remote storage: {local_layer_path:?}",);
let path_to_delete = remote_path(conf, local_layer_path, generation)?;
// We don't want to print an error if the delete failed if the file has
// already been deleted. Thankfully, in this situation S3 already
// does not yield an error. While OS-provided local file system APIs do yield
// errors, we avoid them in the `LocalFs` wrapper.
storage
.delete(&path_to_delete)
.await
.with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
}

View File

@@ -19,12 +19,15 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::Generation;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use remote_storage::{DownloadError, GenericRemoteStorage};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use super::index::{IndexPart, LayerFileMetadata};
use super::{remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
use super::{
parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
};
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
@@ -47,12 +50,7 @@ pub async fn download_layer_file<'a>(
.timeline_path(&tenant_id, &timeline_id)
.join(layer_file_name.file_name());
let remote_path = remote_layer_path(
&tenant_id,
&timeline_id,
layer_file_name,
layer_metadata.generation,
);
let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
@@ -69,33 +67,43 @@ pub async fn download_layer_file<'a>(
let (mut destination_file, bytes_amount) = download_retry(
|| async {
// TODO: this doesn't use the cached fd for some reason?
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
format!(
"create a destination file for layer '{}'",
temp_file_path.display()
)
})
.map_err(DownloadError::Other)?;
let mut download = storage.download(&remote_path).await.with_context(|| {
format!(
let mut destination_file = fs::File::create(&temp_file_path)
.await
.with_context(|| {
format!(
"create a destination file for layer '{}'",
temp_file_path.display()
)
})
.map_err(DownloadError::Other)?;
let mut download = storage
.download(&remote_path)
.await
.with_context(|| {
format!(
"open a download stream for layer with remote storage path '{remote_path:?}'"
)
})
.map_err(DownloadError::Other)?;
let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file))
.await
.map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
.with_context(|| {
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
})
.map_err(DownloadError::Other)?;
Ok((destination_file, bytes_amount))
let bytes_amount = tokio::time::timeout(
MAX_DOWNLOAD_DURATION,
tokio::io::copy(&mut download.download_stream, &mut destination_file),
)
.await
.map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
.with_context(|| {
format!(
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
)
})
.map_err(DownloadError::Other)?;
Ok((destination_file, bytes_amount))
},
&format!("download {remote_path:?}"),
).await?;
)
.await?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations
@@ -108,12 +116,7 @@ pub async fn download_layer_file<'a>(
destination_file
.flush()
.await
.with_context(|| {
format!(
"failed to flush source file at {}",
temp_file_path.display()
)
})
.with_context(|| format!("flush source file at {}", temp_file_path.display()))
.map_err(DownloadError::Other)?;
let expected = layer_metadata.file_size();
@@ -144,17 +147,12 @@ pub async fn download_layer_file<'a>(
fs::rename(&temp_file_path, &local_path)
.await
.with_context(|| {
format!(
"Could not rename download layer file to {}",
local_path.display(),
)
})
.with_context(|| format!("rename download layer file to {}", local_path.display(),))
.map_err(DownloadError::Other)?;
crashsafe::fsync_async(&local_path)
.await
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
.with_context(|| format!("fsync layer file {}", local_path.display(),))
.map_err(DownloadError::Other)?;
tracing::debug!("download complete: {}", local_path.display());
@@ -205,9 +203,9 @@ pub async fn list_remote_timelines(
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
})?;
let timeline_id: TimelineId = object_name.parse().with_context(|| {
format!("failed to parse object name into timeline id '{object_name}'")
})?;
let timeline_id: TimelineId = object_name
.parse()
.with_context(|| format!("parse object name into timeline id '{object_name}'"))?;
// list_prefixes is assumed to return unique names. Ensure this here.
// NB: it's safer to bail out than warn-log this because the pageserver
@@ -225,7 +223,6 @@ pub async fn list_remote_timelines(
}
async fn do_download_index_part(
local_path: &Path,
storage: &GenericRemoteStorage,
tenant_id: &TenantId,
timeline_id: &TimelineId,
@@ -234,83 +231,92 @@ async fn do_download_index_part(
let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
let index_part_bytes = download_retry(
|| storage.download_all(&remote_path),
|| async {
let mut index_part_download = storage.download(&remote_path).await?;
let mut index_part_bytes = Vec::new();
tokio::io::copy(
&mut index_part_download.download_stream,
&mut index_part_bytes,
)
.await
.with_context(|| format!("download index part at {remote_path:?}"))
.map_err(DownloadError::Other)?;
Ok(index_part_bytes)
},
&format!("download {remote_path:?}"),
)
.await?;
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
.with_context(|| format!("Failed to deserialize index part file into file {local_path:?}"))
.with_context(|| format!("download index part file at {remote_path:?}"))
.map_err(DownloadError::Other)?;
Ok(index_part)
}
/// index_part.json objects are suffixed with a generation number, so we cannot
/// directly GET the latest index part without doing some probing.
///
/// In this function we probe for the most recent index in a generation <= our current generation.
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
pub(super) async fn download_index_part(
conf: &'static PageServerConf,
storage: &GenericRemoteStorage,
tenant_id: &TenantId,
timeline_id: &TimelineId,
my_generation: Generation,
) -> Result<IndexPart, DownloadError> {
let local_path = conf
.metadata_path(tenant_id, timeline_id)
.with_file_name(IndexPart::FILE_NAME);
debug_assert_current_span_has_tenant_and_timeline_id();
if my_generation.is_none() {
// Operating without generations: just fetch the generation-less path
return do_download_index_part(&local_path, storage, tenant_id, timeline_id, my_generation)
.await;
return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
}
let previous_gen = my_generation.previous();
let r_previous =
do_download_index_part(&local_path, storage, tenant_id, timeline_id, previous_gen).await;
match r_previous {
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote
// index in our generation.
//
// This is an optimization to avoid doing the listing for the general case below.
let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
match res {
Ok(index_part) => {
tracing::debug!("Found index_part from previous generation {previous_gen}");
tracing::debug!(
"Found index_part from current generation (this is a stale attachment)"
);
return Ok(index_part);
}
Err(e) => {
if matches!(e, DownloadError::NotFound) {
tracing::debug!("No index_part found from previous generation {previous_gen}, falling back to listing");
} else {
return Err(e);
}
}
Err(DownloadError::NotFound) => {}
Err(e) => return Err(e),
};
/// Given the key of an index, parse out the generation part of the name
fn parse_generation(path: RemotePath) -> Option<Generation> {
let path = path.take();
let file_name = match path.file_name() {
Some(f) => f,
None => {
// Unexpected: we should be seeing index_part.json paths only
tracing::warn!("Malformed index key {0}", path.display());
return None;
}
};
let file_name_str = match file_name.to_str() {
Some(s) => s,
None => {
tracing::warn!("Malformed index key {0}", path.display());
return None;
}
};
match file_name_str.split_once("-") {
Some((_, gen_suffix)) => u32::from_str_radix(gen_suffix, 16)
.map(|g| Generation::new(g))
.ok(),
None => None,
// Typical case: the previous generation of this tenant was running healthily, and had uploaded
// an index part. We may safely start from this index without doing a listing, because:
// - We checked for current generation case above
// - generations > my_generation are to be ignored
// - any other indices that exist would have an older generation than `previous_gen`, and
// we want to find the most recent index from a previous generation.
//
// This is an optimization to avoid doing the listing for the general case below.
let res =
do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
match res {
Ok(index_part) => {
tracing::debug!("Found index_part from previous generation");
return Ok(index_part);
}
Err(DownloadError::NotFound) => {
tracing::debug!(
"No index_part found from previous generation, falling back to listing"
);
}
Err(e) => {
return Err(e);
}
}
// Fallback: we did not find an index_part.json from the previous generation, so
// we will list all the index_part objects and pick the most recent.
// General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
// objects, and select the highest one with a generation <= my_generation.
let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none());
let indices = backoff::retry(
|| async { storage.list_files(Some(&index_prefix)).await },
@@ -324,38 +330,26 @@ pub(super) async fn download_index_part(
}),
)
.await
.map_err(|e| DownloadError::Other(e))?;
.map_err(DownloadError::Other)?;
let mut generations: Vec<_> = indices
// General case logic for which index to use: the latest index whose generation
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
let max_previous_generation = indices
.into_iter()
.filter_map(|k| parse_generation(k))
.filter_map(parse_remote_index_path)
.filter(|g| g <= &my_generation)
.collect();
.max();
generations.sort();
match generations.last() {
match max_previous_generation {
Some(g) => {
tracing::debug!("Found index_part in generation {g} (my generation {my_generation})");
do_download_index_part(&local_path, storage, tenant_id, timeline_id, *g).await
tracing::debug!("Found index_part in generation {g:?}");
do_download_index_part(storage, tenant_id, timeline_id, g).await
}
None => {
// This is not an error: the timeline may be newly created, or we may be
// upgrading and have no historical index_part with a generation suffix.
// Fall back to trying to load the un-suffixed index_part.json.
tracing::info!(
"No index_part.json-* found when loading {}/{} in generation {}",
tenant_id,
timeline_id,
my_generation
);
return do_download_index_part(
&local_path,
storage,
tenant_id,
timeline_id,
Generation::none(),
)
.await;
// Migration from legacy pre-generation state: we have a generation but no prior
// attached pageservers did. Try to load from a no-generation path.
tracing::info!("No index_part.json* found");
do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
}
}
}
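The probing order in `download_index_part` is: try our own generation (stale attachment), then the directly previous one, and otherwise list the bucket and take the newest index whose generation is not greater than ours, falling back to the legacy suffix-less index if nothing qualifies. The final selection rule boils down to a max-with-cutoff (sketch with plain integers standing in for `Generation`):

```rust
fn select_index_generation(available: &[u32], my_generation: u32) -> Option<u32> {
    // Newest generation that is <= our own; None means "fall back to the
    // legacy, suffix-less index_part.json".
    available.iter().copied().filter(|g| *g <= my_generation).max()
}

fn main() {
    // We are generation 5; generations 1 and 10 exist remotely: use 1.
    assert_eq!(select_index_generation(&[1, 10], 5), Some(1));
    // Only a newer generation exists: ignore it and fall back.
    assert_eq!(select_index_generation(&[10], 5), None);
    println!("ok");
}
```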

View File

@@ -2,7 +2,7 @@
//! Able to restore itself from the storage index parts, which are located in every timeline's remote directory and contain all data about
//! remote timeline layers and their metadata.
use std::collections::{HashMap, HashSet};
use std::collections::HashMap;
use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
@@ -69,10 +69,6 @@ pub struct IndexPart {
#[serde(skip_serializing_if = "Option::is_none")]
pub deleted_at: Option<NaiveDateTime>,
/// Legacy field: equal to the keys of `layer_metadata`, only written out for forward compat
#[serde(default, skip_deserializing)]
timeline_layers: HashSet<LayerFileName>,
/// Per layer file name metadata, which can be present for a present or missing layer file.
///
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
@@ -98,7 +94,12 @@ impl IndexPart {
/// - 2: added `deleted_at`
/// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
/// is always generated from the keys of `layer_metadata`)
const LATEST_VERSION: usize = 3;
/// - 4: timeline_layers is fully removed.
const LATEST_VERSION: usize = 4;
// Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &[usize] = &[1, 2, 3, 4];
pub const FILE_NAME: &'static str = "index_part.json";
pub fn new(
@@ -106,24 +107,30 @@ impl IndexPart {
disk_consistent_lsn: Lsn,
metadata: TimelineMetadata,
) -> Self {
let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
for (remote_name, metadata) in &layers_and_metadata {
timeline_layers.insert(remote_name.to_owned());
let metadata = IndexLayerMetadata::from(metadata);
layer_metadata.insert(remote_name.to_owned(), metadata);
}
// Transform LayerFileMetadata into IndexLayerMetadata
let layer_metadata = layers_and_metadata
.into_iter()
.map(|(k, v)| (k, IndexLayerMetadata::from(v)))
.collect();
Self {
version: Self::LATEST_VERSION,
timeline_layers,
layer_metadata,
disk_consistent_lsn,
metadata,
deleted_at: None,
}
}
pub fn get_version(&self) -> usize {
self.version
}
/// If you want this under normal operations, read it from self.metadata:
/// this method is just for the scrubber to use when validating an index.
pub fn get_disk_consistent_lsn(&self) -> Lsn {
self.disk_consistent_lsn
}
}
impl TryFrom<&UploadQueueInitialized> for IndexPart {
@@ -144,15 +151,15 @@ impl TryFrom<&UploadQueueInitialized> for IndexPart {
/// Serialized form of [`LayerFileMetadata`].
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct IndexLayerMetadata {
pub(super) file_size: u64,
pub file_size: u64,
#[serde(default = "Generation::none")]
#[serde(skip_serializing_if = "Generation::is_none")]
pub(super) generation: Generation,
}
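The serde attributes on `generation` above give forward and backward compatibility: older index files without the field deserialize to `Generation::none()`, and a none generation is omitted when serializing. A standalone sketch of the same attribute combination, using a `u32` with zero as the 'none' value and assuming serde/serde_json as dependencies:

```rust
use serde::{Deserialize, Serialize};

fn is_zero(g: &u32) -> bool {
    *g == 0
}

#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct Meta {
    file_size: u64,
    #[serde(default)]
    #[serde(skip_serializing_if = "is_zero")]
    generation: u32,
}

fn main() {
    // Old JSON without "generation" still parses, defaulting to the 'none' value...
    let old: Meta = serde_json::from_str(r#"{"file_size":123}"#).unwrap();
    assert_eq!(old.generation, 0);
    // ...and a 'none' generation is omitted on the way back out.
    assert_eq!(serde_json::to_string(&old).unwrap(), r#"{"file_size":123}"#);
    println!("ok");
}
```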
impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
fn from(other: &'_ LayerFileMetadata) -> Self {
impl From<LayerFileMetadata> for IndexLayerMetadata {
fn from(other: LayerFileMetadata) -> Self {
IndexLayerMetadata {
file_size: other.file_size,
generation: other.generation,
@@ -180,7 +187,6 @@ mod tests {
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
timeline_layers: HashSet::new(),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
@@ -219,7 +225,6 @@ mod tests {
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
timeline_layers: HashSet::new(),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
@@ -259,7 +264,6 @@ mod tests {
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 2,
timeline_layers: HashSet::new(),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
@@ -294,7 +298,6 @@ mod tests {
let expected = IndexPart {
version: 1,
timeline_layers: HashSet::new(),
layer_metadata: HashMap::new(),
disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[
@@ -327,4 +330,41 @@ mod tests {
assert_eq!(empty_layers_parsed, expected);
}
#[test]
fn v4_indexpart_is_parsed() {
let example = r#"{
"version":4,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"deleted_at": "2023-07-31T09:00:00.123"
}"#;
let expected = IndexPart {
version: 4,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
generation: Generation::none()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
}

View File

@@ -31,8 +31,8 @@ pub(super) async fn upload_index_part<'a>(
bail!("failpoint before-upload-index")
});
let index_part_bytes = serde_json::to_vec(&index_part)
.context("Failed to serialize index part file into bytes")?;
let index_part_bytes =
serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
let index_part_size = index_part_bytes.len();
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
@@ -40,7 +40,7 @@ pub(super) async fn upload_index_part<'a>(
storage
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
.await
.with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'"))
.with_context(|| format!("upload index part for '{tenant_id} / {timeline_id}'"))
}
/// Attempts to upload given layer files.
@@ -58,7 +58,7 @@ pub(super) async fn upload_timeline_layer<'a>(
bail!("failpoint before-upload-layer")
});
let storage_path = remote_path(conf, source_path, Some(generation))?;
let storage_path = remote_path(conf, source_path, generation)?;
let source_file_res = fs::File::open(&source_path).await;
let source_file = match source_file_res {
Ok(source_file) => source_file,
@@ -71,16 +71,15 @@ pub(super) async fn upload_timeline_layer<'a>(
info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
return Ok(());
}
Err(e) => Err(e)
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?,
Err(e) => {
Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))?
}
};
let fs_size = source_file
.metadata()
.await
.with_context(|| {
format!("Failed to get the source file metadata for layer {source_path:?}")
})?
.with_context(|| format!("get the source file metadata for layer {source_path:?}"))?
.len();
let metadata_size = known_metadata.file_size();
@@ -88,19 +87,13 @@ pub(super) async fn upload_timeline_layer<'a>(
bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
}
let fs_size = usize::try_from(fs_size).with_context(|| {
format!("File {source_path:?} size {fs_size} could not be converted to usize")
})?;
let fs_size = usize::try_from(fs_size)
.with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
storage
.upload(source_file, fs_size, &storage_path, None)
.await
.with_context(|| {
format!(
"Failed to upload a layer from local path '{}'",
source_path.display()
)
})?;
.with_context(|| format!("upload layer from local path '{}'", source_path.display()))?;
Ok(())
}
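
The error-context hunks above drop the `Failed to ...` prefixes. A hedged sketch of the motivation, assuming nothing beyond stock `anyhow`: contexts are rendered as a chain, so imperative, prefix-free messages avoid repetitive "Failed to X: Failed to Y" wording (paths here are illustrative):

```
use anyhow::Context;

fn open_layer(path: &str) -> anyhow::Result<std::fs::File> {
    // Prefix-free, imperative context; the chain itself conveys the failure.
    std::fs::File::open(path).with_context(|| format!("open a source file for layer {path:?}"))
}

fn main() {
    let err = open_layer("/does/not/exist").context("upload layer").unwrap_err();
    // Alternate formatting joins the chain, e.g.:
    // upload layer: open a source file for layer "/does/not/exist": No such file or directory (os error 2)
    eprintln!("{err:#}");
}
```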

View File

@@ -31,7 +31,7 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
@@ -45,8 +45,7 @@ use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs::{self, File};
use std::io::{BufWriter, Write};
use std::io::{Seek, SeekFrom};
use std::io::SeekFrom;
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
@@ -219,7 +218,7 @@ pub struct DeltaLayerInner {
index_root_blk: u32,
/// Reader object for reading blocks from the file.
file: FileBlockReader<VirtualFile>,
file: FileBlockReader,
}
impl AsRef<DeltaLayerInner> for DeltaLayerInner {
@@ -583,14 +582,14 @@ struct DeltaLayerWriterInner {
tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
blob_writer: BlobWriter<true>,
}
impl DeltaLayerWriterInner {
///
/// Start building a new delta layer.
///
fn new(
async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -605,11 +604,10 @@ impl DeltaLayerWriterInner {
// FIXME: throw an error instead?
let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range);
let mut file = VirtualFile::create(&path)?;
let mut file = VirtualFile::create(&path).await?;
// make room for the header block
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
let buf_writer = BufWriter::new(file);
let blob_writer = WriteBlobWriter::new(buf_writer, PAGE_SZ as u64);
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
// Initialize the b-tree index builder
let block_buf = BlockBuf::new();
@@ -632,11 +630,12 @@ impl DeltaLayerWriterInner {
///
/// The values must be appended in key, lsn order.
///
fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
.await
}
fn put_value_bytes(
async fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
@@ -645,7 +644,7 @@ impl DeltaLayerWriterInner {
) -> anyhow::Result<()> {
assert!(self.lsn_range.start <= lsn);
let off = self.blob_writer.write_blob(val)?;
let off = self.blob_writer.write_blob(val).await?;
let blob_ref = BlobRef::new(off, will_init);
@@ -662,18 +661,18 @@ impl DeltaLayerWriterInner {
///
/// Finish writing the delta layer.
///
fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
async fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
let buf_writer = self.blob_writer.into_inner();
let mut file = buf_writer.into_inner()?;
let mut file = self.blob_writer.into_inner().await?;
// Write out the index
let (index_root_blk, block_buf) = self.tree.finish()?;
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?;
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
.await?;
for buf in block_buf.blocks {
file.write_all(buf.as_ref())?;
file.write_all(buf.as_ref()).await?;
}
assert!(self.lsn_range.start < self.lsn_range.end);
// Fill in the summary on blk 0
@@ -687,11 +686,22 @@ impl DeltaLayerWriterInner {
index_start_blk,
index_root_blk,
};
file.seek(SeekFrom::Start(0))?;
Summary::ser_into(&summary, &mut file)?;
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
Summary::ser_into(&summary, &mut buf)?;
if buf.spilled() {
// This is bad as we only have one free block for the summary
warn!(
"Used more than one page size for summary buffer: {}",
buf.len()
);
}
file.seek(SeekFrom::Start(0)).await?;
file.write_all(&buf).await?;
let metadata = file
.metadata()
.await
.context("get file metadata to determine size")?;
// 5GB limit for objects without multipart upload (which we don't want to use)
@@ -722,7 +732,7 @@ impl DeltaLayerWriterInner {
};
// fsync the file
file.sync_all()?;
file.sync_all().await?;
// Rename the file to its final name
//
// Note: This overwrites any existing file. There shouldn't be any.
@@ -774,7 +784,7 @@ impl DeltaLayerWriter {
///
/// Start building a new delta layer.
///
pub fn new(
pub async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -782,13 +792,10 @@ impl DeltaLayerWriter {
lsn_range: Range<Lsn>,
) -> anyhow::Result<Self> {
Ok(Self {
inner: Some(DeltaLayerWriterInner::new(
conf,
timeline_id,
tenant_id,
key_start,
lsn_range,
)?),
inner: Some(
DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range)
.await?,
),
})
}
@@ -797,11 +804,11 @@ impl DeltaLayerWriter {
///
/// The values must be appended in key, lsn order.
///
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_value(key, lsn, val)
pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_value(key, lsn, val).await
}
pub fn put_value_bytes(
pub async fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
@@ -812,6 +819,7 @@ impl DeltaLayerWriter {
.as_mut()
.unwrap()
.put_value_bytes(key, lsn, val, will_init)
.await
}
pub fn size(&self) -> u64 {
@@ -821,21 +829,18 @@ impl DeltaLayerWriter {
///
/// Finish writing the delta layer.
///
pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
self.inner.take().unwrap().finish(key_end)
pub async fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
self.inner.take().unwrap().finish(key_end).await
}
}
impl Drop for DeltaLayerWriter {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
match inner.blob_writer.into_inner().into_inner() {
Ok(vfile) => vfile.remove(),
Err(err) => warn!(
"error while flushing buffer of image layer temporary file: {}",
err
),
}
// We want to remove the virtual file here, so it's fine not to have
// completely flushed unwritten data.
let vfile = inner.blob_writer.into_inner_no_flush();
vfile.remove();
}
}
}
@@ -846,6 +851,7 @@ impl DeltaLayerInner {
summary: Option<Summary>,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.await
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
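
The summary hunks in this file switch from serializing straight into the file to serializing into an inline, page-sized buffer and warning when it spills. A standalone sketch of that pattern, assuming only the `smallvec` crate (payload and logging are illustrative):

```
use smallvec::SmallVec;

const PAGE_SZ: usize = 8192; // one reserved block for the summary

fn summary_block(serialized_summary: &[u8]) -> SmallVec<[u8; PAGE_SZ]> {
    let mut buf: SmallVec<[u8; PAGE_SZ]> = SmallVec::new();
    buf.extend_from_slice(serialized_summary);
    if buf.spilled() {
        // The on-disk format reserves exactly one block for the summary,
        // so needing heap allocation means it no longer fits.
        eprintln!("summary used {} bytes, more than one page", buf.len());
    }
    buf
}

fn main() {
    assert!(!summary_block(&[0u8; 128]).spilled());
    assert!(summary_block(&vec![0u8; PAGE_SZ + 1]).spilled());
}
```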

View File

@@ -212,7 +212,7 @@ pub enum LayerFileName {
}
impl LayerFileName {
pub(crate) fn file_name(&self) -> String {
pub fn file_name(&self) -> String {
self.to_string()
}

View File

@@ -27,7 +27,7 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, KEY_SIZE};
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
@@ -42,8 +42,7 @@ use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs::{self, File};
use std::io::Write;
use std::io::{Seek, SeekFrom};
use std::io::SeekFrom;
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::path::{Path, PathBuf};
@@ -155,7 +154,7 @@ pub struct ImageLayerInner {
lsn: Lsn,
/// Reader object for reading blocks from the file.
file: FileBlockReader<VirtualFile>,
file: FileBlockReader,
}
impl std::fmt::Debug for ImageLayerInner {
@@ -439,6 +438,7 @@ impl ImageLayerInner {
summary: Option<Summary>,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.await
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0).await?;
@@ -511,7 +511,7 @@ struct ImageLayerWriterInner {
key_range: Range<Key>,
lsn: Lsn,
blob_writer: WriteBlobWriter<VirtualFile>,
blob_writer: BlobWriter<false>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
}
@@ -519,7 +519,7 @@ impl ImageLayerWriterInner {
///
/// Start building a new image layer.
///
fn new(
async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -541,10 +541,11 @@ impl ImageLayerWriterInner {
let mut file = VirtualFile::open_with_options(
&path,
std::fs::OpenOptions::new().write(true).create_new(true),
)?;
)
.await?;
// make room for the header block
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
// Initialize the b-tree index builder
let block_buf = BlockBuf::new();
@@ -569,9 +570,9 @@ impl ImageLayerWriterInner {
///
/// The page versions must be appended in blknum order.
///
fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let off = self.blob_writer.write_blob(img)?;
let off = self.blob_writer.write_blob(img).await?;
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
@@ -583,17 +584,18 @@ impl ImageLayerWriterInner {
///
/// Finish writing the image layer.
///
fn finish(self) -> anyhow::Result<ImageLayer> {
async fn finish(self) -> anyhow::Result<ImageLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
let mut file = self.blob_writer.into_inner();
// Write out the index
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?;
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
.await?;
let (index_root_blk, block_buf) = self.tree.finish()?;
for buf in block_buf.blocks {
file.write_all(buf.as_ref())?;
file.write_all(buf.as_ref()).await?;
}
// Fill in the summary on blk 0
@@ -607,11 +609,22 @@ impl ImageLayerWriterInner {
index_start_blk,
index_root_blk,
};
file.seek(SeekFrom::Start(0))?;
Summary::ser_into(&summary, &mut file)?;
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
Summary::ser_into(&summary, &mut buf)?;
if buf.spilled() {
// This is bad as we only have one free block for the summary
warn!(
"Used more than one page size for summary buffer: {}",
buf.len()
);
}
file.seek(SeekFrom::Start(0)).await?;
file.write_all(&buf).await?;
let metadata = file
.metadata()
.await
.context("get metadata to determine file size")?;
let desc = PersistentLayerDesc::new_img(
@@ -634,7 +647,7 @@ impl ImageLayerWriterInner {
};
// fsync the file
file.sync_all()?;
file.sync_all().await?;
// Rename the file to its final name
//
@@ -687,7 +700,7 @@ impl ImageLayerWriter {
///
/// Start building a new image layer.
///
pub fn new(
pub async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -695,13 +708,9 @@ impl ImageLayerWriter {
lsn: Lsn,
) -> anyhow::Result<ImageLayerWriter> {
Ok(Self {
inner: Some(ImageLayerWriterInner::new(
conf,
timeline_id,
tenant_id,
key_range,
lsn,
)?),
inner: Some(
ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?,
),
})
}
@@ -710,15 +719,15 @@ impl ImageLayerWriter {
///
/// The page versions must be appended in blknum order.
///
pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_image(key, img)
pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_image(key, img).await
}
///
/// Finish writing the image layer.
///
pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
self.inner.take().unwrap().finish()
pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
self.inner.take().unwrap().finish().await
}
}

View File

@@ -236,7 +236,7 @@ impl InMemoryLayer {
///
/// Create a new, empty, in-memory layer
///
pub fn create(
pub async fn create(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -244,7 +244,7 @@ impl InMemoryLayer {
) -> Result<InMemoryLayer> {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
let file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
Ok(InMemoryLayer {
conf,
@@ -333,7 +333,8 @@ impl InMemoryLayer {
self.tenant_id,
Key::MIN,
self.start_lsn..end_lsn,
)?;
)
.await?;
let mut buf = Vec::new();
@@ -348,11 +349,13 @@ impl InMemoryLayer {
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf).await?;
let will_init = Value::des(&buf)?.will_init();
delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
delta_layer_writer
.put_value_bytes(key, *lsn, &buf, will_init)
.await?;
}
}
let delta_layer = delta_layer_writer.finish(Key::MAX)?;
let delta_layer = delta_layer_writer.finish(Key::MAX).await?;
Ok(delta_layer)
}
}

View File

@@ -74,7 +74,7 @@ impl Layer for RemoteLayer {
_reconstruct_state: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
bail!("layer {self} needs to be downloaded");
Err(anyhow::anyhow!("layer {self} needs to be downloaded"))
}
}

View File

@@ -102,6 +102,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let started_at = Instant::now();
let sleep_duration = if period == Duration::ZERO {
#[cfg(not(feature = "testing"))]
info!("automatic compaction is disabled");
// check again in 10 seconds, in case it's been enabled again.
Duration::from_secs(10)
@@ -166,6 +167,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let gc_horizon = tenant.get_gc_horizon();
let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
#[cfg(not(feature = "testing"))]
info!("automatic GC is disabled");
// check again in 10 seconds, in case it's been enabled again.
Duration::from_secs(10)
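
The two hunks above attach `#[cfg(not(feature = "testing"))]` to a single statement, so the info line is compiled out of test builds while the surrounding expression that picks the sleep duration stays identical. A small sketch of the same trick (the function is illustrative, only the feature name comes from the diff):

```
use std::time::Duration;

fn sleep_duration(period: Duration) -> Duration {
    if period == Duration::ZERO {
        // Compiled out when the crate is built with `--features testing`.
        #[cfg(not(feature = "testing"))]
        println!("automatic compaction is disabled");
        // check again in 10 seconds, in case it's been enabled again
        Duration::from_secs(10)
    } else {
        period
    }
}

fn main() {
    assert_eq!(sleep_duration(Duration::ZERO), Duration::from_secs(10));
}
```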

View File

@@ -38,7 +38,6 @@ use std::time::{Duration, Instant, SystemTime};
use crate::context::{
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
use crate::tenant::storage_layer::{
@@ -91,6 +90,7 @@ use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::debug_assert_current_span_has_tenant_and_timeline_id;
use super::remote_timeline_client::index::IndexPart;
use super::remote_timeline_client::RemoteTimelineClient;
use super::storage_layer::{
@@ -143,7 +143,6 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: Option<RemoteTimelineClient>,
pub deletion_queue_client: Option<DeletionQueueClient>,
}
pub struct Timeline {
@@ -155,7 +154,8 @@ pub struct Timeline {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
// The generation of the tenant that instantiated us: this is used for safety when writing remote objects
/// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
/// Never changes for the lifetime of this [`Timeline`] object.
generation: Generation,
pub pg_version: u32,
@@ -201,9 +201,6 @@ pub struct Timeline {
/// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
pub remote_client: Option<Arc<RemoteTimelineClient>>,
/// Deletion queue: a global queue, separate to the remote storage queue's
deletion_queue_client: Option<Arc<DeletionQueueClient>>,
// What page versions do we hold in the repository? If we get a
// request > last_record_lsn, we need to wait until we receive all
// the WAL up to the request. The SeqWait provides functions for
@@ -588,15 +585,7 @@ impl Timeline {
Err(e) => {
// don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
drop(_timer);
let walreceiver_status = {
match &*self.walreceiver.lock().unwrap() {
None => "stopping or stopped".to_string(),
Some(walreceiver) => match walreceiver.status() {
Some(status) => status.to_human_readable_string(),
None => "Not active".to_string(),
},
}
};
let walreceiver_status = self.walreceiver_status();
Err(anyhow::Error::new(e).context({
format!(
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
@@ -610,6 +599,16 @@ impl Timeline {
}
}
pub(crate) fn walreceiver_status(&self) -> String {
match &*self.walreceiver.lock().unwrap() {
None => "stopping or stopped".to_string(),
Some(walreceiver) => match walreceiver.status() {
Some(status) => status.to_human_readable_string(),
None => "Not active".to_string(),
},
}
}
/// Check that it is valid to request operations with that lsn.
pub fn check_lsn_is_in_scope(
&self,
@@ -937,6 +936,48 @@ impl Timeline {
self.launch_eviction_task(background_jobs_can_start);
}
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
debug_assert_current_span_has_tenant_and_timeline_id();
// prevent writes to the InMemoryLayer
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_id),
Some(self.timeline_id),
)
.await;
// now all writers to InMemory layer are gone, do the final flush if requested
if freeze_and_flush {
match self.freeze_and_flush().await {
Ok(()) => {}
Err(e) => {
warn!("failed to freeze and flush: {e:#}");
return; // TODO: should probably drain remote timeline client anyways?
}
}
// drain the upload queue
let res = if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, our shutdown process might not
// wait for remote uploads to complete at all, because new tasks can be
// spawned forever.
//
// what is problematic is shutting down the RemoteTimelineClient itself:
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.wait_completion().await
} else {
Ok(())
};
if let Err(e) = res {
warn!("failed to await for frozen and flushed uploads: {e:#}");
}
}
}
pub fn set_state(&self, new_state: TimelineState) {
match (self.current_state(), new_state) {
(equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
@@ -1271,18 +1312,6 @@ impl Timeline {
Ok(())
}
async fn delete_all_remote(&self) -> anyhow::Result<()> {
if let Some(remote_client) = &self.remote_client {
if let Some(deletion_queue_client) = &self.deletion_queue_client {
remote_client.delete_all(deletion_queue_client).await
} else {
Ok(())
}
} else {
Ok(())
}
}
}
#[derive(Debug, thiserror::Error)]
@@ -1437,7 +1466,6 @@ impl Timeline {
walreceiver: Mutex::new(None),
remote_client: resources.remote_client.map(Arc::new),
deletion_queue_client: resources.deletion_queue_client.map(Arc::new),
// initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
last_record_lsn: SeqWait::new(RecordLsn {
@@ -1698,11 +1726,18 @@ impl Timeline {
for (name, decision) in decided {
let decision = match decision {
Ok(UseRemote { local, remote }) => {
path.push(name.file_name());
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
path.pop();
UseRemote { local, remote }
// Remote is authoritative, but we may still choose to retain
// the local file if the contents appear to match
if local.file_size() == remote.file_size() {
// Use the local file, but take the remote metadata so that we pick up
// the correct generation.
UseLocal(remote)
} else {
path.push(name.file_name());
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
path.pop();
UseRemote { local, remote }
}
}
Ok(decision) => decision,
Err(FutureLayer { local }) => {
@@ -1781,15 +1816,11 @@ impl Timeline {
guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
if let Some(rtc) = self.remote_client.as_ref() {
// Deletion queue client is always Some if remote_client is Some
let deletion_queue_client = self.deletion_queue_client.as_ref().unwrap();
let (needs_upload, needs_cleanup) = to_sync;
for (layer, m) in needs_upload {
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
}
rtc.schedule_layer_file_deletion(&needs_cleanup, deletion_queue_client)
.await?;
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
rtc.schedule_index_upload_for_file_changes()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
@@ -2515,13 +2546,15 @@ impl Timeline {
///
async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
let mut guard = self.layers.write().await;
let layer = guard.get_layer_for_write(
lsn,
self.get_last_record_lsn(),
self.conf,
self.timeline_id,
self.tenant_id,
)?;
let layer = guard
.get_layer_for_write(
lsn,
self.get_last_record_lsn(),
self.conf,
self.timeline_id,
self.tenant_id,
)
.await?;
Ok(layer)
}
@@ -2725,9 +2758,7 @@ impl Timeline {
// update metrics
let sz = l.layer_desc().file_size;
self.metrics.resident_physical_size_gauge.add(sz);
self.metrics.num_persistent_files_created.inc_by(1);
self.metrics.persistent_bytes_written.inc_by(sz);
self.metrics.record_new_file_metrics(sz);
}
guard.finish_flush_l0_layer(delta_layer_to_add, &frozen_layer);
@@ -2756,6 +2787,7 @@ impl Timeline {
if disk_consistent_lsn != old_disk_consistent_lsn {
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload)
.await
.context("update_metadata_file")?;
// Also update the in-memory copy
self.disk_consistent_lsn.store(disk_consistent_lsn);
@@ -2764,7 +2796,7 @@ impl Timeline {
}
/// Update metadata file
fn update_metadata_file(
async fn update_metadata_file(
&self,
disk_consistent_lsn: Lsn,
layer_paths_to_upload: HashMap<LayerFileName, LayerFileMetadata>,
@@ -2805,14 +2837,9 @@ impl Timeline {
x.unwrap()
));
save_metadata(
self.conf,
&self.tenant_id,
&self.timeline_id,
&metadata,
false,
)
.context("save_metadata")?;
save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
.await
.context("save_metadata")?;
if let Some(remote_client) = &self.remote_client {
for (path, layer_metadata) in layer_paths_to_upload {
@@ -3015,7 +3042,8 @@ impl Timeline {
self.tenant_id,
&img_range,
lsn,
)?;
)
.await?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
Err(PageReconstructError::Other(anyhow::anyhow!(
@@ -3051,11 +3079,11 @@ impl Timeline {
}
}
};
image_layer_writer.put_image(key, &img)?;
image_layer_writer.put_image(key, &img).await?;
key = key.next();
}
}
let image_layer = image_layer_writer.finish()?;
let image_layer = image_layer_writer.finish().await?;
image_layers.push(image_layer);
}
}
@@ -3105,9 +3133,8 @@ impl Timeline {
LayerFileMetadata::new(metadata.len(), self.generation),
);
self.metrics
.resident_physical_size_gauge
.add(metadata.len());
// update metrics
self.metrics.record_new_file_metrics(metadata.len());
let l = Arc::new(l);
l.access_stats().record_residence_event(
LayerResidenceStatus::Resident,
@@ -3600,7 +3627,11 @@ impl Timeline {
{
// ... if so, flush previous layer and prepare to write new one
new_layers.push(Arc::new(
writer.take().unwrap().finish(prev_key.unwrap().next())?,
writer
.take()
.unwrap()
.finish(prev_key.unwrap().next())
.await?,
));
writer = None;
@@ -3615,20 +3646,23 @@ impl Timeline {
}
if writer.is_none() {
// Create writer if not initialized yet
writer = Some(DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_id,
key,
if dup_end_lsn.is_valid() {
// this is a layer containing slice of values of the same key
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
dup_start_lsn..dup_end_lsn
} else {
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
lsn_range.clone()
},
)?);
writer = Some(
DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_id,
key,
if dup_end_lsn.is_valid() {
// this is a layer containing slice of values of the same key
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
dup_start_lsn..dup_end_lsn
} else {
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
lsn_range.clone()
},
)
.await?,
);
}
fail_point!("delta-layer-writer-fail-before-finish", |_| {
@@ -3637,11 +3671,11 @@ impl Timeline {
)))
});
writer.as_mut().unwrap().put_value(key, lsn, value)?;
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
prev_key = Some(key);
}
if let Some(writer) = writer {
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next()).await?));
}
// Sync layers
@@ -3782,10 +3816,8 @@ impl Timeline {
)?;
}
// update the timeline's physical size
self.metrics
.resident_physical_size_gauge
.add(metadata.len());
// update metrics, including the timeline's physical size
self.metrics.record_new_file_metrics(metadata.len());
new_layer_paths.insert(
new_delta_path,
@@ -3830,13 +3862,7 @@ impl Timeline {
// Also schedule the deletions in remote storage
if let Some(remote_client) = &self.remote_client {
let deletion_queue = self
.deletion_queue_client
.as_ref()
.ok_or_else(|| anyhow::anyhow!("Remote storage enabled without deletion queue"))?;
remote_client
.schedule_layer_file_deletion(&layer_names_to_delete, deletion_queue)
.await?;
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
}
Ok(())
@@ -4149,7 +4175,8 @@ impl Timeline {
if !layers_to_remove.is_empty() {
// Persist the new GC cutoff value in the metadata file, before
// we actually remove anything.
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())
.await?;
// Actually delete the layers from disk and remove them from the map.
// (couldn't do this in the loop above, because you cannot modify a collection
@@ -4170,15 +4197,7 @@ impl Timeline {
}
if let Some(remote_client) = &self.remote_client {
// Remote metadata upload was scheduled in `update_metadata_file`: wait
// for completion before scheduling any deletions.
remote_client.wait_completion().await?;
let deletion_queue = self.deletion_queue_client.as_ref().ok_or_else(|| {
anyhow::anyhow!("Remote storage enabled without deletion queue")
})?;
remote_client
.schedule_layer_file_deletion(&layer_names_to_delete, deletion_queue)
.await?;
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
}
apply.flush();
@@ -4768,7 +4787,6 @@ mod tests {
use utils::{id::TimelineId, lsn::Lsn};
use crate::deletion_queue::mock::MockDeletionQueue;
use crate::tenant::{harness::TenantHarness, storage_layer::PersistentLayer};
use super::{EvictionError, Timeline};
@@ -4778,30 +4796,8 @@ mod tests {
let harness =
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
let remote_storage = {
// this is never used for anything, because of how the create_test_timeline works, but
// it is with us in spirit and a Some.
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
let path = harness.conf.workdir.join("localfs");
std::fs::create_dir_all(&path).unwrap();
let config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(path),
};
GenericRemoteStorage::from_config(&config).unwrap()
};
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()), harness.conf);
let ctx = any_context();
let tenant = harness
.try_load(
&ctx,
Some(remote_storage),
Some(deletion_queue.new_client()),
)
.await
.unwrap();
let tenant = harness.try_load(&ctx).await.unwrap();
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
@@ -4851,30 +4847,8 @@ mod tests {
async fn layer_eviction_aba_fails() {
let harness = TenantHarness::create("layer_eviction_aba_fails").unwrap();
let remote_storage = {
// this is never used for anything, because of how the create_test_timeline works, but
// it is with us in spirit and a Some.
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
let path = harness.conf.workdir.join("localfs");
std::fs::create_dir_all(&path).unwrap();
let config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(path),
};
GenericRemoteStorage::from_config(&config).unwrap()
};
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()), harness.conf);
let ctx = any_context();
let tenant = harness
.try_load(
&ctx,
Some(remote_storage),
Some(deletion_queue.new_client()),
)
.await
.unwrap();
let tenant = harness.try_load(&ctx).await.unwrap();
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await

View File

@@ -14,7 +14,6 @@ use utils::{
use crate::{
config::PageServerConf,
deletion_queue::DeletionQueueClient,
task_mgr::{self, TaskKind},
tenant::{
metadata::TimelineMetadata,
@@ -239,6 +238,15 @@ async fn delete_local_layer_files(
Ok(())
}
/// Removes remote layers and an index file after them.
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
if let Some(remote_client) = &timeline.remote_client {
remote_client.delete_all().await.context("delete_all")?
};
Ok(())
}
// This function removes remaining traces of a timeline on disk.
// Namely: metadata file, timeline directory, delete mark.
// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
@@ -399,7 +407,6 @@ impl DeleteTimelineFlow {
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: Option<RemoteTimelineClient>,
deletion_queue_client: Option<DeletionQueueClient>,
init_order: Option<&InitializationOrder>,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -409,10 +416,7 @@ impl DeleteTimelineFlow {
timeline_id,
local_metadata,
None, // Ancestor is not needed for deletion.
TimelineResources {
remote_client,
deletion_queue_client,
},
TimelineResources { remote_client },
init_order,
// Important. We don't pass ancestor above because it can be missing.
// Thus we need to skip the validation here.
@@ -555,7 +559,7 @@ impl DeleteTimelineFlow {
) -> Result<(), DeleteTimelineError> {
delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
timeline.delete_all_remote().await?;
delete_remote_layers_and_index(timeline).await?;
pausable_failpoint!("in_progress_delete");

View File

@@ -328,9 +328,24 @@ impl Timeline {
// Make one of the tenant's timelines draw the short straw and run the calculation.
// The others wait until the calculation is done so that they take into account the
// imitated accesses that the winner made.
let Ok(tenant) = crate::tenant::mgr::get_tenant(self.tenant_id, true).await else {
// likely, we're shutting down
return ControlFlow::Break(());
//
// It is critical we are responsive to cancellation here. Otherwise, we deadlock with
// tenant deletion (holds TENANTS in read mode) or any other task that attempts
// to acquire TENANTS in write mode before we call get_tenant here.
// See https://github.com/neondatabase/neon/issues/5284.
let res = tokio::select! {
_ = cancel.cancelled() => {
return ControlFlow::Break(());
}
res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => {
res
}
};
let tenant = match res {
Ok(t) => t,
Err(_) => {
return ControlFlow::Break(());
}
};
let mut state = tenant.eviction_task_tenant_state.lock().await;
match state.last_layer_access_imitation {
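
The fix above races `get_tenant` against the task's cancellation token so eviction never blocks tenant deletion. A stripped-down sketch of the same `tokio::select!` pattern, with stand-in types instead of the pageserver's (`slow_tenant_lookup` is hypothetical):

```
use tokio_util::sync::CancellationToken;

async fn lookup_or_give_up(cancel: &CancellationToken) -> Option<String> {
    tokio::select! {
        _ = cancel.cancelled() => None, // likely shutting down: bail out promptly
        tenant = slow_tenant_lookup() => Some(tenant),
    }
}

async fn slow_tenant_lookup() -> String {
    // stand-in for crate::tenant::mgr::get_tenant(tenant_id, true)
    tokio::time::sleep(std::time::Duration::from_millis(10)).await;
    "tenant".to_string()
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    assert!(lookup_or_give_up(&cancel).await.is_some());
    cancel.cancel();
    assert!(lookup_or_give_up(&cancel).await.is_none());
}
```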

View File

@@ -147,11 +147,7 @@ pub(super) fn reconcile(
Err(FutureLayer { local })
} else {
Ok(match (local, remote) {
(Some(local), Some(remote)) if local != remote => {
assert_eq!(local.generation, remote.generation);
UseRemote { local, remote }
}
(Some(local), Some(remote)) if local != remote => UseRemote { local, remote },
(Some(x), Some(_)) => UseLocal(x),
(None, Some(x)) => Evicted(x),
(Some(x), None) => NeedsUpload(x),

View File

@@ -87,7 +87,7 @@ impl LayerManager {
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
/// called within `get_layer_for_write`.
pub(crate) fn get_layer_for_write(
pub(crate) async fn get_layer_for_write(
&mut self,
lsn: Lsn,
last_record_lsn: Lsn,
@@ -129,7 +129,7 @@ impl LayerManager {
lsn
);
let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn)?;
let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn).await?;
let layer = Arc::new(new_layer);
self.layer_map.open_layer = Some(layer.clone());

View File

@@ -135,7 +135,7 @@ impl WalReceiver {
.await;
}
pub(super) fn status(&self) -> Option<ConnectionManagerStatus> {
pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
self.manager_status.read().unwrap().clone()
}
}

View File

@@ -1,4 +1,7 @@
use crate::metrics::RemoteOpFileKind;
use super::storage_layer::LayerFileName;
use super::Generation;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
@@ -60,6 +63,7 @@ pub(crate) struct UploadQueueInitialized {
// Breakdown of different kinds of tasks currently in-progress
pub(crate) num_inprogress_layer_uploads: usize,
pub(crate) num_inprogress_metadata_uploads: usize,
pub(crate) num_inprogress_deletions: usize,
/// Tasks that are currently in-progress. In-progress means that a tokio Task
/// has been launched for it. An in-progress task can be busy uploading, but it can
@@ -117,6 +121,7 @@ impl UploadQueue {
task_counter: 0,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::new(),
queued_operations: VecDeque::new(),
};
@@ -158,6 +163,7 @@ impl UploadQueue {
task_counter: 0,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::new(),
queued_operations: VecDeque::new(),
};
@@ -195,6 +201,14 @@ pub(crate) struct UploadTask {
pub(crate) op: UploadOp,
}
#[derive(Debug)]
pub(crate) struct Delete {
pub(crate) file_kind: RemoteOpFileKind,
pub(crate) layer_file_name: LayerFileName,
pub(crate) scheduled_from_timeline_delete: bool,
pub(crate) generation: Generation,
}
#[derive(Debug)]
pub(crate) enum UploadOp {
/// Upload a layer file
@@ -203,6 +217,9 @@ pub(crate) enum UploadOp {
/// Upload the metadata file
UploadMetadata(IndexPart, Lsn),
/// Delete a layer file
Delete(Delete),
/// Barrier. When the barrier operation is reached,
Barrier(tokio::sync::watch::Sender<()>),
}
@@ -213,14 +230,22 @@ impl std::fmt::Display for UploadOp {
UploadOp::UploadLayer(path, metadata) => {
write!(
f,
"UploadLayer({}, size={:?})",
"UploadLayer({}, size={:?}, gen={:?})",
path.file_name(),
metadata.file_size()
metadata.file_size(),
metadata.generation,
)
}
UploadOp::UploadMetadata(_, lsn) => {
write!(f, "UploadMetadata(lsn: {})", lsn)
}
UploadOp::Delete(delete) => write!(
f,
"Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})",
delete.layer_file_name.file_name(),
delete.scheduled_from_timeline_delete,
delete.generation
),
UploadOp::Barrier(_) => write!(f, "Barrier"),
}
}

View File

@@ -1 +0,0 @@
-bash: scripts/pytest: No such file or directory

Some files were not shown because too many files have changed in this diff.