test: rename previous test, cleanup, still does not work

fix: provide better context for the other test
test: actually duplicate L1 layer in test
2026-05-04 14:50:38 +00:00 · 2023-08-30 14:19:54 +03:00 · 2023-08-30 14:19:54 +03:00 · 2023-08-30 14:19:54 +03:00 · 2023-08-30 14:19:54 +03:00 · 2023-08-30 10:31:56 +03:00
224 changed files with 6076 additions and 16237 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -14,12 +14,10 @@
 !pgxn/
 !proxy/
 !safekeeper/
-!s3_scrubber/
 !storage_broker/
 !trace/
 !vendor/postgres-v14/
 !vendor/postgres-v15/
-!vendor/postgres-v16/
 !workspace_hack/
 !neon_local/
 !scripts/ninstall.sh
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -1,8 +0,0 @@
-self-hosted-runner:
-  labels:
-    - gen3
-    - large
-    - small
-    - us-east-2
-config-variables:
-  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -70,9 +70,6 @@ runs:
        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
        path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
        prefix: latest
-        # The lack of compatibility snapshot (for example, for the new Postgres version)
-        # shouldn't fail the whole job. Only relevant test should fail.
-        skip-if-does-not-exist: true

    - name: Checkout
      if: inputs.needs_postgres_source == 'true'
@@ -148,11 +145,7 @@ runs:

        if [ "${RERUN_FLAKY}" == "true" ]; then
          mkdir -p $TEST_OUTPUT
-          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" \
-                                              --days 7 \
-                                              --output "$TEST_OUTPUT/flaky.json" \
-                                              --pg-version "${DEFAULT_PG_VERSION}" \
-                                              --build-type "${BUILD_TYPE}"
+          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"

          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
        fi
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -1,31 +0,0 @@
-name: Lint GitHub Workflows
-
-on:
-  push:
-    branches:
-      - main
-      - release
-    paths:
-      - '.github/workflows/*.ya?ml'
-  pull_request:
-    paths:
-      - '.github/workflows/*.ya?ml'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
-
-jobs:
-  actionlint:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: reviewdog/action-actionlint@v1
-        env:
-          # SC2046 - Quote this to prevent word splitting. - https://www.shellcheck.net/wiki/SC2046
-          # SC2086 - Double quote to prevent globbing and word splitting. - https://www.shellcheck.net/wiki/SC2086
-          SHELLCHECK_OPTS: --exclude=SC2046,SC2086
-        with:
-          fail_on_error: true
-          filter_mode: nofilter
-          level: error
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -2,9 +2,7 @@ name: Handle `approved-for-ci-run` label
 # This workflow helps to run CI pipeline for PRs made by external contributors (from forks).

 on:
-  pull_request_target:
-    branches:
-      - main
+  pull_request:
    types:
      # Default types that triggers a workflow ([1]):
      # - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
@@ -19,83 +17,39 @@ on:
 env:
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  PR_NUMBER: ${{ github.event.pull_request.number }}
-  BRANCH: "ci-run/pr-${{ github.event.pull_request.number }}"
-
-permissions: write-all
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}

 jobs:
  remove-label:
    # Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
    # The PR should be reviewed and labelled manually again.

+    runs-on: [ ubuntu-latest ]
+
    if: |
      contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')

-    runs-on: ubuntu-latest
-
    steps:
      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"

-  create-or-update-pr-for-ci-run:
-    # Create local PR for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
+  create-branch:
+    # Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
+
+    runs-on: [ ubuntu-latest ]

    if: |
      github.event.action == 'labeled' &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')

-    runs-on: ubuntu-latest
-
    steps:
      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"

      - uses: actions/checkout@v3
        with:
          ref: main
-          token: ${{ secrets.CI_ACCESS_TOKEN }}

      - run: gh pr checkout "${PR_NUMBER}"

-      - run: git checkout -b "${BRANCH}"
+      - run: git checkout -b "ci-run/pr-${PR_NUMBER}"

-      - run: git push --force origin "${BRANCH}"
-
-      - name: Create a Pull Request for CI run (if required)
-        env:
-          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-        run: |
-          cat << EOF > body.md
-            This Pull Request is created automatically to run the CI pipeline for #${PR_NUMBER}
-
-            Please do not alter or merge/close it.
-
-            Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
-          EOF
-
-          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${HEAD} --base main --json number --jq '.[].number')"
-          if [ -z "${ALREADY_CREATED}" ]; then
-            gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
-                                                       --body-file "body.md" \
-                                                       --head "${BRANCH}" \
-                                                       --base "main" \
-                                                       --draft
-          fi
-
-  cleanup:
-    # Close PRs and delete branchs if the original PR is closed.
-
-    if: |
-      github.event.action == 'closed' &&
-      github.event.pull_request.head.repo.full_name != github.repository
-
-    runs-on: ubuntu-latest
-
-    steps:
-      - run: |
-          CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${HEAD} --json 'closed' --jq '.[].closed')"
-          if [ "${CLOSED}" == "false" ]; then
-            gh pr --repo "${GITHUB_REPOSITORY}" close "${BRANCH}" --delete-branch
-          fi
+      - run: git push --force origin "ci-run/pr-${PR_NUMBER}"
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -117,7 +117,6 @@ jobs:
    outputs:
      pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
      olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
-      tpch-compare-matrix: ${{ steps.tpch-compare-matrix.outputs.matrix }}

    steps:
    - name: Generate matrix for pgbench benchmark
@@ -137,11 +136,11 @@ jobs:
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
+          matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
                                                   { "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi

-        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
+        echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT

    - name: Generate matrix for OLAP benchmarks
      id: olap-compare-matrix
@@ -153,30 +152,11 @@ jobs:
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
+          matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres" },
                                                   { "platform": "rds-aurora"   }]')
        fi

-        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
-
-    - name: Generate matrix for TPC-H benchmarks
-      id: tpch-compare-matrix
-      run: |
-        matrix='{
-          "platform": [
-            "neon-captest-reuse"
-          ],
-          "scale": [
-            "10"
-          ]
-        }'
-
-        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
-                                                   { "platform": "rds-aurora",   "scale": "10" }]')
-        fi
-
-        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
+        echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT

  pgbench-compare:
    needs: [ generate-matrices ]
@@ -253,11 +233,7 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
-        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
-        fi
-        psql ${CONNSTR} -c "${QUERY}"
+        psql ${CONNSTR} -c "SELECT version();"

    - name: Benchmark init
      uses: ./.github/actions/run-python-test-set
@@ -382,11 +358,7 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
-        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
-        fi
-        psql ${CONNSTR} -c "${QUERY}"
+        psql ${CONNSTR} -c "SELECT version();"

    - name: ClickBench benchmark
      uses: ./.github/actions/run-python-test-set
@@ -400,7 +372,6 @@ jobs:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
-        TEST_OLAP_SCALE: 10

    - name: Create Allure report
      if: ${{ !cancelled() }}
@@ -427,7 +398,7 @@ jobs:

    strategy:
      fail-fast: false
-      matrix: ${{ fromJson(needs.generate-matrices.outputs.tpch-compare-matrix) }}
+      matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}

    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
@@ -436,7 +407,6 @@ jobs:
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.platform }}
-      TEST_OLAP_SCALE: ${{ matrix.scale }}

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
@@ -458,17 +428,18 @@ jobs:
        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH

-    - name: Get Connstring Secret Name
+    - name: Set up Connection String
+      id: set-up-connstr
      run: |
        case "${PLATFORM}" in
          neon-captest-reuse)
-            ENV_PLATFORM=CAPTEST_TPCH
+            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }}
            ;;
          rds-aurora)
-            ENV_PLATFORM=RDS_AURORA_TPCH
+            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR }}
            ;;
          rds-postgres)
-            ENV_PLATFORM=RDS_AURORA_TPCH
+            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }}
            ;;
          *)
            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
@@ -476,21 +447,9 @@ jobs:
            ;;
        esac

-        CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR"
-        echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV
-
-    - name: Set up Connection String
-      id: set-up-connstr
-      run: |
-        CONNSTR=${{ secrets[env.CONNSTR_SECRET_NAME] }}
-
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
-        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
-        fi
-        psql ${CONNSTR} -c "${QUERY}"
+        psql ${CONNSTR} -c "SELECT version();"

    - name: Run TPC-H benchmark
      uses: ./.github/actions/run-python-test-set
@@ -504,7 +463,6 @@ jobs:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
-        TEST_OLAP_SCALE: ${{ matrix.scale }}

    - name: Create Allure report
      if: ${{ !cancelled() }}
@@ -576,11 +534,7 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
-        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
-        fi
-        psql ${CONNSTR} -c "${QUERY}"
+        psql ${CONNSTR} -c "SELECT version();"

    - name: Run user examples
      uses: ./.github/actions/run-python-test-set
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -5,6 +5,7 @@ on:
    branches:
      - main
      - release
+      - ci-run/pr-*
  pull_request:

 defaults:
@@ -23,30 +24,7 @@ env:
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

 jobs:
-  check-permissions:
-    runs-on: ubuntu-latest
-
-    steps:
-    - name: Disallow PRs from forks
-      if: |
-        github.event_name == 'pull_request' &&
-        github.event.pull_request.head.repo.full_name != github.repository
-
-      run: |
-        if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
-          MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
-        else
-          MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
-        fi
-
-        echo >&2 "We don't run CI for PRs from forks"
-        echo >&2 "${MESSAGE}"
-
-        exit 1
-
-
  tag:
-    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    outputs:
@@ -75,7 +53,6 @@ jobs:
        id: build-tag

  check-codestyle-python:
-    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -108,7 +85,6 @@ jobs:
        run: poetry run mypy .

  check-codestyle-rust:
-    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -175,7 +151,6 @@ jobs:
        run: cargo deny check

  build-neon:
-    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -212,7 +187,7 @@ jobs:
          # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603

          FAILED=false
-          for postgres in postgres-v14 postgres-v15 postgres-v16; do
+          for postgres in postgres-v14 postgres-v15; do
            expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
            actual=$(git rev-parse "HEAD:vendor/${postgres}")
            if [ "${expected}" != "${actual}" ]; then
@@ -234,10 +209,6 @@ jobs:
        id: pg_v15_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT

-      - name: Set pg 16 revision for caching
-        id: pg_v16_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
-
      # Set some environment variables used by all the steps.
      #
      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
@@ -258,12 +229,10 @@ jobs:
            cov_prefix=""
            CARGO_FLAGS="--locked --release"
          fi
-          {
-            echo "cov_prefix=${cov_prefix}"
-            echo "CARGO_FEATURES=${CARGO_FEATURES}"
-            echo "CARGO_FLAGS=${CARGO_FLAGS}"
-            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
-          } >> $GITHUB_ENV
+          echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
+          echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
+          echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
+          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV

      # Disabled for now
      # Don't include the ~/.cargo/registry/src directory. It contains just
@@ -298,13 +267,6 @@ jobs:
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

-      - name: Cache postgres v16 build
-        id: cache_pg_16
-        uses: actions/cache@v3
-        with:
-          path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-
      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
        run: mold -run make postgres-v14 -j$(nproc)
@@ -313,10 +275,6 @@ jobs:
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
        run: mold -run make postgres-v15 -j$(nproc)

-      - name: Build postgres v16
-        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v16 -j$(nproc)
-
      - name: Build neon extensions
        run: mold -run make neon-pg-ext -j$(nproc)

@@ -390,17 +348,17 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    needs: [ check-permissions, build-neon ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
+    needs: [ build-neon ]
    strategy:
      fail-fast: false
      matrix:
        build_type: [ debug, release ]
-        pg_version: [ v14, v15, v16 ]
+        pg_version: [ v14, v15 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -428,12 +386,12 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  benchmarks:
-    needs: [ check-permissions, build-neon ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
+    needs: [ build-neon ]
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    strategy:
      fail-fast: false
@@ -460,13 +418,12 @@ jobs:
      # while coverage is currently collected for the debug ones

  create-test-report:
-    needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
-    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
-
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
+    needs: [ regress-tests, benchmarks ]
+    if: ${{ !cancelled() }}

    steps:
      - uses: actions/checkout@v3
@@ -492,40 +449,42 @@ jobs:
              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
            }

-            const coverage = {
-              coverageUrl: "${{ needs.coverage-report.outputs.coverage-html }}",
-              summaryJsonUrl: "${{ needs.coverage-report.outputs.coverage-json }}",
-            }
-
            const script = require("./scripts/comment-test-report.js")
            await script({
              github,
              context,
              fetch,
              report,
-              coverage,
            })

  coverage-report:
-    needs: [ check-permissions, regress-tests ]
-
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
+    needs: [ regress-tests ]
    strategy:
      fail-fast: false
      matrix:
        build_type: [ debug ]
-    outputs:
-        coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
-        coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
-          fetch-depth: 0
+          fetch-depth: 1
+
+#      Disabled for now
+#      - name: Restore cargo deps cache
+#        id: cache_cargo
+#        uses: actions/cache@v3
+#        with:
+#          path: |
+#            ~/.cargo/registry/
+#            !~/.cargo/registry/src
+#            ~/.cargo/git/
+#            target/
+#          key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}

      - name: Get Neon artifact
        uses: ./.github/actions/download
@@ -568,45 +527,13 @@ jobs:
          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT

-      - name: Build coverage report NEW
-        id: upload-coverage-report-new
-        env:
-          BUCKET: neon-github-public-dev
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-        run: |
-          BASELINE="$(git merge-base HEAD origin/main)"
-          CURRENT="${COMMIT_SHA}"
-
-          cp /tmp/coverage/report/lcov.info ./${CURRENT}.info
-
-          GENHTML_ARGS="--ignore-errors path,unmapped,empty --synthesize-missing --demangle-cpp rustfilt --output-directory lcov-html ${CURRENT}.info"
-
-          # Use differential coverage if the baseline coverage exists.
-          # It can be missing if the coverage repoer wasn't uploaded yet or tests has failed on BASELINE commit.
-          if aws s3 cp --only-show-errors s3://${BUCKET}/code-coverage/${BASELINE}/lcov.info ./${BASELINE}.info; then
-            git diff ${BASELINE} ${CURRENT} -- '*.rs' > baseline-current.diff
-
-            GENHTML_ARGS="--baseline-file ${BASELINE}.info --diff-file baseline-current.diff ${GENHTML_ARGS}"
-          fi
-
-          genhtml ${GENHTML_ARGS}
-
-          aws s3 cp --only-show-errors --recursive ./lcov-html/ s3://${BUCKET}/code-coverage/${COMMIT_SHA}/lcov
-
-          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/index.html
-          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
-
-          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
-          echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
-
      - uses: actions/github-script@v6
        env:
          REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
-          REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        with:
          script: |
-            const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env
+            const { REPORT_URL, COMMIT_SHA } = process.env

            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner,
@@ -617,21 +544,12 @@ jobs:
              context: 'Code coverage report',
            })

-            await github.rest.repos.createCommitStatus({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              sha: `${COMMIT_SHA}`,
-              state: 'success',
-              target_url: `${REPORT_URL_NEW}`,
-              context: 'Code coverage report NEW',
-            })
-
  trigger-e2e-tests:
-    needs: [ check-permissions, promote-images, tag ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
+    needs: [ promote-images, tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
@@ -672,8 +590,8 @@ jobs:
            }"

  neon-image:
-    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
+    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
@@ -720,7 +638,7 @@ jobs:

  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
-    needs: [ check-permissions, tag ]
+    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
@@ -765,17 +683,17 @@ jobs:
        run: rm -rf ~/.ecr

  compute-node-image:
-    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: gcr.io/kaniko-project/executor:v1.9.2-debug
      # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
      # Should be prevented by https://github.com/neondatabase/neon/issues/4281
      options: --add-host=download.osgeo.org:140.211.15.30
+    needs: [ tag ]
    strategy:
      fail-fast: false
      matrix:
-        version: [ v14, v15, v16 ]
+        version: [ v14, v15 ]
    defaults:
      run:
        shell: sh -eu {0}
@@ -824,12 +742,12 @@ jobs:
        run: rm -rf ~/.ecr

  vm-compute-node-image:
-    needs: [ check-permissions, tag, compute-node-image ]
    runs-on: [ self-hosted, gen3, large ]
+    needs: [ tag, compute-node-image ]
    strategy:
      fail-fast: false
      matrix:
-        version: [ v14, v15, v16 ]
+        version: [ v14, v15 ]
    defaults:
      run:
        shell: sh -eu {0}
@@ -866,7 +784,7 @@ jobs:
          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

  test-images:
-    needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
+    needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
    runs-on: [ self-hosted, gen3, small ]

    steps:
@@ -909,8 +827,8 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml down

  promote-images:
-    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: [ self-hosted, gen3, small ]
+    needs: [ tag, test-images, vm-compute-node-image ]
    container: golang:1.19-bullseye
    # Don't add if-condition here.
    # The job should always be run because we have dependant other jobs that shouldn't be skipped
@@ -930,7 +848,6 @@ jobs:
        run: |
          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
-          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16

      - name: Add latest tag to images
        if: |
@@ -943,8 +860,6 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest

      - name: Push images to production ECR
        if: |
@@ -957,8 +872,6 @@ jobs:
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest

      - name: Configure Docker Hub login
        run: |
@@ -970,7 +883,6 @@ jobs:
        run: |
          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
-          crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}

      - name: Push latest tags to Docker Hub
        if: |
@@ -983,19 +895,21 @@ jobs:
          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest

      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

-  trigger-custom-extensions-build-and-wait:
-    needs: [ check-permissions, tag ]
-    runs-on: ubuntu-latest
+  build-private-extensions:
+    runs-on: [ self-hosted, gen3, small ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+      options: --init
+    needs: [ tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
-          COMMIT_SHA=${{ github.event.pull_request.head.sha || github.sha }}
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
          REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"

          curl -f -X POST \
@@ -1025,50 +939,11 @@ jobs:
              }
            }"

-      - name: Wait for extension build to finish
-        env:
-          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-        run: |
-          TIMEOUT=1800 # 30 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer
-          INTERVAL=15 # try each N seconds
-
-          last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context
-
-          for ((i=0; i <= TIMEOUT; i+=INTERVAL)); do
-            sleep $INTERVAL
-
-            # Get statuses for the latest commit in the PR / branch
-            gh api \
-              -H "Accept: application/vnd.github+json" \
-              -H "X-GitHub-Api-Version: 2022-11-28" \
-              "/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha || github.sha }}" > statuses.json
-
-            # Get the latest status for the "build-and-upload-extensions" context
-            last_status=$(jq --raw-output '[.[] | select(.context == "build-and-upload-extensions")] | sort_by(.created_at)[-1].state' statuses.json)
-            if [ "${last_status}" = "pending" ]; then
-              # Extension build is still in progress.
-              continue
-            elif [ "${last_status}" = "success" ]; then
-              # Extension build is successful.
-              exit 0
-            else
-              # Status is neither "pending" nor "success", exit the loop and fail the job.
-              break
-            fi
-          done
-
-          # Extension build failed, print `statuses.json` for debugging and fail the job.
-          jq '.' statuses.json
-
-          echo >&2 "Status of extension build is '${last_status}' != 'success'"
-          exit 1
-
  deploy:
-    needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
-    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
-
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
+    needs: [ promote-images, tag, regress-tests ]
+    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
        run: |
@@ -1106,35 +981,20 @@ jobs:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
          script: |
-            await github.rest.git.createRef({
+            github.rest.git.createRef({
              owner: context.repo.owner,
              repo: context.repo.repo,
              ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
              sha: context.sha,
            })

-      - name: Create GitHub release
-        if: github.ref_name == 'release'
-        uses: actions/github-script@v6
-        with:
-          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-          retries: 5
-          script: |
-            await github.rest.repos.createRelease({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              tag_name: "${{ needs.tag.outputs.build-tag }}",
-              generate_release_notes: true,
-            })
-
  promote-compatibility-data:
-    needs: [ check-permissions, promote-images, tag, regress-tests ]
-    if: github.ref_name == 'release'
-
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
+    needs: [ promote-images, tag, regress-tests ]
+    if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
    steps:
      - name: Promote compatibility snapshot for the release
        env:
@@ -1142,7 +1002,7 @@ jobs:
          PREFIX: artifacts/latest
        run: |
          # Update compatibility snapshot for the release
-          for pg_version in v14 v15 v16; do
+          for pg_version in v14 v15; do
            for build_type in debug release; do
              OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
              NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -4,6 +4,7 @@ on:
  push:
    branches:
      - main
+      - ci-run/pr-*
  pull_request:

 defaults:
@@ -38,7 +39,7 @@ jobs:
          fetch-depth: 1

      - name: Install macOS postgres dependencies
-        run: brew install flex bison openssl protobuf icu4c pkg-config
+        run: brew install flex bison openssl protobuf

      - name: Set pg 14 revision for caching
        id: pg_v14_rev
@@ -48,10 +49,6 @@ jobs:
        id: pg_v15_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT

-      - name: Set pg 16 revision for caching
-        id: pg_v16_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
-
      - name: Cache postgres v14 build
        id: cache_pg_14
        uses: actions/cache@v3
@@ -66,13 +63,6 @@ jobs:
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

-      - name: Cache postgres v16 build
-        id: cache_pg_16
-        uses: actions/cache@v3
-        with:
-          path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-
      - name: Set extra env for macOS
        run: |
          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
@@ -96,10 +86,6 @@ jobs:
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
        run: make postgres-v15 -j$(nproc)

-      - name: Build postgres v16
-        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: make postgres-v16 -j$(nproc)
-
      - name: Build neon extensions
        run: make neon-pg-ext -j$(nproc)

--- a/.github/workflows/release-notify.yml
+++ b/.github/workflows/release-notify.yml
@@ -1,29 +0,0 @@
-name: Notify Slack channel about upcoming release
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.number }}
-  cancel-in-progress: true
-
-on:
-  pull_request:
-    branches:
-      - release
-    types:
-      # Default types that triggers a workflow:
-      # - https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
-      - opened
-      - synchronize
-      - reopened
-      # Additional types that we want to handle:
-      - closed
-
-jobs:
-  notify:
-    runs-on: [ ubuntu-latest ]
-
-    steps:
-      - uses: neondatabase/dev-actions/release-pr-notify@main
-        with:
-          slack-token: ${{ secrets.SLACK_BOT_TOKEN }}
-          slack-channel-id: ${{ vars.SLACK_UPCOMING_RELEASE_CHANNEL_ID || 'C05QQ9J1BRC' }} # if not set, then `#test-release-notifications`
-          github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,19 +2,16 @@ name: Create Release Branch

 on:
  schedule:
-    - cron: '0 7 * * 2'
+    - cron: '0 10 * * 2'
  workflow_dispatch:

 jobs:
  create_release_branch:
-    runs-on: [ ubuntu-latest ]
-
-    permissions:
-      contents: write # for `git push`
+    runs-on: [ubuntu-latest]

    steps:
    - name: Check out code
-      uses: actions/checkout@v4
+      uses: actions/checkout@v3
      with:
        ref: main

@@ -29,16 +26,9 @@ jobs:
      run: git push origin releases/${{ steps.date.outputs.date }}

    - name: Create pull request into release
-      env:
-        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-      run: |
-        cat << EOF > body.md
-          ## Release ${{ steps.date.outputs.date }}
-
-          **Please merge this PR using 'Create a merge commit'!**
-        EOF
-
-        gh pr create --title "Release ${{ steps.date.outputs.date }}" \
-                     --body-file "body.md" \
-                     --head "releases/${{ steps.date.outputs.date }}" \
-                     --base "release"
+      uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0
+      with:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        head: releases/${{ steps.date.outputs.date }}
+        base: release
+        title: Release ${{ steps.date.outputs.date }}
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,7 +6,3 @@
 	path = vendor/postgres-v15
 	url = https://github.com/neondatabase/postgres.git
 	branch = REL_15_STABLE_neon
-[submodule "vendor/postgres-v16"]
-	path = vendor/postgres-v16
-	url = https://github.com/neondatabase/postgres.git
-	branch = REL_16_STABLE_neon
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -27,28 +27,3 @@ your patch's fault. Help to fix the root cause if something else has
 broken the CI, before pushing.

 *Happy Hacking!*
-
-# How to run a CI pipeline on Pull Requests from external contributors
-_An instruction for maintainers_
-
-## TL;DR:
- Review the PR
- If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then:
-    - Press the "Approve and run" button in GitHub UI
-    - Add the `approved-for-ci-run` label to the PR
-
-Repeat all steps after any change to the PR.
- When the changes are ready to get merged — merge the original PR (not the internal one)
-
-## Longer version:
-
-GitHub Actions triggered by the `pull_request` event don't share repository secrets with the forks (for security reasons).
-So, passing the CI pipeline on Pull Requests from external contributors is impossible.
-
-We're using the following approach to make it work:
- After the review, assign the `approved-for-ci-run` label to the PR if changes look safe
- A GitHub Action will create an internal branch and a new PR with the same changes (for example, for a PR `#1234`, it'll be a branch `ci-run/pr-1234`)
- Because the PR is created from the internal branch, it is able to access repository secrets (that's why it's crucial to make sure that the PR doesn't contain any malicious code that could expose our secrets or intentionally harm the CI)
- The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
-
-For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,4 @@
 [workspace]
-resolver = "2"
 members = [
    "compute_tools",
    "control_plane",
@@ -8,7 +7,6 @@ members = [
    "proxy",
    "safekeeper",
    "storage_broker",
-    "s3_scrubber",
    "workspace_hack",
    "trace",
    "libs/compute_api",
@@ -39,11 +37,11 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "0.56", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.29"
-aws-smithy-http = "0.56"
-aws-credential-types = "0.56"
-aws-types = "0.56"
+aws-config = { version = "0.55", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "0.27"
+aws-smithy-http = "0.55"
+aws-credential-types = "0.55"
+aws-types = "0.55"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -107,12 +105,12 @@ reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
 rpds = "0.13"
-rustls = "0.21"
+rustls = "0.20"
 rustls-pemfile = "1"
 rustls-split = "0.3"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
-sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
+sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
@@ -127,11 +125,11 @@ sync_wrapper = "0.1.2"
 tar = "0.4"
 test-context = "0.1"
 thiserror = "1.0"
-tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
+tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
 tokio = { version = "1.17", features = ["macros"] }
 tokio-io-timeout = "1.2.0"
-tokio-postgres-rustls = "0.10.0"
-tokio-rustls = "0.24"
+tokio-postgres-rustls = "0.9.0"
+tokio-rustls = "0.23"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
 tokio-util = { version = "0.7", features = ["io"] }
@@ -145,7 +143,7 @@ tracing-subscriber = { version = "0.3", default_features = false, features = ["s
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
-webpki-roots = "0.25"
+webpki-roots = "0.23"
 x509-parser = "0.15"

 ## TODO replace this with tracing
@@ -184,8 +182,8 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }

 ## Build dependencies
 criterion = "0.5.1"
-rcgen = "0.11"
-rstest = "0.18"
+rcgen = "0.10"
+rstest = "0.17"
 tempfile = "3.4"
 tonic-build = "0.9"

--- a/4
+++ b/4
@@ -12,7 +12,6 @@ WORKDIR /home/nonroot

 COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
-COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
 COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
@@ -40,7 +39,6 @@ ARG CACHEPOT_BUCKET=neon-github-dev

 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
-COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
 COPY --chown=nonroot . .

 # Show build caching stats to check if it was used in the end.
@@ -67,7 +65,6 @@ RUN set -e \
    && apt install -y \
        libreadline-dev \
        libseccomp-dev \
-        libicu67 \
        openssl \
        ca-certificates \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
@@ -84,7 +81,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
-COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
 COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/

 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -74,8 +74,8 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar

 ENV PATH "/usr/local/pgsql/bin:$PATH"

-RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
-    echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
+RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
+    echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
@@ -124,21 +124,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
    apt install -y ninja-build python3-dev libncurses5 binutils clang

-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        export PLV8_VERSION=3.1.5 \
-        export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
-        ;; \
-      "v16") \
-        export PLV8_VERSION=3.1.8 \
-        export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
-        ;; \
-      *) \
-        echo "Export the valid PG_VERSION variable" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
-    echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \
+    echo "1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 plv8.tar.gz" | sha256sum --check && \
    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -185,8 +172,8 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz
    cp -R /h3/usr / && \
    rm -rf build

-RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
-    echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \
+    echo "c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 h3-pg.tar.gz" | sha256sum --check && \
    mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -224,8 +211,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
-    echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
+    echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -256,8 +243,8 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214
 FROM build-deps AS hypopg-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
-    echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \
+    echo "e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e hypopg.tar.gz" | sha256sum --check && \
    mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -320,8 +307,8 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta
 FROM build-deps AS ip4r-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
-    echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O ip4r.tar.gz && \
+    echo "78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 ip4r.tar.gz" | sha256sum --check && \
    mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -336,8 +323,8 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i
 FROM build-deps AS prefix-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
-    echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \
+    echo "38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e prefix.tar.gz" | sha256sum --check && \
    mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -352,8 +339,8 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p
 FROM build-deps AS hll-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
-    echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \
+    echo "9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 hll.tar.gz" | sha256sum --check && \
    mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -368,8 +355,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
 FROM build-deps AS plpgsql-check-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \
-    echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \
+    echo "9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b plpgsql_check.tar.gz" | sha256sum --check && \
    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -384,21 +371,12 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz
 FROM build-deps AS timescaledb-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ARG PG_VERSION
 ENV PATH "/usr/local/pgsql/bin:$PATH"

-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        export TIMESCALEDB_VERSION=2.10.1 \
-        export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
-        ;; \
-      *) \
-        echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
-    esac && \
-    apt-get update && \
+RUN apt-get update && \
    apt-get install -y cmake && \
-    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
-    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
+    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
+    echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
@@ -427,10 +405,6 @@ RUN case "${PG_VERSION}" in \
        export PG_HINT_PLAN_VERSION=15_1_5_0 \
        export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \
        ;; \
-      "v16") \
-        export PG_HINT_PLAN_VERSION=16_1_6_0 \
-        export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
-        ;; \
      *) \
        echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
        ;; \
@@ -478,8 +452,8 @@ FROM build-deps AS pg-cron-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
-    echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O pg_cron.tar.gz && \
+    echo "6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 pg_cron.tar.gz" | sha256sum --check && \
    mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -505,8 +479,8 @@ RUN apt-get update && \
        libfreetype6-dev

 ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
-RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
-    echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
+    echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
    mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
    cmake \
        -D RDK_BUILD_CAIRO_SUPPORT=OFF \
@@ -577,16 +551,8 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        export PG_EMBEDDING_VERSION=0.3.5 \
-        export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
-        ;; \
-      *) \
-        echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \
-    esac && \
-    wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
-    echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
+    echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -618,10 +584,6 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
 # Layer "rust extensions"
 # This layer is used to build `pgx` deps
 #
-# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
-# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
-# dependency on all the rust extension that depend on it, too.
-#
 #########################################################################################
 FROM build-deps AS rust-extensions-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -636,17 +598,7 @@ USER nonroot
 WORKDIR /home/nonroot
 ARG PG_VERSION

-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \
-        ;; \
-    esac && \
-    curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
    rm rustup-init && \
@@ -663,21 +615,10 @@ USER root
 #########################################################################################

 FROM rust-extensions-build AS pg-jsonschema-pg-build
-ARG PG_VERSION

 # caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
 # there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
+RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
    echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -692,23 +633,12 @@ RUN case "${PG_VERSION}" in \
 #########################################################################################

 FROM rust-extensions-build AS pg-graphql-pg-build
-ARG PG_VERSION

 # b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
 # Currently pgx version bump to >= 0.7.2  causes "call to unsafe function" compliation errors in
 # pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
 # same 1.1 version we've used before.
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
+RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
    echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
    mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -726,20 +656,9 @@ RUN case "${PG_VERSION}" in \
 #########################################################################################

 FROM rust-extensions-build AS pg-tiktoken-pg-build
-ARG PG_VERSION

 # 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
+RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
    echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
    cargo pgx install --release && \
@@ -753,19 +672,8 @@ RUN case "${PG_VERSION}" in \
 #########################################################################################

 FROM rust-extensions-build AS pg-pgx-ulid-build
-ARG PG_VERSION

-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
+RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -818,20 +726,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/neon_utils \
        -s install && \
-    make -j $(getconf _NPROCESSORS_ONLN) \
-        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-        -C pgxn/neon_rmgr \
-        -s install && \
-    case "${PG_VERSION}" in \
-        "v14" | "v15") \
-        ;; \
-        "v16") \
-            echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
-        ;; \
-        *) \
-            echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-        esac && \
    make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/hnsw \
--- a/40
+++ b/40
@@ -29,7 +29,6 @@ else ifeq ($(UNAME_S),Darwin)
 	# It can be configured with OPENSSL_PREFIX variable
 	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
 	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
-	PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
 	# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
 	# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
 	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
@@ -84,8 +83,6 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 # I'm not sure why it wouldn't work, but this is the only place (apart from
 # the "build-all-versions" entry points) where direct mention of PostgreSQL
 # versions is used.
-.PHONY: postgres-configure-v16
-postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
 .PHONY: postgres-configure-v15
 postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
 .PHONY: postgres-configure-v14
@@ -121,10 +118,6 @@ postgres-clean-%:
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean

-.PHONY: postgres-check-%
-postgres-check-%: postgres-%
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check
-
 .PHONY: neon-pg-ext-%
 neon-pg-ext-%: postgres-%
 	+@echo "Compiling neon $*"
@@ -137,11 +130,6 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
-	+@echo "Compiling neon_rmgr $*"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install
 	+@echo "Compiling neon_test_utils $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
@@ -152,13 +140,6 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
-
-# pg_embedding was temporarily released as hnsw from this repo, when we only
-# supported PostgreSQL 14 and 15
-neon-pg-ext-v14: neon-pg-ext-hnsw-v14
-neon-pg-ext-v15: neon-pg-ext-hnsw-v15
-
-neon-pg-ext-hnsw-%: postgres-headers-% postgres-%
 	+@echo "Compiling hnsw $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
@@ -186,39 +167,28 @@ neon-pg-ext-clean-%:
 .PHONY: neon-pg-ext
 neon-pg-ext: \
 	neon-pg-ext-v14 \
-	neon-pg-ext-v15 \
-	neon-pg-ext-v16
+	neon-pg-ext-v15

 .PHONY: neon-pg-ext-clean
 neon-pg-ext-clean: \
 	neon-pg-ext-clean-v14 \
-	neon-pg-ext-clean-v15 \
-	neon-pg-ext-clean-v16
+	neon-pg-ext-clean-v15

 # shorthand to build all Postgres versions
 .PHONY: postgres
 postgres: \
 	postgres-v14 \
-	postgres-v15 \
-	postgres-v16
+	postgres-v15

 .PHONY: postgres-headers
 postgres-headers: \
 	postgres-headers-v14 \
-	postgres-headers-v15 \
-	postgres-headers-v16
+	postgres-headers-v15

 .PHONY: postgres-clean
 postgres-clean: \
 	postgres-clean-v14 \
-	postgres-clean-v15 \
-	postgres-clean-v16
-
-.PHONY: postgres-check
-postgres-check: \
-	postgres-check-v14 \
-	postgres-check-v15 \
-	postgres-check-v16
+	postgres-clean-v15

 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
--- a/README.md
+++ b/README.md
@@ -29,18 +29,18 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
 libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
-libcurl4-openssl-dev openssl python-poetry lsof
+libcurl4-openssl-dev openssl python-poetry
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel libcurl-devel openssl poetry lsof
+  protobuf-devel libcurl-devel openssl poetry
 ```
 * On Arch based systems, these packages are needed:
 ```bash
 pacman -S base-devel readline zlib libseccomp openssl clang \
-postgresql-libs cmake postgresql protobuf curl lsof
+postgresql-libs cmake postgresql protobuf curl
 ```

 Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
@@ -55,7 +55,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 1. Install XCode and dependencies
 ```
 xcode-select --install
-brew install protobuf openssl flex bison icu4c pkg-config
+brew install protobuf openssl flex bison

 # add openssl to PATH, required for ed25519 keys generation in neon_local
 echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
--- a/clippy.toml
+++ b/clippy.toml
@@ -1,5 +0,0 @@
-disallowed-methods = [
-    "tokio::task::block_in_place",
-    # Allow this for now, to deny it later once we stop using Handle::block_on completely
-    # "tokio::runtime::Handle::block_on",
-]
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -19,10 +19,9 @@ Also `compute_ctl` spawns two separate service threads:
 - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
  last activity requests.

-If `AUTOSCALING` environment variable is set, `compute_ctl` will start the
-`vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
-`vm-monitor` communicates with the VM autoscaling system. It coordinates
-downscaling and requests immediate upscaling under resource pressure.
+If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
+compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
+downscaling and (eventually) will request immediate upscaling under resource pressure.

 Usage example:
 ```sh
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -20,10 +20,9 @@
 //! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
 //!   last activity requests.
 //!
-//! If `AUTOSCALING` environment variable is set, `compute_ctl` will start the
-//! `vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
-//! `vm-monitor` communicates with the VM autoscaling system. It coordinates
-//! downscaling and requests immediate upscaling under resource pressure.
+//! If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
+//! compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
+//! downscaling and (eventually) will request immediate upscaling under resource pressure.
 //!
 //! Usage example:
 //! ```sh
@@ -279,9 +278,8 @@ fn main() -> Result<()> {
            use tokio_util::sync::CancellationToken;
            use tracing::warn;
            let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
-            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
-            let cgroup = matches.get_one::<String>("cgroup");
-            let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
+            let cgroup = matches.get_one::<String>("cgroup");
+            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");

            // Only make a runtime if we need to.
            // Note: it seems like you can make a runtime in an inner scope and
@@ -314,7 +312,6 @@ fn main() -> Result<()> {
                        cgroup: cgroup.cloned(),
                        pgconnstr: file_cache_connstr.cloned(),
                        addr: vm_monitor_addr.cloned().unwrap(),
-                        file_cache_on_disk,
                    })),
                    token.clone(),
                ))
@@ -485,11 +482,6 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
-        .arg(
-            Arg::new("file-cache-on-disk")
-                .long("file-cache-on-disk")
-                .action(clap::ArgAction::SetTrue),
-        )
 }

 #[test]
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -1,39 +1,12 @@
-use anyhow::{anyhow, Ok, Result};
-use postgres::Client;
+use anyhow::{anyhow, Result};
 use tokio_postgres::NoTls;
-use tracing::{error, instrument, warn};
+use tracing::{error, instrument};

 use crate::compute::ComputeNode;

-/// Create a special service table for availability checks
-/// only if it does not exist already.
-pub fn create_availability_check_data(client: &mut Client) -> Result<()> {
-    let query = "
-        DO $$
-        BEGIN
-            IF NOT EXISTS(
-                SELECT 1
-                FROM pg_catalog.pg_tables
-                WHERE tablename = 'health_check'
-            )
-            THEN
-            CREATE TABLE health_check (
-                id serial primary key,
-                updated_at timestamptz default now()
-            );
-            INSERT INTO health_check VALUES (1, now())
-                ON CONFLICT (id) DO UPDATE
-                 SET updated_at = now();
-            END IF;
-        END
-        $$;";
-    client.execute(query, &[])?;
-
-    Ok(())
-}
-
 /// Update timestamp in a row in a special service table to check
 /// that we can actually write some data in this particular timeline.
+/// Create table if it's missing.
 #[instrument(skip_all)]
 pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
    // Connect to the database.
@@ -51,28 +24,21 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
    });

    let query = "
+    CREATE TABLE IF NOT EXISTS health_check (
+        id serial primary key,
+        updated_at timestamptz default now()
+    );
    INSERT INTO health_check VALUES (1, now())
        ON CONFLICT (id) DO UPDATE
         SET updated_at = now();";

-    match client.simple_query(query).await {
-        Result::Ok(result) => {
-            if result.len() != 1 {
-                return Err(anyhow::anyhow!(
-                    "expected 1 query results, but got {}",
-                    result.len()
-                ));
-            }
-        }
-        Err(err) => {
-            if let Some(state) = err.code() {
-                if state == &tokio_postgres::error::SqlState::DISK_FULL {
-                    warn!("Tenant disk is full");
-                    return Ok(());
-                }
-            }
-            return Err(err.into());
-        }
+    let result = client.simple_query(query).await?;
+
+    if result.len() != 2 {
+        return Err(anyhow::format_err!(
+            "expected 2 query results, but got {}",
+            result.len()
+        ));
    }

    Ok(())
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -27,7 +27,6 @@ use utils::measured_stream::MeasuredReader;

 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};

-use crate::checker::create_availability_check_data;
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -697,7 +696,6 @@ impl ComputeNode {
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
        handle_grants(spec, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
-        create_availability_check_data(&mut client)?;

        // 'Close' connection
        drop(client);
@@ -1080,8 +1078,7 @@ LIMIT 100",

        let mut download_tasks = Vec::new();
        for library in &libs_vec {
-            let (ext_name, ext_path) =
-                remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?;
+            let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
            download_tasks.push(self.download_extension(ext_name, ext_path));
        }
        let results = join_all(download_tasks).await;
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -46,6 +46,8 @@ pub fn write_postgres_conf(
        writeln!(file, "{}", conf)?;
    }

+    write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;
+
    // Add options for connecting to storage
    writeln!(file, "# Neon storage settings")?;
    if let Some(s) = &spec.pageserver_connstring {
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -74,7 +74,6 @@ More specifically, here is an example ext_index.json
 use anyhow::Context;
 use anyhow::{self, Result};
 use compute_api::spec::RemoteExtSpec;
-use regex::Regex;
 use remote_storage::*;
 use serde_json;
 use std::io::Read;
@@ -107,71 +106,16 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String {

 pub fn get_pg_version(pgbin: &str) -> String {
    // pg_config --version returns a (platform specific) human readable string
-    // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
+    // such as "PostgreSQL 15.4". We map this to the major-version tag "v14" or "v15".
    let human_version = get_pg_config("--version", pgbin);
-    return parse_pg_version(&human_version).to_string();
-}
-
-fn parse_pg_version(human_version: &str) -> &str {
-    // Normal releases have version strings like "PostgreSQL 15.4". But there
-    // are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL
-    // 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version
-    // configure option, you can tack any string to the version number,
-    // e.g. "PostgreSQL 15.4foobar".
-    match Regex::new(r"^PostgreSQL (?<major>\d+).+")
-        .unwrap()
-        .captures(human_version)
-    {
-        Some(captures) if captures.len() == 2 => match &captures["major"] {
-            "14" => return "v14",
-            "15" => return "v15",
-            "16" => return "v16",
-            _ => {}
-        },
-        _ => {}
+    if human_version.starts_with("PostgreSQL 15") {
+        return "v15".to_string();
+    } else if human_version.starts_with("PostgreSQL 14") {
+        return "v14".to_string();
    }
    panic!("Unsuported postgres version {human_version}");
 }

-#[cfg(test)]
-mod tests {
-    use super::parse_pg_version;
-
-    #[test]
-    fn test_parse_pg_version() {
-        assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
-        assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
-        assert_eq!(
-            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
-            "v15"
-        );
-
-        assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
-        assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
-        assert_eq!(
-            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
-            "v14"
-        );
-
-        assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_parse_pg_unsupported_version() {
-        parse_pg_version("PostgreSQL 13.14");
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_parse_pg_incorrect_version_format() {
-        parse_pg_version("PostgreSQL 14");
-    }
-}
-
 // download the archive for a given extension,
 // unzip it, and place files in the appropriate locations (share/lib)
 pub async fn download_extension(
@@ -236,19 +180,7 @@ pub async fn download_extension(
 // Create extension control files from spec
 pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
-    for (ext_name, ext_data) in remote_extensions.extension_data.iter() {
-        // Check if extension is present in public or custom.
-        // If not, then it is not allowed to be used by this compute.
-        if let Some(public_extensions) = &remote_extensions.public_extensions {
-            if !public_extensions.contains(ext_name) {
-                if let Some(custom_extensions) = &remote_extensions.custom_extensions {
-                    if !custom_extensions.contains(ext_name) {
-                        continue; // skip this extension, it is not allowed
-                    }
-                }
-            }
-        }
-
+    for ext_data in remote_extensions.extension_data.values() {
        for (control_name, control_content) in &ext_data.control_data {
            let control_path = local_sharedir.join(control_name);
            if !control_path.exists() {
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -1,6 +1,4 @@
 use std::convert::Infallible;
-use std::net::IpAddr;
-use std::net::Ipv6Addr;
 use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;
@@ -171,12 +169,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
                    }
                };

-                remote_extensions.get_ext(
-                    &filename,
-                    is_library,
-                    &compute.build_tag,
-                    &compute.pgversion,
-                )
+                remote_extensions.get_ext(&filename, is_library)
            };

            match ext {
@@ -300,9 +293,7 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
 async fn serve(port: u16, state: Arc<ComputeNode>) {
-    // this usually binds to both IPv4 and IPv6 on linux
-    // see e.g. https://github.com/rust-lang/rust/pull/34440
-    let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
+    let addr = SocketAddr::from(([0, 0, 0, 0], port));

    let make_service = make_service_fn(move |_conn| {
        let state = state.clone();
--- a/compute_tools/src/params.rs
+++ b/compute_tools/src/params.rs
@@ -6,4 +6,4 @@ pub const DEFAULT_LOG_LEVEL: &str = "info";
 //   https://www.postgresql.org/docs/15/auth-password.html
 //
 // So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
-pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\tall\t\tmd5";
+pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -12,8 +12,6 @@ git-version.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
-hex.workspace = true
-hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
 serde.workspace = true
@@ -22,7 +20,6 @@ serde_with.workspace = true
 tar.workspace = true
 thiserror.workspace = true
 toml.workspace = true
-tokio.workspace = true
 url.workspace = true
 # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
 # instead, so that recompile times are better.
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -1,7 +1,6 @@
 # Minimal neon environment with one safekeeper. This is equivalent to the built-in
 # defaults that you get with no --config
-[[pageservers]]
-id=1
+[pageserver]
 listen_pg_addr = '127.0.0.1:64000'
 listen_http_addr = '127.0.0.1:9898'
 pg_auth_type = 'Trust'
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -1,105 +0,0 @@
-use crate::{background_process, local_env::LocalEnv};
-use anyhow::anyhow;
-use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
-use std::{path::PathBuf, process::Child};
-use utils::id::{NodeId, TenantId};
-
-pub struct AttachmentService {
-    env: LocalEnv,
-    listen: String,
-    path: PathBuf,
-}
-
-const COMMAND: &str = "attachment_service";
-
-#[serde_as]
-#[derive(Serialize, Deserialize)]
-pub struct AttachHookRequest {
-    #[serde_as(as = "DisplayFromStr")]
-    pub tenant_id: TenantId,
-    pub pageserver_id: Option<NodeId>,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct AttachHookResponse {
-    pub gen: Option<u32>,
-}
-
-impl AttachmentService {
-    pub fn from_env(env: &LocalEnv) -> Self {
-        let path = env.base_data_dir.join("attachments.json");
-
-        // Makes no sense to construct this if pageservers aren't going to use it: assume
-        // pageservers have control plane API set
-        let listen_url = env.control_plane_api.clone().unwrap();
-
-        let listen = format!(
-            "{}:{}",
-            listen_url.host_str().unwrap(),
-            listen_url.port().unwrap()
-        );
-
-        Self {
-            env: env.clone(),
-            path,
-            listen,
-        }
-    }
-
-    fn pid_file(&self) -> PathBuf {
-        self.env.base_data_dir.join("attachment_service.pid")
-    }
-
-    pub fn start(&self) -> anyhow::Result<Child> {
-        let path_str = self.path.to_string_lossy();
-
-        background_process::start_process(
-            COMMAND,
-            &self.env.base_data_dir,
-            &self.env.attachment_service_bin(),
-            ["-l", &self.listen, "-p", &path_str],
-            [],
-            background_process::InitialPidFile::Create(&self.pid_file()),
-            // TODO: a real status check
-            || Ok(true),
-        )
-    }
-
-    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        background_process::stop_process(immediate, COMMAND, &self.pid_file())
-    }
-
-    /// Call into the attach_hook API, for use before handing out attachments to pageservers
-    pub fn attach_hook(
-        &self,
-        tenant_id: TenantId,
-        pageserver_id: NodeId,
-    ) -> anyhow::Result<Option<u32>> {
-        use hyper::StatusCode;
-
-        let url = self
-            .env
-            .control_plane_api
-            .clone()
-            .unwrap()
-            .join("attach_hook")
-            .unwrap();
-        let client = reqwest::blocking::ClientBuilder::new()
-            .build()
-            .expect("Failed to construct http client");
-
-        let request = AttachHookRequest {
-            tenant_id,
-            pageserver_id: Some(pageserver_id),
-        };
-
-        let response = client.post(url).json(&request).send()?;
-        if response.status() != StatusCode::OK {
-            return Err(anyhow!("Unexpected status {}", response.status()));
-        }
-
-        let response = response.json::<AttachHookResponse>()?;
-        Ok(response.gen)
-    }
-}
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -1,273 +0,0 @@
-/// The attachment service mimics the aspects of the control plane API
-/// that are required for a pageserver to operate.
-///
-/// This enables running & testing pageservers without a full-blown
-/// deployment of the Neon cloud platform.
-///
-use anyhow::anyhow;
-use clap::Parser;
-use hex::FromHex;
-use hyper::StatusCode;
-use hyper::{Body, Request, Response};
-use serde::{Deserialize, Serialize};
-use std::path::{Path, PathBuf};
-use std::{collections::HashMap, sync::Arc};
-use utils::logging::{self, LogFormat};
-
-use utils::{
-    http::{
-        endpoint::{self},
-        error::ApiError,
-        json::{json_request, json_response},
-        RequestExt, RouterBuilder,
-    },
-    id::{NodeId, TenantId},
-    tcp_listener,
-};
-
-use pageserver_api::control_api::{
-    ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
-    ValidateResponseTenant,
-};
-
-use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse};
-
-#[derive(Parser)]
-#[command(author, version, about, long_about = None)]
-#[command(arg_required_else_help(true))]
-struct Cli {
-    /// Host and port to listen on, like `127.0.0.1:1234`
-    #[arg(short, long)]
-    listen: std::net::SocketAddr,
-
-    /// Path to the .json file to store state (will be created if it doesn't exist)
-    #[arg(short, long)]
-    path: PathBuf,
-}
-
-// The persistent state of each Tenant
-#[derive(Serialize, Deserialize, Clone)]
-struct TenantState {
-    // Currently attached pageserver
-    pageserver: Option<NodeId>,
-
-    // Latest generation number: next time we attach, increment this
-    // and use the incremented number when attaching
-    generation: u32,
-}
-
-fn to_hex_map<S, V>(input: &HashMap<TenantId, V>, serializer: S) -> Result<S::Ok, S::Error>
-where
-    S: serde::Serializer,
-    V: Clone + Serialize,
-{
-    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
-
-    transformed
-        .collect::<HashMap<String, V>>()
-        .serialize(serializer)
-}
-
-fn from_hex_map<'de, D, V>(deserializer: D) -> Result<HashMap<TenantId, V>, D::Error>
-where
-    D: serde::de::Deserializer<'de>,
-    V: Deserialize<'de>,
-{
-    let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
-    hex_map
-        .into_iter()
-        .map(|(k, v)| {
-            TenantId::from_hex(k)
-                .map(|k| (k, v))
-                .map_err(serde::de::Error::custom)
-        })
-        .collect()
-}
-
-// Top level state available to all HTTP handlers
-#[derive(Serialize, Deserialize)]
-struct PersistentState {
-    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
-    tenants: HashMap<TenantId, TenantState>,
-
-    #[serde(skip)]
-    path: PathBuf,
-}
-
-impl PersistentState {
-    async fn save(&self) -> anyhow::Result<()> {
-        let bytes = serde_json::to_vec(self)?;
-        tokio::fs::write(&self.path, &bytes).await?;
-
-        Ok(())
-    }
-
-    async fn load(path: &Path) -> anyhow::Result<Self> {
-        let bytes = tokio::fs::read(path).await?;
-        let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
-        decoded.path = path.to_owned();
-        Ok(decoded)
-    }
-
-    async fn load_or_new(path: &Path) -> Self {
-        match Self::load(path).await {
-            Ok(s) => {
-                tracing::info!("Loaded state file at {}", path.display());
-                s
-            }
-            Err(e)
-                if e.downcast_ref::<std::io::Error>()
-                    .map(|e| e.kind() == std::io::ErrorKind::NotFound)
-                    .unwrap_or(false) =>
-            {
-                tracing::info!("Will create state file at {}", path.display());
-                Self {
-                    tenants: HashMap::new(),
-                    path: path.to_owned(),
-                }
-            }
-            Err(e) => {
-                panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display())
-            }
-        }
-    }
-}
-
-/// State available to HTTP request handlers
-#[derive(Clone)]
-struct State {
-    inner: Arc<tokio::sync::RwLock<PersistentState>>,
-}
-
-impl State {
-    fn new(persistent_state: PersistentState) -> State {
-        Self {
-            inner: Arc::new(tokio::sync::RwLock::new(persistent_state)),
-        }
-    }
-}
-
-#[inline(always)]
-fn get_state(request: &Request<Body>) -> &State {
-    request
-        .data::<Arc<State>>()
-        .expect("unknown state type")
-        .as_ref()
-}
-
-/// Pageserver calls into this on startup, to learn which tenants it should attach
-async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
-
-    let state = get_state(&req).inner.clone();
-    let mut locked = state.write().await;
-
-    let mut response = ReAttachResponse {
-        tenants: Vec::new(),
-    };
-    for (t, state) in &mut locked.tenants {
-        if state.pageserver == Some(reattach_req.node_id) {
-            state.generation += 1;
-            response.tenants.push(ReAttachResponseTenant {
-                id: *t,
-                generation: state.generation,
-            });
-        }
-    }
-
-    locked.save().await.map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, response)
-}
-
-/// Pageserver calls into this before doing deletions, to confirm that it still
-/// holds the latest generation for the tenants with deletions enqueued
-async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let validate_req = json_request::<ValidateRequest>(&mut req).await?;
-
-    let locked = get_state(&req).inner.read().await;
-
-    let mut response = ValidateResponse {
-        tenants: Vec::new(),
-    };
-
-    for req_tenant in validate_req.tenants {
-        if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
-            let valid = tenant_state.generation == req_tenant.gen;
-            response.tenants.push(ValidateResponseTenant {
-                id: req_tenant.id,
-                valid,
-            });
-        }
-    }
-
-    json_response(StatusCode::OK, response)
-}
-/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
-/// (in the real control plane this is unnecessary, because the same program is managing
-///  generation numbers and doing attachments).
-async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
-
-    let state = get_state(&req).inner.clone();
-    let mut locked = state.write().await;
-
-    let tenant_state = locked
-        .tenants
-        .entry(attach_req.tenant_id)
-        .or_insert_with(|| TenantState {
-            pageserver: attach_req.pageserver_id,
-            generation: 0,
-        });
-
-    if attach_req.pageserver_id.is_some() {
-        tenant_state.generation += 1;
-    }
-    let generation = tenant_state.generation;
-
-    locked.save().await.map_err(ApiError::InternalServerError)?;
-
-    json_response(
-        StatusCode::OK,
-        AttachHookResponse {
-            gen: attach_req.pageserver_id.map(|_| generation),
-        },
-    )
-}
-
-fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
-    endpoint::make_router()
-        .data(Arc::new(State::new(persistent_state)))
-        .post("/re-attach", handle_re_attach)
-        .post("/validate", handle_validate)
-        .post("/attach_hook", handle_attach_hook)
-}
-
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    logging::init(
-        LogFormat::Plain,
-        logging::TracingErrorLayerEnablement::Disabled,
-    )?;
-
-    let args = Cli::parse();
-    tracing::info!(
-        "Starting, state at {}, listening on {}",
-        args.path.to_string_lossy(),
-        args.listen
-    );
-
-    let persistent_state = PersistentState::load_or_new(&args.path).await;
-
-    let http_listener = tcp_listener::bind(args.listen)?;
-    let router = make_router(persistent_state)
-        .build()
-        .map_err(|err| anyhow!(err))?;
-    let service = utils::http::RouterService::new(router).unwrap();
-    let server = hyper::Server::from_tcp(http_listener)?.serve(service);
-
-    tracing::info!("Serving on {0}", args.listen);
-    server.await?;
-
-    Ok(())
-}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,7 +8,6 @@
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use compute_api::spec::ComputeMode;
-use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::LocalEnv;
 use control_plane::pageserver::PageServerNode;
@@ -44,18 +43,14 @@ project_git_version!(GIT_VERSION);

 const DEFAULT_PG_VERSION: &str = "15";

-const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
-
 fn default_conf() -> String {
    format!(
        r#"
 # Default built-in configuration, defined in main.rs
-control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
-
 [broker]
 listen_addr = '{DEFAULT_BROKER_ADDR}'

-[[pageservers]]
+[pageserver]
 id = {DEFAULT_PAGESERVER_ID}
 listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
 listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
@@ -66,7 +61,6 @@ http_auth_type = '{trust_auth}'
 id = {DEFAULT_SAFEKEEPER_ID}
 pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
 http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
-
 "#,
        trust_auth = AuthType::Trust,
    )
@@ -113,7 +107,6 @@ fn main() -> Result<()> {
            "start" => handle_start_all(sub_args, &env),
            "stop" => handle_stop_all(sub_args, &env),
            "pageserver" => handle_pageserver(sub_args, &env),
-            "attachment_service" => handle_attachment_service(sub_args, &env),
            "safekeeper" => handle_safekeeper(sub_args, &env),
            "endpoint" => handle_endpoint(sub_args, &env),
            "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
@@ -259,7 +252,7 @@ fn get_timeline_infos(
    env: &local_env::LocalEnv,
    tenant_id: &TenantId,
 ) -> Result<HashMap<TimelineId, TimelineInfo>> {
-    Ok(get_default_pageserver(env)
+    Ok(PageServerNode::from_env(env)
        .timeline_list(tenant_id)?
        .into_iter()
        .map(|timeline_info| (timeline_info.timeline_id, timeline_info))
@@ -320,30 +313,17 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
        .context("Failed to initialize neon repository")?;

    // Initialize pageserver, create initial tenant and timeline.
-    for ps_conf in &env.pageservers {
-        PageServerNode::from_env(&env, ps_conf)
-            .initialize(&pageserver_config_overrides(init_match))
-            .unwrap_or_else(|e| {
-                eprintln!("pageserver init failed: {e:?}");
-                exit(1);
-            });
-    }
+    let pageserver = PageServerNode::from_env(&env);
+    pageserver
+        .initialize(&pageserver_config_overrides(init_match))
+        .unwrap_or_else(|e| {
+            eprintln!("pageserver init failed: {e:?}");
+            exit(1);
+        });

    Ok(env)
 }

-/// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
-/// For typical interactive use, one would just run with a single pageserver.  Scenarios with
-/// tenant/timeline placement across multiple pageservers are managed by python test code rather
-/// than this CLI.
-fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
-    let ps_conf = env
-        .pageservers
-        .first()
-        .expect("Config is validated to contain at least one pageserver");
-    PageServerNode::from_env(env, ps_conf)
-}
-
 fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
    init_match
        .get_many::<String>("pageserver-config-override")
@@ -354,7 +334,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
 }

 fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
-    let pageserver = get_default_pageserver(env);
+    let pageserver = PageServerNode::from_env(env);
    match tenant_match.subcommand() {
        Some(("list", _)) => {
            for t in pageserver.tenant_list()? {
@@ -362,25 +342,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
            }
        }
        Some(("create", create_match)) => {
+            let initial_tenant_id = parse_tenant_id(create_match)?;
            let tenant_conf: HashMap<_, _> = create_match
                .get_many::<String>("config")
                .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
                .unwrap_or_default();
-
-            // If tenant ID was not specified, generate one
-            let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
-
-            let generation = if env.control_plane_api.is_some() {
-                // We must register the tenant with the attachment service, so
-                // that when the pageserver restarts, it will be re-attached.
-                let attachment_service = AttachmentService::from_env(env);
-                attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
-            } else {
-                None
-            };
-
-            pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
-            println!("tenant {tenant_id} successfully created on the pageserver");
+            let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?;
+            println!("tenant {new_tenant_id} successfully created on the pageserver");

            // Create an initial timeline for the new tenant
            let new_timeline_id = parse_timeline_id(create_match)?;
@@ -390,7 +358,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .context("Failed to parse postgres version from the argument string")?;

            let timeline_info = pageserver.timeline_create(
-                tenant_id,
+                new_tenant_id,
                new_timeline_id,
                None,
                None,
@@ -401,17 +369,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an

            env.register_branch_mapping(
                DEFAULT_BRANCH_NAME.to_string(),
-                tenant_id,
+                new_tenant_id,
                new_timeline_id,
            )?;

            println!(
-                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
+                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
            );

            if create_match.get_flag("set-default") {
-                println!("Setting tenant {tenant_id} as a default one");
-                env.default_tenant_id = Some(tenant_id);
+                println!("Setting tenant {new_tenant_id} as a default one");
+                env.default_tenant_id = Some(new_tenant_id);
            }
        }
        Some(("set-default", set_default_match)) => {
@@ -439,7 +407,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
 }

 fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
-    let pageserver = get_default_pageserver(env);
+    let pageserver = PageServerNode::from_env(env);

    match timeline_match.subcommand() {
        Some(("list", list_match)) => {
@@ -516,7 +484,6 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                None,
                pg_version,
                ComputeMode::Primary,
-                DEFAULT_PAGESERVER_ID,
            )?;
            println!("Done");
        }
@@ -570,6 +537,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
        Some(ep_subcommand_data) => ep_subcommand_data,
        None => bail!("no endpoint subcommand provided"),
    };
+
    let mut cplane = ComputeControlPlane::load(env.clone())?;

    // All subcommands take an optional --tenant-id option
@@ -666,13 +634,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .copied()
                .unwrap_or(false);

-            let pageserver_id =
-                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    NodeId(id_str.parse().context("while parsing pageserver id")?)
-                } else {
-                    DEFAULT_PAGESERVER_ID
-                };
-
            let mode = match (lsn, hot_standby) {
                (Some(lsn), false) => ComputeMode::Static(lsn),
                (None, true) => ComputeMode::Replica,
@@ -688,7 +649,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                http_port,
                pg_version,
                mode,
-                pageserver_id,
            )?;
        }
        "start" => {
@@ -698,13 +658,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

-            let pageserver_id =
-                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    NodeId(id_str.parse().context("while parsing pageserver id")?)
-                } else {
-                    DEFAULT_PAGESERVER_ID
-                };
-
            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");

            // If --safekeepers argument is given, use only the listed safekeeper nodes.
@@ -724,8 +677,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(

            let endpoint = cplane.endpoints.get(endpoint_id.as_str());

-            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
-            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
+            let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
                let claims = Claims::new(Some(tenant_id), Scope::Tenant);

                Some(env.generate_auth_token(&claims)?)
@@ -792,7 +744,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    http_port,
                    pg_version,
                    mode,
-                    pageserver_id,
                )?;
                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
@@ -817,94 +768,51 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
 }

 fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
-        let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
-            NodeId(id_str.parse().context("while parsing pageserver id")?)
-        } else {
-            DEFAULT_PAGESERVER_ID
-        };
-
-        Ok(PageServerNode::from_env(
-            env,
-            env.get_pageserver_conf(node_id)?,
-        ))
-    }
+    let pageserver = PageServerNode::from_env(env);

    match sub_match.subcommand() {
-        Some(("start", subcommand_args)) => {
-            if let Err(e) = get_pageserver(env, subcommand_args)?
-                .start(&pageserver_config_overrides(subcommand_args))
-            {
+        Some(("start", start_match)) => {
+            if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
        }

-        Some(("stop", subcommand_args)) => {
-            let immediate = subcommand_args
-                .get_one::<String>("stop-mode")
-                .map(|s| s.as_str())
-                == Some("immediate");
-
-            if let Err(e) = get_pageserver(env, subcommand_args)?.stop(immediate) {
-                eprintln!("pageserver stop failed: {}", e);
-                exit(1);
-            }
-        }
-
-        Some(("restart", subcommand_args)) => {
-            let pageserver = get_pageserver(env, subcommand_args)?;
-            //TODO what shutdown strategy should we use here?
-            if let Err(e) = pageserver.stop(false) {
-                eprintln!("pageserver stop failed: {}", e);
-                exit(1);
-            }
-
-            if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
-                eprintln!("pageserver start failed: {e}");
-                exit(1);
-            }
-        }
-
-        Some(("status", subcommand_args)) => {
-            match get_pageserver(env, subcommand_args)?.check_status() {
-                Ok(_) => println!("Page server is up and running"),
-                Err(err) => {
-                    eprintln!("Page server is not available: {}", err);
-                    exit(1);
-                }
-            }
-        }
-
-        Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
-        None => bail!("no pageserver subcommand provided"),
-    }
-    Ok(())
-}
-
-fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    let svc = AttachmentService::from_env(env);
-    match sub_match.subcommand() {
-        Some(("start", _start_match)) => {
-            if let Err(e) = svc.start() {
-                eprintln!("start failed: {e}");
-                exit(1);
-            }
-        }
-
        Some(("stop", stop_match)) => {
            let immediate = stop_match
                .get_one::<String>("stop-mode")
                .map(|s| s.as_str())
                == Some("immediate");

-            if let Err(e) = svc.stop(immediate) {
-                eprintln!("stop failed: {}", e);
+            if let Err(e) = pageserver.stop(immediate) {
+                eprintln!("pageserver stop failed: {}", e);
                exit(1);
            }
        }
-        Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name),
-        None => bail!("no attachment_service subcommand provided"),
+
+        Some(("restart", restart_match)) => {
+            //TODO what shutdown strategy should we use here?
+            if let Err(e) = pageserver.stop(false) {
+                eprintln!("pageserver stop failed: {}", e);
+                exit(1);
+            }
+
+            if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) {
+                eprintln!("pageserver start failed: {e}");
+                exit(1);
+            }
+        }
+
+        Some(("status", _)) => match PageServerNode::from_env(env).check_status() {
+            Ok(_) => println!("Page server is up and running"),
+            Err(err) => {
+                eprintln!("Page server is not available: {}", err);
+                exit(1);
+            }
+        },
+
+        Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
+        None => bail!("no pageserver subcommand provided"),
    }
    Ok(())
 }
@@ -989,23 +897,11 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow

    broker::start_broker_process(env)?;

-    // Only start the attachment service if the pageserver is configured to need it
-    if env.control_plane_api.is_some() {
-        let attachment_service = AttachmentService::from_env(env);
-        if let Err(e) = attachment_service.start() {
-            eprintln!("attachment_service start failed: {:#}", e);
-            try_stop_all(env, true);
-            exit(1);
-        }
-    }
-
-    for ps_conf in &env.pageservers {
-        let pageserver = PageServerNode::from_env(env, ps_conf);
-        if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
-            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
-            try_stop_all(env, true);
-            exit(1);
-        }
+    let pageserver = PageServerNode::from_env(env);
+    if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
+        eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e);
+        try_stop_all(env, true);
+        exit(1);
    }

    for node in env.safekeepers.iter() {
@@ -1029,6 +925,8 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
 }

 fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
+    let pageserver = PageServerNode::from_env(env);
+
    // Stop all endpoints
    match ComputeControlPlane::load(env.clone()) {
        Ok(cplane) => {
@@ -1043,11 +941,8 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
        }
    }

-    for ps_conf in &env.pageservers {
-        let pageserver = PageServerNode::from_env(env, ps_conf);
-        if let Err(e) = pageserver.stop(immediate) {
-            eprintln!("pageserver {} stop failed: {:#}", ps_conf.id, e);
-        }
+    if let Err(e) = pageserver.stop(immediate) {
+        eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e);
    }

    for node in env.safekeepers.iter() {
@@ -1060,13 +955,6 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
    if let Err(e) = broker::stop_broker_process(env) {
        eprintln!("neon broker stop failed: {e:#}");
    }
-
-    if env.control_plane_api.is_some() {
-        let attachment_service = AttachmentService::from_env(env);
-        if let Err(e) = attachment_service.stop(immediate) {
-            eprintln!("attachment service stop failed: {e:#}");
-        }
-    }
 }

 fn cli() -> Command {
@@ -1081,16 +969,6 @@ fn cli() -> Command {

    let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);

-    // --id, when using a pageserver command
-    let pageserver_id_arg = Arg::new("pageserver-id")
-        .long("id")
-        .help("pageserver id")
-        .required(false);
-    // --pageserver-id when using a non-pageserver command
-    let endpoint_pageserver_id_arg = Arg::new("endpoint-pageserver-id")
-        .long("pageserver-id")
-        .required(false);
-
    let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
        .short('e')
        .long("safekeeper-extra-opt")
@@ -1255,24 +1133,10 @@ fn cli() -> Command {
                .arg_required_else_help(true)
                .about("Manage pageserver")
                .subcommand(Command::new("status"))
-                .arg(pageserver_id_arg.clone())
-                .subcommand(Command::new("start").about("Start local pageserver")
-                .arg(pageserver_id_arg.clone())
-                .arg(pageserver_config_args.clone()))
-                .subcommand(Command::new("stop").about("Stop local pageserver")
-                .arg(pageserver_id_arg.clone())
-                            .arg(stop_mode_arg.clone()))
-                .subcommand(Command::new("restart").about("Restart local pageserver")
-                .arg(pageserver_id_arg.clone())
-                .arg(pageserver_config_args.clone()))
-        )
-        .subcommand(
-            Command::new("attachment_service")
-                .arg_required_else_help(true)
-                .about("Manage attachment_service")
                .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
                .subcommand(Command::new("stop").about("Stop local pageserver")
                            .arg(stop_mode_arg.clone()))
+                .subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
        )
        .subcommand(
            Command::new("safekeeper")
@@ -1308,7 +1172,6 @@ fn cli() -> Command {
                    .arg(lsn_arg.clone())
                    .arg(pg_port_arg.clone())
                    .arg(http_port_arg.clone())
-                    .arg(endpoint_pageserver_id_arg.clone())
                    .arg(
                        Arg::new("config-only")
                            .help("Don't do basebackup, create endpoint directory with only config files")
@@ -1326,7 +1189,6 @@ fn cli() -> Command {
                    .arg(lsn_arg)
                    .arg(pg_port_arg)
                    .arg(http_port_arg)
-                    .arg(endpoint_pageserver_id_arg.clone())
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -70,7 +70,6 @@ pub struct EndpointConf {
    http_port: u16,
    pg_version: u32,
    skip_pg_catalog_updates: bool,
-    pageserver_id: NodeId,
 }

 //
@@ -83,16 +82,19 @@ pub struct ComputeControlPlane {
    pub endpoints: BTreeMap<String, Arc<Endpoint>>,

    env: LocalEnv,
+    pageserver: Arc<PageServerNode>,
 }

 impl ComputeControlPlane {
    // Load current endpoints from the endpoints/ subdirectories
    pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
+        let pageserver = Arc::new(PageServerNode::from_env(&env));
+
        let mut endpoints = BTreeMap::default();
        for endpoint_dir in std::fs::read_dir(env.endpoints_path())
            .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
        {
-            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?;
+            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
            endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
        }

@@ -100,6 +102,7 @@ impl ComputeControlPlane {
            base_port: 55431,
            endpoints,
            env,
+            pageserver,
        })
    }

@@ -122,29 +125,20 @@ impl ComputeControlPlane {
        http_port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
-        pageserver_id: NodeId,
    ) -> Result<Arc<Endpoint>> {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
-        let pageserver =
-            PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
            env: self.env.clone(),
-            pageserver,
+            pageserver: Arc::clone(&self.pageserver),
            timeline_id,
            mode,
            tenant_id,
            pg_version,
-            // We don't setup roles and databases in the spec locally, so we don't need to
-            // do catalog updates. Catalog updates also include check availability
-            // data creation. Yet, we have tests that check that size and db dump
-            // before and after start are the same. So, skip catalog updates,
-            // with this we basically test a case of waking up an idle compute, where
-            // we also skip catalog updates in the cloud.
-            skip_pg_catalog_updates: true,
+            skip_pg_catalog_updates: false,
        });

        ep.create_endpoint_dir()?;
@@ -158,8 +152,7 @@ impl ComputeControlPlane {
                http_port,
                pg_port,
                pg_version,
-                skip_pg_catalog_updates: true,
-                pageserver_id,
+                skip_pg_catalog_updates: false,
            })?,
        )?;
        std::fs::write(
@@ -194,14 +187,18 @@ pub struct Endpoint {
    // These are not part of the endpoint as such, but the environment
    // the endpoint runs in.
    pub env: LocalEnv,
-    pageserver: PageServerNode,
+    pageserver: Arc<PageServerNode>,

    // Optimizations
    skip_pg_catalog_updates: bool,
 }

 impl Endpoint {
-    fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
+    fn from_dir_entry(
+        entry: std::fs::DirEntry,
+        env: &LocalEnv,
+        pageserver: &Arc<PageServerNode>,
+    ) -> Result<Endpoint> {
        if !entry.file_type()?.is_dir() {
            anyhow::bail!(
                "Endpoint::from_dir_entry failed: '{}' is not a directory",
@@ -217,15 +214,12 @@ impl Endpoint {
        let conf: EndpointConf =
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

-        let pageserver =
-            PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);
-
        Ok(Endpoint {
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
            endpoint_id,
            env: env.clone(),
-            pageserver,
+            pageserver: Arc::clone(pageserver),
            timeline_id: conf.timeline_id,
            mode: conf.mode,
            tenant_id: conf.tenant_id,
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -7,7 +7,6 @@
 // local installations.
 //

-pub mod attachment_service;
 mod background_process;
 pub mod broker;
 pub mod endpoint;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -68,17 +68,11 @@ pub struct LocalEnv {

    pub broker: NeonBroker,

-    /// This Vec must always contain at least one pageserver
-    pub pageservers: Vec<PageServerConf>,
+    pub pageserver: PageServerConf,

    #[serde(default)]
    pub safekeepers: Vec<SafekeeperConf>,

-    // Control plane location: if None, we will not run attachment_service.  If set, this will
-    // be propagated into each pageserver's configuration.
-    #[serde(default)]
-    pub control_plane_api: Option<Url>,
-
    /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
    #[serde(default)]
    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
@@ -182,28 +176,32 @@ impl LocalEnv {
    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

-        #[allow(clippy::manual_range_patterns)]
        match pg_version {
-            14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
+            14 => Ok(path.join(format!("v{pg_version}"))),
+            15 => Ok(path.join(format!("v{pg_version}"))),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }

    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
+        match pg_version {
+            14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
+            15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
+            _ => bail!("Unsupported postgres version: {}", pg_version),
+        }
    }
    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
+        match pg_version {
+            14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
+            15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
+            _ => bail!("Unsupported postgres version: {}", pg_version),
+        }
    }

    pub fn pageserver_bin(&self) -> PathBuf {
        self.neon_distrib_dir.join("pageserver")
    }

-    pub fn attachment_service_bin(&self) -> PathBuf {
-        self.neon_distrib_dir.join("attachment_service")
-    }
-
    pub fn safekeeper_bin(&self) -> PathBuf {
        self.neon_distrib_dir.join("safekeeper")
    }
@@ -216,23 +214,15 @@ impl LocalEnv {
        self.base_data_dir.join("endpoints")
    }

-    pub fn pageserver_data_dir(&self, pageserver_id: NodeId) -> PathBuf {
-        self.base_data_dir
-            .join(format!("pageserver_{pageserver_id}"))
+    // TODO: move pageserver files into ./pageserver
+    pub fn pageserver_data_dir(&self) -> PathBuf {
+        self.base_data_dir.clone()
    }

    pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf {
        self.base_data_dir.join("safekeepers").join(data_dir_name)
    }

-    pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
-        if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
-            Ok(conf)
-        } else {
-            bail!("could not find pageserver {id}")
-        }
-    }
-
    pub fn register_branch_mapping(
        &mut self,
        branch_name: String,
@@ -309,10 +299,6 @@ impl LocalEnv {
            env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
        }

-        if env.pageservers.is_empty() {
-            anyhow::bail!("Configuration must contain at least one pageserver");
-        }
-
        env.base_data_dir = base_path();

        Ok(env)
@@ -345,7 +331,7 @@ impl LocalEnv {
        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
        // to .neon/config. TODO: We lose any formatting and comments along the way, which is
        // a bit sad.
-        let mut conf_content = r#"# This file describes a local deployment of the page server
+        let mut conf_content = r#"# This file describes a local deployment of the page server
 # and safekeeeper node. It is read by the 'neon_local' command-line
 # utility.
 "#
@@ -475,9 +461,9 @@ impl LocalEnv {
    }

    fn auth_keys_needed(&self) -> bool {
-        self.pageservers.iter().any(|ps| {
-            ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
-        }) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
+        self.pageserver.pg_auth_type == AuthType::NeonJWT
+            || self.pageserver.http_auth_type == AuthType::NeonJWT
+            || self.safekeepers.iter().any(|sk| sk.auth_enabled)
    }
 }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -27,7 +27,6 @@ use utils::{
    lsn::Lsn,
 };

-use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

 #[derive(Error, Debug)]
@@ -77,40 +76,43 @@ impl ResponseErrorMessageExt for Response {
 #[derive(Debug)]
 pub struct PageServerNode {
    pub pg_connection_config: PgConnectionConfig,
-    pub conf: PageServerConf,
    pub env: LocalEnv,
    pub http_client: Client,
    pub http_base_url: String,
 }

 impl PageServerNode {
-    pub fn from_env(env: &LocalEnv, conf: &PageServerConf) -> PageServerNode {
-        let (host, port) =
-            parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
+    pub fn from_env(env: &LocalEnv) -> PageServerNode {
+        let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr)
+            .expect("Unable to parse listen_pg_addr");
        let port = port.unwrap_or(5432);
        Self {
            pg_connection_config: PgConnectionConfig::new_host_port(host, port),
-            conf: conf.clone(),
            env: env.clone(),
            http_client: Client::new(),
-            http_base_url: format!("http://{}/v1", conf.listen_http_addr),
+            http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr),
        }
    }

    // pageserver conf overrides defined by neon_local configuration.
    fn neon_local_overrides(&self) -> Vec<String> {
-        let id = format!("id={}", self.conf.id);
+        let id = format!("id={}", self.env.pageserver.id);
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
        let pg_distrib_dir_param = format!(
            "pg_distrib_dir='{}'",
            self.env.pg_distrib_dir_raw().display()
        );

-        let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
-        let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
+        let http_auth_type_param =
+            format!("http_auth_type='{}'", self.env.pageserver.http_auth_type);
+        let listen_http_addr_param = format!(
+            "listen_http_addr='{}'",
+            self.env.pageserver.listen_http_addr
+        );

-        let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
-        let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
+        let pg_auth_type_param = format!("pg_auth_type='{}'", self.env.pageserver.pg_auth_type);
+        let listen_pg_addr_param =
+            format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);

        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

@@ -124,18 +126,10 @@ impl PageServerNode {
            broker_endpoint_param,
        ];

-        if let Some(control_plane_api) = &self.env.control_plane_api {
-            overrides.push(format!(
-                "control_plane_api='{}'",
-                control_plane_api.as_str()
-            ));
-        }
-
-        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
+        if self.env.pageserver.http_auth_type != AuthType::Trust
+            || self.env.pageserver.pg_auth_type != AuthType::Trust
        {
-            // Keys are generated in the toplevel repo dir, pageservers' workdirs
-            // are one level below that, so refer to keys with ../
-            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
+            overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned());
        }
        overrides
    }
@@ -143,12 +137,16 @@ impl PageServerNode {
    /// Initializes a pageserver node by creating its config with the overrides provided.
    pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        // First, run `pageserver --init` and wait for it to write a config into FS and exit.
-        self.pageserver_init(config_overrides)
-            .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id,))
+        self.pageserver_init(config_overrides).with_context(|| {
+            format!(
+                "Failed to run init for pageserver node {}",
+                self.env.pageserver.id,
+            )
+        })
    }

    pub fn repo_path(&self) -> PathBuf {
-        self.env.pageserver_data_dir(self.conf.id)
+        self.env.pageserver_data_dir()
    }

    /// The pid file is created by the pageserver process, with its pid stored inside.
@@ -164,7 +162,7 @@ impl PageServerNode {

    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        let datadir = self.repo_path();
-        let node_id = self.conf.id;
+        let node_id = self.env.pageserver.id;
        println!(
            "Initializing pageserver node {} at '{}' in {:?}",
            node_id,
@@ -173,10 +171,6 @@ impl PageServerNode {
        );
        io::stdout().flush()?;

-        if !datadir.exists() {
-            std::fs::create_dir(&datadir)?;
-        }
-
        let datadir_path_str = datadir.to_str().with_context(|| {
            format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
        })?;
@@ -207,7 +201,7 @@ impl PageServerNode {
        let datadir = self.repo_path();
        print!(
            "Starting pageserver node {} at '{}' in {:?}",
-            self.conf.id,
+            self.env.pageserver.id,
            self.pg_connection_config.raw_address(),
            datadir
        );
@@ -216,7 +210,7 @@ impl PageServerNode {
        let datadir_path_str = datadir.to_str().with_context(|| {
            format!(
                "Cannot start pageserver node {} in path that has no string representation: {:?}",
-                self.conf.id, datadir,
+                self.env.pageserver.id, datadir,
            )
        })?;
        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
@@ -260,7 +254,7 @@ impl PageServerNode {
        // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
        // needs a token, and how to generate that token, seems independent to whether
        // the pageserver requires a token in incoming requests.
-        Ok(if self.conf.http_auth_type != AuthType::Trust {
+        Ok(if self.env.pageserver.http_auth_type != AuthType::Trust {
            // Generate a token to connect from the pageserver to a safekeeper
            let token = self
                .env
@@ -285,7 +279,7 @@ impl PageServerNode {

    pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
        let mut config = self.pg_connection_config.clone();
-        if self.conf.pg_auth_type == AuthType::NeonJWT {
+        if self.env.pageserver.pg_auth_type == AuthType::NeonJWT {
            let token = self
                .env
                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
@@ -296,7 +290,7 @@ impl PageServerNode {

    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
        let mut builder = self.http_client.request(method, url);
-        if self.conf.http_auth_type == AuthType::NeonJWT {
+        if self.env.pageserver.http_auth_type == AuthType::NeonJWT {
            let token = self
                .env
                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
@@ -322,8 +316,7 @@ impl PageServerNode {

    pub fn tenant_create(
        &self,
-        new_tenant_id: TenantId,
-        generation: Option<u32>,
+        new_tenant_id: Option<TenantId>,
        settings: HashMap<&str, &str>,
    ) -> anyhow::Result<TenantId> {
        let mut settings = settings.clone();
@@ -389,9 +382,11 @@ impl PageServerNode {
                .context("Failed to parse 'gc_feedback' as bool")?,
        };

+        // If tenant ID was not specified, generate one
+        let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate());
+
        let request = models::TenantCreateRequest {
            new_tenant_id,
-            generation,
            config,
        };
        if !settings.is_empty() {
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -30,7 +30,7 @@ cleanup() {
 echo "clean up containers if exists"
 cleanup

-for pg_version in 14 15 16; do
+for pg_version in 14 15; do
    echo "start containers (pg_version=$pg_version)."
    PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d

--- a/docs/rfcs/025-generation-numbers.md
+++ b/docs/rfcs/025-generation-numbers.md
@@ -1,957 +0,0 @@
-# Pageserver: split-brain safety for remote storage through generation numbers
-
-## Summary
-
-A scheme of logical "generation numbers" for tenant attachment to pageservers is proposed, along with
-changes to the remote storage format to include these generation numbers in S3 keys.
-
-Using the control plane as the issuer of these generation numbers enables strong anti-split-brain
-properties in the pageserver cluster without implementing a consensus mechanism directly
-in the pageservers.
-
-## Motivation
-
-Currently, the pageserver's remote storage format does not provide a mechanism for addressing
-split brain conditions that may happen when replacing a node or when migrating
-a tenant from one pageserver to another.
-
-From a remote storage perspective, a split brain condition occurs whenever two nodes both think
-they have the same tenant attached, and both can write to S3. This can happen in the case of a
-network partition, pathologically long delays (e.g. suspended VM), or software bugs.
-
-In the current deployment model, control plane guarantees that a tenant is attached to one
-pageserver at a time, thereby ruling out split-brain conditions resulting from dual
-attachment (however, there is always the risk of a control plane bug). This control
-plane guarantee prevents robust response to failures, as if a pageserver is unresponsive
-we may not detach from it. The mechanism in this RFC fixes this, by making it safe to
-attach to a new, different pageserver even if an unresponsive pageserver may be running.
-
-Further, lack of safety during split-brain conditions blocks two important features where occasional
-split-brain conditions are part of the design assumptions:
-
- seamless tenant migration ([RFC PR](https://github.com/neondatabase/neon/pull/5029))
- automatic pageserver instance failure handling (aka "failover") (RFC TBD)
-
-### Prior art
-
- 020-pageserver-s3-coordination.md
- 023-the-state-of-pageserver-tenant-relocation.md
- 026-pageserver-s3-mvcc.md
-
-This RFC has broad similarities to the proposal to implement a MVCC scheme in
-S3 object names, but this RFC avoids a general purpose transaction scheme in
-favour of more specialized "generations" that work like a transaction ID that
-always has the same lifetime as a pageserver process or tenant attachment, whichever
-is shorter.
-
-## Requirements
-
- Accommodate storage backends with no atomic or fencing capability (i.e. work within
-  S3's limitation that there are no atomics and clients can't be fenced)
- Don't depend on any STONITH or node fencing in the compute layer (i.e. we will not
-  assume that we can reliably kill an EC2 instance and have it die)
- Scoped per-tenant, not per-pageserver; for _seamless tenant migration_, we need
-  per-tenant granularity, and for _failover_, we likely want to spread the workload
-  of the failed pageserver instance to a number of peers, rather than monolithically
-  moving the entire workload to another machine.
-  We do not rule out the latter case, but should not constrain ourselves to it.
-
-## Design Tenets
-
-These are not requirements, but are ideas that guide the following design:
-
- Avoid implementing another consensus system: we already have a strongly consistent
-  database in the control plane that can do atomic operations where needed, and we also
-  have a Paxos implementation in the safekeeper.
- Avoid locking in to specific models of how failover will work (e.g. do not assume that
-  all the tenants on a pageserver will fail over as a unit).
- Be strictly correct when it comes to data integrity. Occasional failures of availability
-  are tolerable, occasional data loss is not.
-
-## Non Goals
-
-The changes in this RFC intentionally isolate the design decision of how to define
-logical generation numbers and object storage format in a way that is somewhat flexible with
-respect to how actual orchestration of failover works.
-
-This RFC intentionally does not cover:
-
- Failure detection
- Orchestration of failover
- Standby modes to keep data ready for fast migration
- Intentional multi-writer operation on tenants (multi-writer scenarios are assumed to be transient split-brain situations).
- Sharding.
-
-The interaction between this RFC and those features is discussed in [Appendix B](#appendix-b-interoperability-with-other-features)
-
-## Impacted Components
-
-pageserver, control plane, safekeeper (optional)
-
-## Implementation Part 1: Correctness
-
-### Summary
-
- A per-tenant **generation number** is introduced to uniquely identify tenant attachments to pageserver processes.
-
-  - This generation number increments each time the control plane modifies a tenant (`Project`)'s assigned pageserver, or when the assigned pageserver restarts.
-  - the control plane is the authority for generation numbers: only it may
-    increment a generation number.
-
- **Object keys are suffixed** with the generation number
- **Safety for multiply-attached tenants** is provided by the
-  generation number in the object key: the competing pageservers will not
-  try to write to the same keys.
- **Safety in split brain for multiple nodes running with
-  the same node ID** is provided by the pageserver calling out to the control plane
-  on startup, to re-attach and thereby increment the generations of any attached tenants
- **Safety for deletions** is achieved by deferring the DELETE from S3 to a point in time where the deleting node has validated with control plane that no attachment with a higher generation has a reference to the to-be-DELETEd key.
- **The control plane is used to issue generation numbers** to avoid the need for
-  a built-in consensus system in the pageserver, although this could in principle
-  be changed without changing the storage format.
-
-### Generation numbers
-
-A generation number is associated with each tenant in the control plane,
-and each time the attachment status of the tenant changes, this is incremented.
-Changes in attachment status include:
-
- Attaching the tenant to a different pageserver
- A pageserver restarting, and "re-attaching" its tenants on startup
-
-These increments of attachment generation provide invariants we need to avoid
-split-brain issues in storage:
-
- If two pageservers have the same tenant attached, the attachments are guaranteed to have different generation numbers, because the generation would increment
-  while attaching the second one.
- If there are multiple pageservers running with the same node ID, all the attachments on all pageservers are guaranteed to have different generation numbers, because the generation would increment
-  when the second node started and re-attached its tenants.
-
-As long as the infrastructure does not transparently replace an underlying
-physical machine, we are totally safe. See the later [unsafe case](#unsafe-case-on-badly-behaved-infrastructure) section for details.
-
-### Object Key Changes
-
-#### Generation suffix
-
-All object keys (layer objects and index objects) will contain the attachment
-generation as a [suffix](#why-a-generation-suffix-rather-than-prefix).
-This suffix is the primary mechanism for protecting against split-brain situations, and
-enabling safe multi-attachment of tenants:
-
- Two pageservers running with the same node ID (e.g. after a failure, where there is
-  some rogue pageserver still running) will not try to write to the same objects, because at startup they will have re-attached tenants and thereby incremented
-  generation numbers.
- Multiple attachments (to different pageservers) of the same tenant will not try to write to the same objects, as each attachment would have a distinct generation.
-
-The generation is appended in hex format (8 byte string representing
-u32), to all our existing key names. A u32's range limit would permit
-27 restarts _per second_ over a 5 year system lifetime: orders of magnitude more than
-is realistic.
-
-The exact meaning of the generation suffix can evolve over time if necessary, for
-example if we chose to implement a failover mechanism internally to the pageservers
-rather than going via the control plane. The storage format just sees it as a number,
-with the only semantic property being that the highest numbered index is the latest.
-
-#### Index changes
-
-Since object keys now include a generation suffix, the index of these keys must also be updated. IndexPart currently stores keys and LSNs sufficient to reconstruct key names: this would be extended to store the generation as well.
-
-This will increase the size of the file, but only modestly: layers are already encoded as
-their string-ized form, so the overhead is about 10 bytes per layer. This will be less if/when
-the index storage format is migrated to a binary format from JSON.
-
-#### Visibility
-
-_This section doesn't describe code changes, but extends on the consequences of the
-object key changes given above_
-
-##### Visibility of objects to pageservers
-
-Pageservers can of course list objects in S3 at any time, but in practice their
-visible set is based on the contents of their LayerMap, which is initialized
-from the `index_part.json.???` that they load.
-
-Starting with the `index_part` from the most recent previous generation
-(see [loading index_part](#finding-the-remote-indices-for-timelines)), a pageserver
-initially has visibility of all the objects that were referenced in the loaded index.
-These objects are guaranteed to remain visible until the current generation is
-superseded, via pageservers in older generations avoiding deletions (see [deletion](#deletion)).
-
-The "most recent previous generation" is _not_ necessarily the most recent
-in terms of walltime, it is the one that is readable at the time a new generation
-starts. Consider the following sequence of a tenant being re-attached to different
-pageserver nodes:
-
- Create + attach on PS1 in generation 1
- PS1 Do some work, write out index_part.json-0001
- Attach to PS2 in generation 2
- Read index_part.json-0001
- PS2 starts doing some work...
- Attach to PS3 in generation 3
- Read index_part.json-0001
- **...PS2 finishes its work: now it writes index_part.json-0002**
- PS3 writes out index_part.json-0003
-
-In the above sequence, the ancestry of indices is:
-
-```
-0001 -> 0002
-     |
-     -> 0003
-```
-
-This is not an issue for safety: if the 0002 references some object that is
-not in 0001, then 0003 simply does not see it, and will re-do whatever
-work was required (e.g. ingesting WAL or doing compaction). Objects referenced
-by only the 0002 index will never be read by future attachment generations, and
-will eventually be cleaned up by a scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)).
-
-##### Visibility of LSNs to clients
-
-Because index_part.json is now written with a generation suffix, which data
-is visible depends on which generation the reader is operating in:
-
- If one was passively reading from S3 from outside of a pageserver, the
-  visibility of data would depend on which index_part.json-<generation> file
-  one had chosen to read from.
- If two pageservers have the same tenant attached, they may have different
-  data visible as they're independently replaying the WAL, and maintaining
-  independent LayerMaps that are written to independent index_part.json files.
-  Data does not have to be remotely committed to be visible.
- For a pageserver writing with a stale generation, historic LSNs
-  remain readable until another pageserver (with a higher generation suffix)
-  decides to execute GC deletions. At this point, we may think of the stale
-  attachment's generation as having logically ended: during its existence
-  the generation had a consistent view of the world.
- For a newly attached pageserver, its highest visible LSN may appear to
-  go backwards with respect to an earlier attachment, if that earlier
-  attachment had not uploaded all data to S3 before the new attachment.
-
-### Deletion
-
-#### Generation number validation
-
-While writes are de-conflicted by writers always using their own generation number in the key,
-deletions are slightly more challenging: if a pageserver A is isolated, and the true active node is
-pageserver B, then it is dangerous for A to do any object deletions, even of objects that it wrote
-itself, because pageserver B's metadata might reference those objects.
-
-We solve this by inserting a "generation validation" step between the write of a remote index
-that un-links a particular object from the index, and the actual deletion of the object, such
-that deletions strictly obey the following ordering:
-
-1. Write out index_part.json: this guarantees that any subsequent reader of the metadata will
-   not try and read the object we unlinked.
-2. Call out to control plane to validate that the generation which we use for our attachment is still the latest.
-3. If step 2 passes, it is safe to delete the object. Why? The check-in with control plane
-   together with our visibility rules guarantees that any later generation
-   will use either the exact `index_part.json` that we uploaded in step 1, or a successor
-   of it; not an earlier one. In both cases, the `index_part.json` doesn't reference the
-   key we are deleting anymore, so, the key is invisible to any later attachment generation.
-   Hence it's safe to delete it.
-
-Note that at step 2 we are only confirming that deletions of objects _no longer referenced
-by the specific `index_part.json` written in step 1_ are safe. If we were attempting other deletions concurrently,
-these would need their own generation validation step.
-
-If step 2 fails, we may leak the object. This is safe, but has a cost: see [scrubbing](#cleaning-up-orphan-objects-scrubbing). We may avoid this entirely outside of node
-failures, if we do proper flushing of deletions on clean shutdown and clean migration.
-
-To avoid doing a huge number of control plane requests to perform generation validation,
-validation of many tenants will be done in a single request, and deletions will be queued up
-prior to validation: see [Persistent deletion queue](#persistent-deletion-queue) for more.
-
-#### `remote_consistent_lsn` updates
-
-Remote objects are not the only kind of deletion the pageserver does: it also indirectly deletes
-WAL data, by feeding back remote_consistent_lsn to safekeepers, as a signal to the safekeepers that
-they may drop data below this LSN.
-
-For the same reasons that deletion of objects must be guarded by an attachment generation number
-validation step, updates to `remote_consistent_lsn` are subject to the same rules, using
-an ordering as follows:
-
-1. upload the index_part that covers data up to LSN `L0` to S3
-2. Call out to control plane to validate that the generation which we use for our attachment is still the latest.
-3. advance the `remote_consistent_lsn` that we advertise to the safekeepers to `L0`
-
-If step 2 fails, then the `remote_consistent_lsn` advertised
-to safekeepers will not advance again until a pageserver
-with the latest generation is ready to do so.
-
-**Note:** at step 3 we are not advertising the _latest_ remote_consistent_lsn, we are
-advertising the value in the index_part that we uploaded in step 1. This provides
-a strong ordering guarantee.
-
-Internally to the pageserver, each timeline will have two remote_consistent_lsn values: the one that
-reflects its latest write to remote storage, and the one that reflects the most
-recent validation of generation number. It is only the latter value that may
-be advertised to the outside world (i.e. to the safekeeper).
-
-The control plane remains unaware of `remote_consistent_lsn`: it only has to validate
-the freshness of generation numbers, thereby granting the pageserver permission to
-share the information with the safekeeper.
-
-For convenience, in subsequent sections and RFCs we will use "deletion" to mean both deletion
-of objects in S3, and updates to the `remote_consistent_lsn`, as updates to the remote consistent
-LSN are de-facto deletions done via the safekeeper, and both kinds of deletion are subject to
-the same generation validation requirement.
-
-### Pageserver attach/startup changes
-
-#### Attachment
-
-Calls to `/v1/tenant/{tenant_id}/attach` are augmented with an additional
-`generation` field in the body.
-
-The pageserver does not persist this: a generation is only good for the lifetime
-of a process.
-
-#### Finding the remote indices for timelines
-
-Because index files are now suffixed with generation numbers, the pageserver
-cannot always GET the remote index in one request, because it can't always
-know a-priori what the latest remote index is.
-
-Typically, the most recent generation to write an index would be our own
-generation minus 1. However, this might not be the case: the previous
-node might have started and acquired a generation number, and then crashed
-before writing out a remote index.
-
-In the general case and as a fallback, the pageserver may list all the `index_part.json`
-files for a timeline, sort them by generation, and pick the highest that is `<=`
-its current generation for this attachment. The tenant should never load an index
-with an attachment generation _newer_ than its own.
-These two rules combined ensure that objects written by later generations are never visible to earlier generations.
-
-Note that if a given attachment picks an index part from an earlier generation (say n-2), but crashes & restarts before it writes its own generation's index part, next time it tries to pick an index part there may be an index part from generation n-1.
-It would pick the n-1 index part in that case, because it's sorted higher than the previous one from generation n-2.
-So, the above rules do not guarantee determinism in selecting the index part.
-Tenants are allowed to be attached with stale attachment generations during a multiply-attached
-phase in a migration, and in this instance if the old location's pageserver restarts,
-it should not try and load the newer generation's index.
-
-To summarize, on starting a timeline, the pageserver will:
-
-1. Issue a GET for index_part.json-<my generation - 1>
-2. If 1 failed, issue a ListObjectsv2 request for index_part.json\* and
-   pick the newest.
-
-One could optimize this further by using the control plane to record specifically
-which generation most recently wrote an index_part.json, if necessary, to increase
-the probability of finding the index_part.json in one GET. One could also improve
-the chances by having pageservers proactively write out index_part.json after they
-get a new generation ID.
-
-#### Re-attachment on startup
-
-On startup, the pageserver will call out to a new control plane `/re-attach`
-API (see [Generation API](#generation-api)). This returns a list of
-tenants that should be attached to the pageserver, and their generation numbers, which
-the control plane will increment before returning.
-
-The pageserver should still scan its local disk on startup, but should _delete_
-any local content for tenants not indicated in the `/re-attach` response: their
-absence is an implicit detach operation.
-
-**Note** if a tenant is omitted from the re-attach response, its local disk content
-will be deleted. This will change in subsequent work, when the control plane gains
-the concept of a secondary/standby location: a node with local content may revert
-to this status and retain some local content.
-
-#### Cleaning up previous generations' remote indices
-
-Deletion of old indices is not necessary for correctness, although it is necessary
-to avoid the ListObjects fallback in the previous section becoming ever more expensive.
-
-Once the new attachment has written out its index_part.json, it may asynchronously clean up historic index_part.json
-objects that were found.
-
-We may choose to implement this deletion either as an explicit step after we
-write out index_part for the first time in a pageserver's lifetime, or for
-simplicity just do it periodically as part of the background scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing));
-
-### Control Plane Changes
-
-#### Store generations for attaching tenants
-
- The `Project` table must store the generation number for use when
-  attaching the tenant to a new pageserver.
- The `/v1/tenant/:tenant_id/attach` pageserver API will require the generation number,
-  which the control plane can supply by simply incrementing the `Project`'s
-  generation number each time the tenant is attached to a different server: the same database
-  transaction that changes the assigned pageserver should also change the generation number.
-
-#### Generation API
-
-This section describes an API that could be provided directly by the control plane,
-or built as a separate microservice. In earlier parts of the RFC, when we
-discuss the control plane providing generation numbers, we are referring to this API.
-
-The API endpoints used by the pageserver to acquire and validate generation
-numbers are quite simple, and only require access to some persistent and
-linearizable storage (such as a database).
-
-Building this into the control plane is proposed as a least-effort option to exploit existing infrastructure and implement generation number issuance in the same transaction that mandates it (i.e., the transaction that updates the `Project` assignment to another pageserver).
-However, this is not mandatory: this "Generation Number Issuer" could
-be built as a microservice. In practice, we will write such a miniature service
-anyway, to enable E2E pageserver/compute testing without control plane.
-
-The endpoints required by pageservers are:
-
-##### `/re-attach`
-
- Request: `{node_id: <u32>}`
- Response:
-  - 200 `{tenants: [{id: <TenantId>, gen: <u32>}]}`
-  - 404: unknown node_id
-  - (Future: 429: flapping detected, perhaps nodes are fighting for the same node ID,
-    or perhaps this node was in a retry loop)
-  - (On unknown tenants, omit tenant from `tenants` array)
- Server behavior: query database for which tenants should be attached to this pageserver.
-  - for each tenant that should be attached, increment the attachment generation and
-    include the new generation in the response
- Client behavior:
-  - for all tenants in the response, activate with the new generation number
-  - for any local disk content _not_ referenced in the response, act as if we
-    had been asked to detach it (i.e. delete local files)
-
-**Note** the `node_id` in this request will change in future if we move to ephemeral
-node IDs, to be replaced with some correlation ID that helps the control plane realize
-if a process is running with the same storage as a previous pageserver process (e.g.
-we might use EC2 instance ID, or we might just write some UUID to the disk the first
-time we use it)
-
-##### `/validate`
-
- Request: `{'tenants': [{tenant: <tenant id>, attach_gen: <gen>}, ...]}`
- Response:
-  - 200 `{'tenants': [{tenant: <tenant id>, status: <bool>}...]}`
-  - (On unknown tenants, omit tenant from `tenants` array)
- Purpose: enable the pageserver to discover for the given attachments whether they are still the latest.
- Server behavior: this is a read-only operation: simply compare the generations in the request with
-  the generations known to the server, and set status to `true` if they match.
- Client behavior: clients must not do deletions within a tenant's remote data until they have
-  received a response indicating the generation they hold for the attachment is current.
-
-#### Use of `/load` and `/ignore` APIs
-
-Because the pageserver will be changed to only attach tenants on startup
-based on the control plane's response to a `/re-attach` request, the load/ignore
-APIs no longer make sense in their current form.
-
-The `/load` API becomes functionally equivalent to attach, and will be removed:
-any location that used `/load` before should just attach instead.
-
-The `/ignore` API is equivalent to detaching, but without deleting local files.
-
-### Timeline/Branch creation & deletion
-
-All of the previous arguments for safety have described operations within
-a timeline, where we may describe a sequence that includes updates to
-index_part.json, and where reads and writes are coming from a postgres
-endpoint (writes via the safekeeper).
-
-Creating or destroying a timeline is a bit different, because writes
-are coming from the control plane.
-
-We must be safe against scenarios such as:
-
- A tenant is attached to pageserver B while pageserver A is
-  in the middle of servicing an RPC from the control plane to
-  create or delete a tenant.
- A pageserver A has been sent a timeline creation request
-  but becomes unresponsive. The tenant is attached to a
-  different pageserver B, and the timeline creation request
-  is sent there too.
-
-#### Timeline Creation
-
-If some very slow node tries to do a timeline creation _after_
-a more recent generation node has already created the timeline
-and written some data into it, that must not cause harm. This
-is provided in timeline creations by the way all the objects
-within the timeline's remote path include a generation suffix:
-a slow node in an old generation that attempts to "create" a timeline
-that already exists will just emit an index_part.json with
-an old generation suffix.
-
-Timeline IDs are never reused, so we don't have
-to worry about the case of create/delete/create cycles. If they
-were re-used during a disaster recovery "un-delete" of a timeline,
-that special case can be handled by calling out to all available pageservers
-to check that they return 404 for the timeline, and to flush their
-deletion queues in case they had any deletions pending from the
-timeline.
-
-The above makes it safe for control plane to change the assignment of
-tenant to pageserver in control plane while a timeline creation is ongoing.
-The reason is that the creation request against the new assigned pageserver
-uses a new generation number. However, care must be taken by control plane
-to ensure that a "timeline creation successful" response from some pageserver
-is checked for the pageserver's generation for that timeline's tenant still being the latest.
-If it is not the latest, the response does not constitute a successful timeline creation.
-It is acceptable to discard such responses, the scrubber will clean up the S3 state.
-It is better to issue a timeline deletion request to the stale attachment.
-
-#### Timeline Deletion
-
-Tenant/timeline deletion operations are exempt from generation validation
-on deletes, and therefore don't have to go through the same deletion
-queue as GC/compaction layer deletions. This is because once a
-delete is issued by the control plane, it is a promise that the
-control plane will keep trying until the deletion is done, so even stale
-pageservers are permitted to go ahead and delete the objects.
-
-The implications of this for control plane are:
-
- During timeline/tenant deletion, the control plane must wait for the deletion to
-  be truly complete (status 404) and also handle the case where the pageserver
-  becomes unavailable, either by waiting for a replacement with the same node_id,
-  or by re-attaching the tenant elsewhere.
-
- The control plane must persist its intent to delete
-  a timeline/tenant before issuing any RPCs, and then once it starts, it must
-  keep retrying until the tenant/timeline is gone. This is already handled
-  by using a persistent `Operation` record that is retried indefinitely.
-
-Timeline deletion may result in a special kind of object leak, where
-the latest generation attachment completes a deletion (including erasing
-all objects in the timeline path), but some slow/partitioned node is
-writing into the timeline path with a stale generation number. This would
-not be caught by any per-timeline scrubbing (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)), since scrubbing happens on the
-attached pageserver, and once the timeline is deleted it isn't attached anywhere.
-This scenario should be pretty rare, and the control plane can make it even
-rarer by ensuring that if a tenant is in a multi-attached state (e.g. during
-migration), we wait for that to complete before processing the deletion. Beyond
-that, we may implement some other top-level scrub of timelines in
-an external tool, to identify any tenant/timeline paths that are not found
-in the control plane database.
-
-#### Examples
-
- Deletion, node restarts partway through:
-  - By the time we returned 202, we have written a remote delete marker
-  - Any subsequent incarnation of the same node_id will see the remote
-    delete marker and continue to process the deletion
-  - If the original pageserver is lost permanently and no replacement
-    with the same node_id is available, then the control plane must recover
-    by re-attaching the tenant to a different node.
- Creation, node becomes unresponsive partway through.
-  - Control plane will see HTTP request timeout, keep re-issuing
-    request to whoever is the latest attachment point for the tenant
-    until it succeeds.
-  - Stale nodes may be trying to execute timeline creation: they will
-    write out index_part.json files with
-    stale attachment generation: these will be eventually cleaned up
-    by the same mechanism as other old indices.
-
-### Unsafe case on badly behaved infrastructure
-
-This section is only relevant if running on a different environment
-than EC2 machines with ephemeral disks.
-
-If we ever run pageservers on infrastructure that might transparently restart
-a pageserver while leaving an old process running (e.g. a VM gets rescheduled
-without the old one being fenced), then there is a risk of corruption, when
-the control plane attaches the tenant, as follows:
-
- If the control plane sends an `/attach` request to node A, then node A dies
-  and is replaced, and the control plane retries the request without
-  incrementing that attachment ID, then it could end up with two physical nodes
-  both using the same generation number.
- This is not an issue when using EC2 instances with ephemeral storage, as long
-  as the control plane never re-uses a node ID, but it would need re-examining
-  if running on different infrastructure.
- To robustly protect against this class of issue, we would either:
-  - add a "node generation" to distinguish between different processes holding the
-    same node_id.
-  - or, dispense with static node_id entirely and issue an ephemeral ID to each
-    pageserver process when it starts.
-
-## Implementation Part 2: Optimizations
-
-### Persistent deletion queue
-
-Between writing out a new index_part.json that doesn't reference an object,
-and executing the deletion, an object passes through a window where it is
-only referenced in memory, and could be leaked if the pageserver is stopped
-uncleanly. That introduces conflicting incentives: on the one hand, we would
-like to delay and batch deletions to
-1. minimize the cost of the mandatory validation calls to control plane, and
-2. minimize cost for DeleteObjects requests.
-On the other hand we would also like to minimize leakage by executing
-deletions promptly.
-
-To resolve this, we may make the deletion queue persistent
-and then execute the queued deletions in the background at a later time.
-
-_Note: The deletion queue's reason for existence is optimization rather than correctness,
-so there is a lot of flexibility in exactly how it should work,
-as long as it obeys the rule to validate generations before executing deletions,
-so the following details are not essential to the overall RFC._
-
-#### Scope
-
-The deletion queue will be global per pageserver, not per-tenant. There
-are several reasons for this choice:
-
- Use the queue as a central point to coalesce validation requests to the
-  control plane: this avoids individual `Timeline` objects ever touching
-  the control plane API, and avoids them having to know the rules about
-  validating deletions. This separation of concerns will avoid burdening
-  the already many-LoC `Timeline` type with even more responsibility.
- Decouple the deletion queue from Tenant attachment lifetime: we may
-  "hibernate" an inactive tenant by tearing down its `Tenant`/`Timeline`
-  objects in the pageserver, without having to wait for deletions to be done.
- Amortize the cost of I/O for the persistent queue, instead of having many
-  tiny queues.
- Coalesce deletions into a smaller number of larger DeleteObjects calls
-
-Because of the cost of doing I/O for persistence, and the desire to coalesce
-generation validation requests across tenants, and coalesce deletions into
-larger DeleteObjects requests, there will be one deletion queue per pageserver
-rather than one per tenant. This has the added benefit that when deactivating
-a tenant, we do not have to drain their deletion queue: deletions can proceed
-for a tenant whose main `Tenant` object has been torn down.
-
-#### Flow of deletion
-
-The flow of a deletion becomes:
-
-1. Need for deletion of an object (=> layer file) is identified.
-2. Unlink the object from all the places that reference it (=> `index_part.json`).
-3. Enqueue the deletion to a persistent queue.
-   Each entry is `tenant_id, attachment_generation, S3 key`.
-4. Validate & execute in batches:
-  4.1 For a batch of entries, call into control plane.
-  4.2 For the subset of entries that passed validation, execute a `DeleteObjects` S3 DELETE request for their S3 keys.
-
-As outlined in the Part 1 on correctness, it is critical that deletions are only
-executed once the key is not referenced anywhere in S3.
-This property is obviously upheld by the scheme above.
-
-#### We Accept Object Leakage In Acceptable Circumstances
-
-If we crash in the flow above between (2) and (3), we lose track of the unreferenced object.
-Further, enqueuing a single entry to the persistent queue may not be durable immediately, in order to amortize the cost of flushing to disk.
-This is acceptable for now, it can be caught by [the scrubber](#cleaning-up-orphan-objects-scrubbing).
-
-There are various measures we can take to improve this in the future.
-1. Cap amount of time until enqueued entry becomes durable (timeout for flush-to-disk)
-2. Proactively flush:
-    - On graceful shutdown, as we anticipate that some or
-      all of our attachments may be re-assigned while we are offline.
-    - On tenant detach.
-3. For each entry, keep track of whether it has passed (2).
-   Only admit entries to (4) once they have passed (2).
-   This requires re-writing / two queue entries (intent, commit) per deletion.
-
-The important take-away with any of the above is that it's not
-disastrous to leak objects in exceptional circumstances.
-
-#### Operations that may skip the queue
-
-Deletions of an entire timeline are [exempt](#Timeline-Deletion) from generation number validation. Once the
-control plane sends the deletion request, there is no requirement to retain the readability
-of any data within the timeline, and all objects within the timeline path may be deleted
-at any time from the control plane's deletion request onwards.
-
-Since deletions of smaller timelines won't have enough objects to compose a full sized
-DeleteObjects request, it is still useful to send these through the last part of the
-deletion pipeline to coalesce with other executing deletions: to enable this, the
-deletion queue should expose two input channels: one for deletions that must be
-processed in a generation-aware way, and a fast path for timeline deletions, where
-that fast path may skip validation and the persistent queue.
-
-### Cleaning up orphan objects (scrubbing)
-
-An orphan object is any object which is no longer referenced by a running node or by metadata.
-
-Examples of how orphan objects arise:
-
- A node PUTs a layer object, then crashes before it writes the
-  index_part.json that references that layer.
- A stale node carries on running for some time, and writes out an unbounded number of
-  objects while it believes itself to be the rightful writer for a tenant.
- A pageserver crashes between un-linking an object from the index, and persisting
-  the object to its deletion queue.
-
-Orphan objects are functionally harmless, but have a small cost due to S3 capacity consumed. We
-may clean them up at some time in the future, by doing a ListObjectsv2 operation and cross
-referencing with the latest metadata to identify objects which are not referenced.
-
-Scrubbing will be done only by an attached pageserver (not some third party process), and deletions requested during scrub will go through the same
-validation as all other deletions: the attachment generation must be
-fresh. This avoids the possibility of a stale pageserver incorrectly
-thinking that an object written by a newer generation is stale, and deleting
-it.
-
-It is not strictly necessary that scrubbing be done by an attached
-pageserver: it could also be done externally. However, an external
-scrubber would still require the same validation procedure that
-a pageserver's deletion queue performs, before actually erasing
-objects.
-
-## Operational impact
-
-### Availability
-
-Coordination of generation numbers via the control plane introduces a dependency for certain
-operations:
-
-1. Starting new pageservers (or activating pageservers after a restart)
-2. Executing enqueued deletions
-3. Advertising updated `remote_consistent_lsn` to enable WAL trimming
-
-Item 1. would mean that some in-place restarts that previously would have resumed service even if the control plane were
-unavailable, will now not resume service to users until the control plane is available. We could
-avoid this by having a timeout on communication with the control plane, and after some timeout,
-resume service with the previous generation numbers (assuming this was persisted to disk). However,
-this is unlikely to be needed as the control plane is already an essential & highly available component. Also, having a node re-use an old generation number would complicate
-reasoning about the system, as it would break the invariant that a generation number uniquely identifies
-a tenant's attachment to a given pageserver _process_: it would merely identify the tenant's attachment
-to the pageserver _machine_ or its _on-disk-state_.
-
-Item 2. is a non-issue operationally: it's harmless to delay deletions, the only impact of objects pending deletion is
-the S3 capacity cost.
-
-Item 3. could be an issue if safekeepers are low on disk space and the control plane is unavailable for a long time. If this became an issue,
-we could adjust the safekeeper to delete segments from local disk sooner, as soon as they're uploaded to S3, rather than waiting for
-remote_consistent_lsn to advance.
-
-For a managed service, the general approach should be to make sure we are monitoring & respond fast enough
-that control plane outages are bounded in time.
-
-There is also the fact that control plane runs in a single region.
-The latency for distant regions is not a big concern for us because all request types added by this RFC are either infrequent or not in the way of the data path.
-However, we lose region isolation for the operations listed above.
-The ongoing work to split console and control will give us per-region control plane, and all operations in this RFC can be handled by these per-region control planes.
-With that in mind, we accept the trade-offs outlined in this paragraph.
-
-We will also implement an "escape hatch" config for generation numbers, where in a major disaster outage,
-we may manually run pageservers with a hand-selected generation number, so that we can bring them online
-independently of a control plane.
-
-### Rollout
-
-Although there is coupling between components, we may deploy most of the new data plane components
-independently of the control plane: initially they can just use a static generation number.
-
-#### Phase 1
-
-The pageserver is deployed with some special config to:
-
- Always act like everything is generation 1 and do not wait for a control plane issued generation on attach
- Skip the places in deletion and remote_consistent_lsn updates where we would call into control plane
-
-#### Phase 2
-
-The control plane changes are deployed: control plane will now track and increment generation numbers.
-
-#### Phase 3
-
-The pageserver is deployed with its control-plane-dependent changes enabled: it will now require
-the control plane to service re-attach requests on startup, and handle generation
-validation requests.
-
-### On-disk backward compatibility
-
-Backward compatibility with existing data is straightforward:
-
- When reading the index, we may assume that any layer whose metadata doesn't include
-  generations will have a path without generation suffix.
- When locating the index file on attachment, we may use the "fallback" listing path
-  and if there is only an index without generation suffix, that is the one we load.
-
-It is not necessary to re-write existing layers: even new index files will be able
-to represent generation-less layers.
-
-### On-disk forward compatibility
-
-We will do a two phase rollout, probably over multiple releases because we will naturally
-have some of the read-side code ready before the overall functionality is ready:
-
-1. Deploy pageservers which understand the new index format and generation suffixes
-   in keys, but do not write objects with generation numbers in the keys.
-2. Deploy pageservers that write objects with generation numbers in the keys.
-
-Old pageservers will be oblivious to generation numbers. That means that they can't
-read objects with generation numbers in the name. This is why the
-first step must deploy the ability to read, before the second step
-starts writing them.
-
-# Frequently Asked Questions
-
-## Why a generation _suffix_ rather than _prefix_?
-
-The choice is motivated by object listing, since one can list by prefix but not
-suffix.
-
-In [finding remote indices](#finding-the-remote-indices-for-timelines), we rely
-on being able to do a prefix listing for `<tenant>/<timeline>/index_part.json*`.
-That relies on the prefix listing.
-
-The converse case of using a generation prefix and listing by generation is
-not needed: one could imagine listing by generation while scrubbing (so that
-a particular generation's layers could be scrubbed), but this is not part
-of normal operations, and the [scrubber](#cleaning-up-orphan-objects-scrubbing) probably won't work that way anyway.
-
-## Wouldn't it be simpler to have a separate deletion queue per timeline?
-
-Functionally speaking, we could. That's how RemoteTimelineClient currently works,
-but this approach does not map well to a long-lived persistent queue with
-generation validation.
-
-Anything we do per-timeline generates tiny random I/O, on a pageserver with
-tens of thousands of timelines operating: to be ready for high scale, we should:
-
- A) Amortize costs where we can (e.g. a shared deletion queue)
- B) Expect to put tenants into a quiescent state while they're not
-  busy: i.e. we shouldn't keep a tenant alive to service its deletion queue.
-
-This was discussed in the [scope](#scope) part of the deletion queue section.
-
-# Appendix A: Examples of use in high availability/failover
-
-The generation numbers proposed in this RFC are adaptable to a variety of different
-failover scenarios and models. The sections below sketch how they would work in practice.
-
-### In-place restart of a pageserver
-
-"In-place" here means that the restart is done before any other element in the system
-has taken action in response to the node being down.
-
- After restart, the node issues a re-attach request to the control plane, and
-  receives new generation numbers for all its attached tenants.
- Tenants may be activated with the generation number in the re-attach response.
- If any of its attachments were in fact stale (i.e. had been reassigned to another
-  node while this node was offline), then
-  - the re-attach response will inform the pageserver of this
-    by _not_ incrementing the generation for that attachment.
-  - This will implicitly block deletions in the tenant, but as an optimization
-    the pageserver should also proactively stop doing S3 uploads when it notices this stale-generation state.
-  - The control plane is expected to eventually detach this tenant from the
-    pageserver.
-
-If the control plane does not include a tenant in the re-attach response,
-but there is still local state for the tenant in the filesystem, the pageserver
-deletes the local state in response and does not load/activate the tenant.
-See the [earlier section on pageserver startup](#pageserver-attachstartup-changes) for details.
-Control plane can use this mechanism to clean up a pageserver that has been
-down for so long that all its tenants were migrated away before it came back
-up again and asked for re-attach.
-
-### Failure of a pageserver
-
-In this context, read "failure" as the most ambiguous possible case, where
-a pageserver is unavailable to clients and control plane, but may still be executing and talking
-to S3.
-
-#### Case A: re-attachment to other nodes
-
-1. Let's say node 0 becomes unresponsive in a cluster of three nodes 0, 1, 2.
-2. Some external mechanism notices that the node is unavailable and initiates
-   movement of all tenants attached to that node to a different node according
-   to some distribution rule.
-   In this example, it would mean incrementing the generation
-   of all tenants that were attached to node 0, as each tenant's assigned pageserver changes.
-3. A tenant which is now attached to node 1 will _also_ still be attached to node
-   0, from the perspective of node 0. Node 0 will still be using its old generation,
-   node 1 will be using a newer generation.
-4. S3 writes will continue from nodes 0 and 1: there will be an index_part.json-00000001
-   _and_ an index_part.json-00000002. Objects written under the old suffix
-   after the new attachment was created do not matter from the rest of the system's
-   perspective: the endpoints are reading from the new attachment location. Objects
-   written by node 0 are just garbage that can be cleaned up at leisure. Node 0 will
-   not do any deletions because it can't synchronize with control plane, or if it could,
-   its deletion queue processing would get errors for the validation requests.
-
-#### Case B: direct node replacement with same node_id and drive
-
-This is the scenario we would experience if running pageservers in some dynamic
-VM/container environment that would auto-replace a given node_id when it became
-unresponsive, with the node's storage supplied by some network block device
-that is attached to the replacement VM/container.
-
-1. Let's say node 0 fails, and there may be some other peers but they aren't relevant.
-2. Some external mechanism notices that the node is unavailable, and creates
-   a "new node 0" (Node 0b) which is a physically separate server. The original node 0
-   (Node 0a) may still be running, because we do not assume the environment fences nodes.
-3. On startup, node 0b re-attaches and gets higher generation numbers for
-   all tenants.
-4. S3 writes continue from nodes 0a and 0b, but the writes do not collide due to different
-   generation in the suffix, and the writes from node 0a are not visible to the rest
-   of the system because endpoints are reading only from node 0b.
-
-# Appendix B: interoperability with other features
-
-## Sharded Keyspace
-
-The design in this RFC maps neatly to a sharded keyspace design where subsets of the key space
-for a tenant are assigned to different pageservers:
-
- the "unit of work" for attachments becomes something like a TenantShard rather than a Tenant
- TenantShards get generation numbers just as Tenants do.
- Write workload (ingest, compaction) for a tenant is spread out across pageservers via
-  TenantShards, but each TenantShard still has exactly one valid writer at a time.
-
-## Read replicas
-
-_This section is about a passive reader of S3 pageserver state, not a postgres
-read replica_
-
-For historical reads to LSNs below the remote persistent LSN, any node may act as a reader at any
-time: remote data is logically immutable data, and the use of deferred deletion in this RFC helps
-mitigate the fact that remote data is not _physically_ immutable (i.e. the actual data for a given
-page moves around as compaction happens).
-
-A read replica needs to be aware of generations in remote data in order to read the latest
-metadata (find the index_part.json with the latest suffix). It may either query this
-from the control plane, or find it with a ListObjectsv2 request.
-
-## Seamless migration
-
-To make tenant migration totally seamless, we will probably want to intentionally double-attach
-a tenant briefly, serving reads from the old node while waiting for the new node to be ready.
-
-This RFC enables that double-attachment: two nodes may be attached at the same time, with the migration destination
-having a higher generation number. The old node will be able to ingest and serve reads, but not
-do any deletes. The new node's attachment must also avoid deleting layers that the old node may
-still use. A new piece of state
-will be needed for this in the control plane's definition of an attachment.
-
-## Warm secondary locations
-
-To enable faster tenant movement after a pageserver is lost, we will probably want to spend some
-disk capacity on keeping standby locations populated with local disk data.
-
-There's no conflict between this RFC and that: implementing warm secondary locations on a per-tenant basis
-would be a separate change to the control plane to store standby location(s) for a tenant. Because
-the standbys do not write to S3, they do not need to be assigned generation numbers. When a tenant is
-re-attached to a standby location, that would increment the tenant attachment generation and this
-would work the same as any other attachment change, but with a warm cache.
-
-## Ephemeral node IDs
-
-This RFC intentionally avoids changing anything fundamental about how pageservers are identified
-and registered with the control plane, to avoid coupling the implementation of pageserver split
-brain protection with more fundamental changes in the management of the pageservers.
-
-Moving to ephemeral node IDs would provide an extra layer of
-resilience in the system, as it would prevent the control plane
-accidentally attaching to two physical nodes with the same
-generation, if somehow there were two physical nodes with
-the same node IDs (currently we rely on EC2 guarantees to
-eliminate this scenario). With ephemeral node IDs, there would be
-no possibility of that happening, no matter the behavior of
-underlying infrastructure.
-
-Nothing fundamental in the pageserver's handling of generations needs to change to handle ephemeral node IDs, since we hardly use the
-`node_id` anywhere. The `/re-attach` API would be extended
-to enable the pageserver to obtain its ephemeral ID, and provide
-some correlation identifier (e.g. EC2 instance ID), to help the
-control plane re-attach tenants to the same physical server that
-previously had them attached.
--- a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md
+++ b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md
@@ -1,281 +0,0 @@
-
-# Crash-Consistent Layer Map Updates By Leveraging `index_part.json`
-
-* Created on: Aug 23, 2023
-* Author: Christian Schwarz
-
-## Summary
-
-This RFC describes a simple scheme to make layer map updates crash consistent by leveraging the `index_part.json` in remote storage.
-Without such a mechanism, crashes can induce certain edge cases in which broadly held assumptions about system invariants don't hold.
-
-## Motivation
-
-### Background
-
-We can currently easily make complex, atomic updates to the layer map by means of an RwLock.
-If we crash or restart pageserver, we reconstruct the layer map from:
-1. local timeline directory contents
-2. remote `index_part.json` contents.
-
-The function that is responsible for this is called `Timeline::load_layer_map()`.
-The reconciliation process's behavior is the following:
-* local-only files will become part of the layer map as local-only layers and rescheduled for upload
-* For a file name that, by its name, is present locally and in the remote `index_part.json`, but where the local file has a different size (future: checksum) than the remote file, we will delete the local file and leave the remote file as a `RemoteLayer` in the layer map.
-
-### The Problem
-
-There are cases where we need to make an atomic update to the layer map that involves **more than one layer**.
-The best example is compaction, where we need to insert the L1 layers generated from the L0 layers, and remove the L0 layers.
-As stated above, making the update to the layer map in an atomic way is trivial.
-But, there is no system call API to make an atomic update to a directory that involves more than one file rename and deletion.
-Currently, we issue the system calls one by one and hope we don't crash.
-
-What happens if we crash and restart in the middle of that system call sequence?
-We will reconstruct the layer map according to the reconciliation process, taking as input whatever transitory state the timeline directory ended up in.
-
-We cannot roll back or complete the timeline directory update during which we crashed, because we keep no record of the changes we plan to make.
-
-### Problem's Implications For Compaction
-
-The implications of the above are primarily problematic for compaction.
-Specifically, the part of it that compacts L0 layers into L1 layers.
-
-Remember that compaction takes a set of L0 layers and reshuffles the delta records in them into L1 layer files.
-Once the L1 layer files are written to disk, it atomically removes the L0 layers from the layer map and adds the L1 layers to the layer map.
-It then deletes the L0 layers locally, and schedules an upload of the L1 layers and an updated index part.
-
-If we crash before deleting L0s, but after writing out L1s, the next compaction after restart will re-digest the L0s and produce new L1s.
-This means the compaction after restart will **overwrite** the previously written L1s.
-Currently we also schedule an S3 upload of the overwritten L1.
-
-If the compaction algorithm doesn't change between the two compaction runs, is deterministic, and uses the same set of L0s as input, then the second run will produce identical L1s and the overwrites will go unnoticed.
-
-*However*:
-1. the file size of the overwritten L1s may not be identical, and
-2. the bit pattern of the overwritten L1s may not be identical, and,
-3. in the future, we may want to make the compaction code non-deterministic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite
-
-The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted).
-
-For example, if an unresponsive node A becomes active again after control plane has relocated the tenant to a new node B, the node A may overwrite some L1s.
-But node B based its world view on the version of node A's `index_part.json` from _before_ the overwrite.
-That earlier `index_part.json` contained the file size of the pre-overwrite L1.
-If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1.
-Effectively, the data in the L1 has become inaccessible to node B.
-If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same problem.
-
-If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems.
-
-In case of (1) and (2), where we know that the logical content of the layers is still the same, we can recover by manually patching the `index_part.json` of the new node to the overwritten L1's file size / checksum.
-
-But if (3) ever happens, the logical content may be different, and, we could have truly lost data.
-
-Given the above considerations, we should avoid making correctness of split-brain protection dependent on overwrites preserving _logical_ layer file contents.
-**It is a much cleaner separation of concerns to require that layer files are truly immutable in S3, i.e., PUT once and then only DELETEd, never overwritten (overPUTted).**
-
-## Design
-
-Instead of reconciling a layer map from local timeline directory contents and remote index part, this RFC proposes to view the remote index part as authoritative during timeline load.
-Local layer files will be recognized if they match what's listed in remote index part, and removed otherwise.
-
-During **timeline load**, the only thing that matters is the remote index part content.
-Essentially, timeline load becomes much like attach, except we don't need to prefix-list the remote timelines.
-The local timeline dir's `metadata` file does not matter.
-The layer files in the local timeline dir are seen as a nice-to-have cache of layer files that are in the remote index part.
-Any layer files in the local timeline dir that aren't in the remote index part are removed during startup.
-The `Timeline::load_layer_map()` no longer "merges" local timeline dir contents with the remote index part.
-Instead, it treats the remote index part as the authoritative layer map.
-If the local timeline dir contains a layer that is in the remote index part, that's nice, and we'll re-use it if file size (and in the future, checksum) match what's stated in the index part.
-If it doesn't match, we remove the file from the local timeline dir.
-
-After load, **at runtime**, nothing changes compared to what we did before this RFC.
-The procedure for single- and multi-object changes is reproduced here for reference:
-* For any new layers that the change adds:
-  * Write them to a temporary location.
-  * While holding layer map lock:
-    * Move them to the final location.
-    * Insert into layer map.
-* Make the S3 changes.
-  We won't reproduce the remote timeline client method calls here because these are subject to change.
-  Instead we reproduce the sequence of s3 changes that must result for a given single-/multi-object change:
-    * PUT layer files inserted by the change.
-    * PUT an index part that has insertions and deletions of the change.
-    * DELETE the layer files that are deleted by the change.
-
-Note that it is safe for the DELETE to be deferred arbitrarily.
-* If it never happens, we leak the object, but, that's not a correctness concern.
-* As of #4938, we don't schedule the remote timeline client operation for deletion immediately, but, only when we drop the `LayerInner`.
-* With the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919), the deletions will be written to deletion queue for processing when it's safe to do so (see the RFC for details).
-
-## How This Solves The Problem
-
-If we crash before we've finished the S3 changes, then timeline load will reset layer map to the state that's in the S3 index part.
-The S3 change sequence above is obviously crash-consistent.
-If we crash before the index part PUT, then we leak the inserted layer files to S3.
-If we crash after the index part PUT, we leak the to-be-DELETEd layer files to S3.
-Leaking is fine, it's a pre-existing condition and not addressed in this RFC.
-
-Multi-object changes that previously created and removed files in timeline dir are now atomic because the layer map updates are atomic and crash consistent:
-* atomic layer map update at runtime, currently by using an RwLock in write mode
-* atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic
-* local timeline dir state:
-  * irrelevant for layer map content => irrelevant for atomic updates / crash consistency
-  * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads needed for them
-  * if we crash before index part PUT, local layer files will be deleted
-
-## Trade-Offs
-
-### Fundamental
-
-If we crash before finishing the index part PUT, we lose all the work that hasn't reached the S3 `index_part.json`:
-* wal ingest: we lose not-yet-uploaded L0s; load on the **safekeepers** + work for pageserver
-* compaction: we lose the entire compaction iteration work; need to re-do it again
-* gc: no change to what we have today
-
-If the work is still deemed necessary after restart, the restarted pageserver will re-do this work.
-The amount of work to be re-done is capped by how far the S3 changes lag behind the local changes.
-Assuming upload queue allows for unlimited queue depth (that's what it does today), this means:
-* on-demand downloads that were needed to do the work: are likely still present, not lost
-* wal ingest: currently unbounded
-* L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O(sum(new L1 size))`
-  * Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M.
-  * In practice, most L0s are tiny due to the 10-minute `DEFAULT_CHECKPOINT_TIMEOUT`.
-* image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))`
-  * I have no intuition how expensive / long-running it is in reality.
-* gc: `update_gc_info` work (not substantial, AFAIK)
-
-To limit the amount of lost upload work, and ingest work, we can limit the upload queue depth (see suggestions in the next sub-section).
-However, to limit the amount of lost CPU work, we would need a way to make the compaction/image-layer-generation algorithms interruptible & resumable.
-We aren't there yet, the need for it is tracked by ([#4580](https://github.com/neondatabase/neon/issues/4580)).
-However, this RFC is not constraining the design space either.
-
-### Practical
-
-#### Pageserver Restarts
-
-Pageserver crashes are very rare; it would likely be acceptable to re-do the lost work in that case.
-However, regular pageserver restarts happen frequently, e.g., during weekly deploys.
-
-In general, pageserver restart faces the problem of tenants that "take too long" to shut down.
-They are a problem because other tenants that shut down quickly are unavailable while we wait for the slow tenants to shut down.
-We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file).
-A longer budget would expose tenants that are done early to a longer downtime.
-A short budget would risk throwing away more work that'd have to be re-done after restart.
-
-In the context of this RFC, killing the process would mean losing the work that hasn't made it to S3.
-We can mitigate this problem as follows:
-0. initially, by accepting that we need to do the work again
-1. short-term, introducing measures to cap the amount of in-flight work:
-
-   - cap upload queue length, use backpressure to slow down compaction
-   - disabling compaction/image-layer-generation X minutes before `systemctl restart pageserver`
-   - introducing a read-only shutdown state for tenants that are fast to shut down;
-     that state would be equivalent to the state of a tenant in hot standby / readonly mode.
-
-2. mid term, by not restarting pageserver in place, but using [*seamless tenant migration*](https://github.com/neondatabase/neon/pull/5029) to drain a pageserver's tenants before we restart it.
-
-#### `disk_consistent_lsn` can go backwards
-
-`disk_consistent_lsn` can go backwards across restarts if we crash before we've finished the index part PUT.
-Nobody should care about it, because the only thing that matters is `remote_consistent_lsn`.
-Compute certainly doesn't care about `disk_consistent_lsn`.
-
-
-## Side-Effects Of This Design
-
-* local `metadata` is basically reduced to a cache of which timelines exist for this tenant; i.e., we can avoid a `ListObjects` request for a tenant's timelines during tenant load.
-
-## Limitations
-
-Multi-object changes that span multiple timelines aren't covered by this RFC.
-That's fine because we currently don't need them, as evidenced by the absence
-of a Pageserver operation that holds multiple timelines' layer map lock at a time.
-
-## Impacted components
-
-Primarily pageservers.
-
-Safekeepers will experience more load when we need to re-ingest WAL because we've thrown away work.
-No changes to safekeepers are needed.
-
-## Alternatives considered
-
-### Alternative 1: WAL
-
-We could have a local WAL for timeline dir changes, as proposed here https://github.com/neondatabase/neon/issues/4418 and partially implemented here https://github.com/neondatabase/neon/pull/4422 .
-The WAL would be used to
-1. make multi-object changes atomic
-2. replace `reconcile_with_remote()` reconciliation: scheduling of layer upload would be part of WAL replay.
-
-The WAL is appealing in a local-first world, but, it's much more complex than the design described above:
-* New on-disk state to get right.
-* Forward- and backward-compatibility development costs in the future.
-
-### Alternative 2: Flow Everything Through `index_part.json`
-
-We could have gone to the other extreme and **only** update the layer map whenever we've PUT `index_part.json`.
-I.e., layer map would always be the last-persisted S3 state.
-That's axiomatically beautiful, not least because it fully separates the layer file production and consumption path (=> [layer file spreading proposal](https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=4)).
-And it might make hot standbys / read-only pageservers less of a special case in the future.
-
-But, I have some uncertainties with regard to WAL ingestion, because it needs to be able to do some reads for the logical size feedback to safekeepers.
-
-And it's silly that we wouldn't be able to use the results of compaction or image layer generation before we're done with the upload.
-
-Lastly, a temporarily clogged-up upload queue (e.g. S3 is down) shouldn't immediately render ingestion unavailable.
-
-### Alternative 3: Sequence Numbers For Layers
-
-Instead of what's proposed in this RFC, we could use unique numbers to identify layer files:
-
-```
-# before
-tenants/$tenant/timelines/$timeline/$key_and_lsn_range
-# after
-tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range
-```
-
-To guarantee uniqueness, the unique number is a sequence number, stored in `index_part.json`.
-
-This alternative does not solve atomic layer map updates.
-In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers.
-In fact, this alternative makes it worse because the data is now duplicated in the not-overwritten and overwritten L1 layer files.
-We'd need to write a deduplication pass that checks if perfectly overlapping layers have identical contents.
-
-However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC.
-
-So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3).
-But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more acute.
-The proposed design in this RFC addresses both.
-
-So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top.
-That way, we avoid a phase where the crash-during-compaction problem is acute.
-
-## Related issues
-
- https://github.com/neondatabase/neon/issues/4749
- https://github.com/neondatabase/neon/issues/4418
-  - https://github.com/neondatabase/neon/pull/4422
- https://github.com/neondatabase/neon/issues/5077
- https://github.com/neondatabase/neon/issues/4088
-  - (re)resolutions:
-    - https://github.com/neondatabase/neon/pull/4696
-    - https://github.com/neondatabase/neon/pull/4094
-      - https://neondb.slack.com/archives/C033QLM5P7D/p1682519017949719
-
-Note that the test case introduced in https://github.com/neondatabase/neon/pull/4696/files#diff-13114949d1deb49ae394405d4c49558adad91150ba8a34004133653a8a5aeb76 will produce L1s with the same logical content, but, as outlined in the last paragraph of the _Problem's Implications For Compaction_ section above, we don't want to make that assumption in order to fix the problem.
-
-
-## Implementation Plan
-
-1. Remove support for `remote_storage=None`, because we now rely on the existence of an index part.
-
-    - The nasty part here is to fix all the tests that fiddle with the local timeline directory.
-      Possibly they are just irrelevant with this change, but, each case will require inspection.
-
-2. Implement the design above.
-
-    - Initially, ship without the mitigations for restart and accept we will do some work twice.
-    - Measure the impact and implement one of the mitigations.
-
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -89,8 +89,6 @@ impl RemoteExtSpec {
        &self,
        ext_name: &str,
        is_library: bool,
-        build_tag: &str,
-        pg_major_version: &str,
    ) -> anyhow::Result<(String, RemotePath)> {
        let mut real_ext_name = ext_name;
        if is_library {
@@ -106,32 +104,11 @@ impl RemoteExtSpec {
                .ok_or(anyhow::anyhow!("library {} is not found", lib_raw_name))?;
        }

-        // Check if extension is present in public or custom.
-        // If not, then it is not allowed to be used by this compute.
-        if let Some(public_extensions) = &self.public_extensions {
-            if !public_extensions.contains(&real_ext_name.to_string()) {
-                if let Some(custom_extensions) = &self.custom_extensions {
-                    if !custom_extensions.contains(&real_ext_name.to_string()) {
-                        return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
-                    }
-                }
-            }
-        }
-
        match self.extension_data.get(real_ext_name) {
-            Some(_ext_data) => {
-                // Construct the path to the extension archive
-                // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
-                //
-                // Keep it in sync with path generation in
-                // https://github.com/neondatabase/build-custom-extensions/tree/main
-                let archive_path_str =
-                    format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
-                Ok((
-                    real_ext_name.to_string(),
-                    RemotePath::from_string(&archive_path_str)?,
-                ))
-            }
+            Some(ext_data) => Ok((
+                real_ext_name.to_string(),
+                RemotePath::from_string(&ext_data.archive_path)?,
+            )),
            None => Err(anyhow::anyhow!(
                "real_ext_name {} is not found",
                real_ext_name
--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
@@ -1,52 +0,0 @@
-//! Types in this file are for pageserver's upward-facing API calls to the control plane,
-//! required for acquiring and validating tenant generation numbers.
-//!
-//! See docs/rfcs/025-generation-numbers.md
-
-use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
-use utils::id::{NodeId, TenantId};
-
-#[derive(Serialize, Deserialize)]
-pub struct ReAttachRequest {
-    pub node_id: NodeId,
-}
-
-#[serde_as]
-#[derive(Serialize, Deserialize)]
-pub struct ReAttachResponseTenant {
-    #[serde_as(as = "DisplayFromStr")]
-    pub id: TenantId,
-    pub generation: u32,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct ReAttachResponse {
-    pub tenants: Vec<ReAttachResponseTenant>,
-}
-
-#[serde_as]
-#[derive(Serialize, Deserialize)]
-pub struct ValidateRequestTenant {
-    #[serde_as(as = "DisplayFromStr")]
-    pub id: TenantId,
-    pub gen: u32,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct ValidateRequest {
-    pub tenants: Vec<ValidateRequestTenant>,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct ValidateResponse {
-    pub tenants: Vec<ValidateResponseTenant>,
-}
-
-#[serde_as]
-#[derive(Serialize, Deserialize)]
-pub struct ValidateResponseTenant {
-    #[serde_as(as = "DisplayFromStr")]
-    pub id: TenantId,
-    pub valid: bool,
-}
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,7 +1,6 @@
 use const_format::formatcp;

 /// Public API types
-pub mod control_api;
 pub mod models;
 pub mod reltag;

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -194,22 +194,10 @@ pub struct TimelineCreateRequest {
 pub struct TenantCreateRequest {
    #[serde_as(as = "DisplayFromStr")]
    pub new_tenant_id: TenantId,
-    #[serde(default)]
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub generation: Option<u32>,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

-#[serde_as]
-#[derive(Deserialize, Debug)]
-#[serde(deny_unknown_fields)]
-pub struct TenantLoadRequest {
-    #[serde(default)]
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub generation: Option<u32>,
-}
-
 impl std::ops::Deref for TenantCreateRequest {
    type Target = TenantConfig;

@@ -253,6 +241,15 @@ pub struct StatusResponse {
    pub id: NodeId,
 }

+impl TenantCreateRequest {
+    pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest {
+        TenantCreateRequest {
+            new_tenant_id,
+            config: TenantConfig::default(),
+        }
+    }
+}
+
 #[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
@@ -296,11 +293,9 @@ impl TenantConfigRequest {
    }
 }

-#[derive(Debug, Deserialize)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct TenantAttachRequest {
    pub config: TenantAttachConfig,
-    #[serde(default)]
-    pub generation: Option<u32>,
 }

 /// Newtype to enforce deny_unknown_fields on TenantConfig for
@@ -381,8 +376,6 @@ pub struct TimelineInfo {
    pub pg_version: u32,

    pub state: TimelineState,
-
-    pub walreceiver_status: String,
 }

 #[derive(Debug, Clone, Serialize)]
--- a/libs/postgres_ffi/README.md
+++ b/libs/postgres_ffi/README.md
@@ -10,11 +10,9 @@ should be auto-generated too, but that's a TODO.
 The PostgreSQL on-disk file format is not portable across different
 CPU architectures and operating systems. It is also subject to change
 in each major PostgreSQL version. Currently, this module supports
-PostgreSQL v14, v15 and v16: bindings and code that depends on them are
-version-specific.
-This code is organized in modules `postgres_ffi::v14`, `postgres_ffi::v15` and
-`postgres_ffi::v16`. Version independent code is explicitly exported into
-shared `postgres_ffi`.
+PostgreSQL v14 and v15: bindings and code that depends on them are version-specific.
+This code is organized in modules: `postgres_ffi::v14` and `postgres_ffi::v15`.
+Version-independent code is explicitly exported into shared `postgres_ffi`.


 TODO: Currently, there is also some code that deals with WAL records
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
        PathBuf::from("pg_install")
    };

-    for pg_version in &["v14", "v15", "v16"] {
+    for pg_version in &["v14", "v15"] {
        let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
        if pg_install_dir_versioned.is_relative() {
            let cwd = env::current_dir().context("Failed to get current_dir")?;
@@ -125,7 +125,6 @@ fn main() -> anyhow::Result<()> {
            .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
            .allowlist_type("PageHeaderData")
            .allowlist_type("DBState")
-            .allowlist_type("RelMapFile")
            // Because structs are used for serialization, tell bindgen to emit
            // explicit padding fields.
            .explicit_padding(true)
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -51,59 +51,11 @@ macro_rules! for_all_postgres_versions {
    ($macro:tt) => {
        $macro!(v14);
        $macro!(v15);
-        $macro!(v16);
    };
 }

 for_all_postgres_versions! { postgres_ffi }

-/// dispatch_pgversion
-///
-/// Run a code block in a context where the postgres_ffi bindings for a
-/// specific (supported) PostgreSQL version are `use`-ed in scope under the pgv
-/// identifier.
-/// If the provided pg_version is not supported, we panic!(), unless the
-/// optional third argument was provided (in which case that code will provide
-/// the default handling instead).
-///
-/// Use like
-///
-/// dispatch_pgversion!(my_pgversion, { pgv::constants::XLOG_DBASE_CREATE })
-/// dispatch_pgversion!(my_pgversion, pgv::constants::XLOG_DBASE_CREATE)
-///
-/// Other uses are for macro-internal purposes only and strictly unsupported.
-///
-#[macro_export]
-macro_rules! dispatch_pgversion {
-    ($version:expr, $code:expr) => {
-        dispatch_pgversion!($version, $code, panic!("Unknown PostgreSQL version {}", $version))
-    };
-    ($version:expr, $code:expr, $invalid_pgver_handling:expr) => {
-        dispatch_pgversion!(
-            $version => $code,
-            default = $invalid_pgver_handling,
-            pgversions = [
-                14 : v14,
-                15 : v15,
-                16 : v16,
-            ]
-        )
-    };
-    ($pgversion:expr => $code:expr,
-     default = $default:expr,
-     pgversions = [$($sv:literal : $vsv:ident),+ $(,)?]) => {
-        match ($pgversion) {
-            $($sv => {
-                use $crate::$vsv as pgv;
-                $code
-            },)+
-            _ => {
-                $default
-            }
-        }
-    };
-}
-
 pub mod pg_constants;
 pub mod relfile_utils;

@@ -138,7 +90,13 @@ pub use v14::xlog_utils::XLogFileName;
 pub use v14::bindings::DBState_DB_SHUTDOWNED;

 pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
-    dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info)))
+    match version {
+        14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0),
+        15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
+            || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0
+            || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0),
+        _ => anyhow::bail!("Unknown version {}", version),
+    }
 }

 pub fn generate_wal_segment(
@@ -149,11 +107,11 @@ pub fn generate_wal_segment(
 ) -> Result<Bytes, SerializeError> {
    assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE));

-    dispatch_pgversion!(
-        pg_version,
-        pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn),
-        Err(SerializeError::BadInput)
-    )
+    match pg_version {
+        14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
+        15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
+        _ => Err(SerializeError::BadInput),
+    }
 }

 pub fn generate_pg_control(
@@ -162,11 +120,11 @@ pub fn generate_pg_control(
    lsn: Lsn,
    pg_version: u32,
 ) -> anyhow::Result<(Bytes, u64)> {
-    dispatch_pgversion!(
-        pg_version,
-        pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
-        anyhow::bail!("Unknown version {}", pg_version)
-    )
+    match pg_version {
+        14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
+        15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
+        _ => anyhow::bail!("Unknown version {}", pg_version),
+    }
 }

 // PG timeline is always 1, changing it doesn't have any useful meaning in Neon.
@@ -238,6 +196,8 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
 }

 pub mod waldecoder {
+
+    use crate::{v14, v15};
    use bytes::{Buf, Bytes, BytesMut};
    use std::num::NonZeroU32;
    use thiserror::Error;
@@ -288,17 +248,22 @@ pub mod waldecoder {
        }

        pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
-            dispatch_pgversion!(
-                self.pg_version,
-                {
-                    use pgv::waldecoder_handler::WalStreamDecoderHandler;
+            match self.pg_version {
+                // This is a trick to support both versions simultaneously.
+                // See WalStreamDecoderHandler comments.
+                14 => {
+                    use self::v14::waldecoder_handler::WalStreamDecoderHandler;
                    self.poll_decode_internal()
-                },
-                Err(WalDecodeError {
+                }
+                15 => {
+                    use self::v15::waldecoder_handler::WalStreamDecoderHandler;
+                    self.poll_decode_internal()
+                }
+                _ => Err(WalDecodeError {
                    msg: format!("Unknown version {}", self.pg_version),
                    lsn: self.lsn,
-                })
-            )
+                }),
+            }
        }
    }
 }
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -163,20 +163,6 @@ pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
 pub const RM_LOGICALMSG_ID: u8 = 21;

-// from neon_rmgr.h
-pub const RM_NEON_ID: u8 = 134;
-
-pub const XLOG_NEON_HEAP_INIT_PAGE: u8 = 0x80;
-
-pub const XLOG_NEON_HEAP_INSERT: u8 = 0x00;
-pub const XLOG_NEON_HEAP_DELETE: u8 = 0x10;
-pub const XLOG_NEON_HEAP_UPDATE: u8 = 0x20;
-pub const XLOG_NEON_HEAP_HOT_UPDATE: u8 = 0x30;
-pub const XLOG_NEON_HEAP_LOCK: u8 = 0x40;
-pub const XLOG_NEON_HEAP_MULTI_INSERT: u8 = 0x50;
-
-pub const XLOG_NEON_HEAP_VISIBLE: u8 = 0x40;
-
 // from xlogreader.h
 pub const XLR_INFO_MASK: u8 = 0x0F;
 pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
--- a/libs/postgres_ffi/src/pg_constants_v14.rs
+++ b/libs/postgres_ffi/src/pg_constants_v14.rs
@@ -3,8 +3,3 @@ pub const XLOG_DBASE_DROP: u8 = 0x10;

 pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
 pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
-pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
-
-pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
-    (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0
-}
--- a/libs/postgres_ffi/src/pg_constants_v15.rs
+++ b/libs/postgres_ffi/src/pg_constants_v15.rs
@@ -1,18 +1,10 @@
 pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;

 pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
-pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
+pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00;
 pub const XLOG_DBASE_DROP: u8 = 0x20;

 pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
 pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
 pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
 pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
-
-pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
-
-pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
-    const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
-
-    (bimg_info & ANY_COMPRESS_FLAG) != 0
-}
--- a/libs/postgres_ffi/src/pg_constants_v16.rs
+++ b/libs/postgres_ffi/src/pg_constants_v16.rs
@@ -1,18 +0,0 @@
-pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;
-
-pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
-pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
-pub const XLOG_DBASE_DROP: u8 = 0x20;
-
-pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
-pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
-pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
-pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
-
-pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */
-
-pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
-    const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
-
-    (bimg_info & ANY_COMPRESS_FLAG) != 0
-}
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -49,9 +49,9 @@ impl Conf {
    pub fn pg_distrib_dir(&self) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

-        #[allow(clippy::manual_range_patterns)]
        match self.pg_version {
-            14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))),
+            14 => Ok(path.join(format!("v{}", self.pg_version))),
+            15 => Ok(path.join(format!("v{}", self.pg_version))),
            _ => bail!("Unsupported postgres version: {}", self.pg_version),
        }
    }
@@ -250,18 +250,11 @@ fn craft_internal<C: postgres::GenericClient>(
    let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
    let last_lsn = match last_lsn {
        None => client.pg_current_wal_insert_lsn()?,
-        Some(last_lsn) => {
-            let insert_lsn = client.pg_current_wal_insert_lsn()?;
-            match last_lsn.cmp(&insert_lsn) {
-                Ordering::Less => bail!(
-                    "Some records were inserted after the crafted WAL: {} vs {}",
-                    last_lsn,
-                    insert_lsn
-                ),
-                Ordering::Equal => last_lsn,
-                Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
-            }
-        }
+        Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
+            Ordering::Less => bail!("Some records were inserted after the crafted WAL"),
+            Ordering::Equal => last_lsn,
+            Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
+        },
    };
    if !intermediate_lsns.starts_with(&[initial_lsn]) {
        intermediate_lsns.insert(0, initial_lsn);
@@ -370,9 +363,8 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
        );
        ensure!(
            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
-            "XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
-            after_xlog_switch,
-            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
+            "XLOG_SWITCH message ended not on page boundary: {}",
+            after_xlog_switch
        );
        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
    }
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -959,7 +959,7 @@ mod tests {
        let make_params = |options| StartupMessageParams::new([("options", options)]);

        let params = StartupMessageParams::new([]);
-        assert!(params.options_escaped().is_none());
+        assert!(matches!(params.options_escaped(), None));

        let params = make_params("");
        assert!(split_options(&params).is_empty());
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -148,55 +148,21 @@ impl RemoteStorage for LocalFs {
            Some(folder) => folder.with_base(&self.storage_root),
            None => self.storage_root.clone(),
        };
-
-        // If we were given a directory, we may use it as our starting point.
-        // Otherwise, we must go up to the parent directory.  This is because
-        // S3 object list prefixes can be arbitrary strings, but when reading
-        // the local filesystem we need a directory to start calling read_dir on.
-        let mut initial_dir = full_path.clone();
-        match fs::metadata(full_path.clone()).await {
-            Ok(meta) => {
-                if !meta.is_dir() {
-                    // It's not a directory: strip back to the parent
-                    initial_dir.pop();
-                }
-            }
-            Err(e) if e.kind() == ErrorKind::NotFound => {
-                // It's not a file that exists: strip the prefix back to the parent directory
-                initial_dir.pop();
-            }
-            Err(e) => {
-                // Unexpected I/O error
-                anyhow::bail!(e)
-            }
-        }
-
-        // Note that PathBuf starts_with only considers full path segments, but
-        // object prefixes are arbitrary strings, so we need the strings for doing
-        // starts_with later.
-        let prefix = full_path.to_string_lossy();
-
        let mut files = vec![];
-        let mut directory_queue = vec![initial_dir.clone()];
+        let mut directory_queue = vec![full_path.clone()];
+
        while let Some(cur_folder) = directory_queue.pop() {
            let mut entries = fs::read_dir(cur_folder.clone()).await?;
            while let Some(entry) = entries.next_entry().await? {
                let file_name: PathBuf = entry.file_name().into();
                let full_file_name = cur_folder.clone().join(&file_name);
-                if full_file_name
-                    .to_str()
-                    .map(|s| s.starts_with(prefix.as_ref()))
-                    .unwrap_or(false)
-                {
-                    let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
-                    files.push(file_remote_path.clone());
-                    if full_file_name.is_dir() {
-                        directory_queue.push(full_file_name);
-                    }
+                let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
+                files.push(file_remote_path.clone());
+                if full_file_name.is_dir() {
+                    directory_queue.push(full_file_name);
                }
            }
        }
-
        Ok(files)
    }

--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -573,7 +573,7 @@ mod tests {

    #[test]
    fn relative_path() {
-        let all_paths = ["", "some/path", "some/path/"];
+        let all_paths = vec!["", "some/path", "some/path/"];
        let all_paths: Vec<RemotePath> = all_paths
            .iter()
            .map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -31,8 +31,6 @@ fn lsn_invalid() -> Lsn {
 #[serde_as]
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct SkTimelineInfo {
-    /// Term.
-    pub term: Option<u64>,
    /// Term of the last entry.
    pub last_log_term: Option<u64>,
    /// LSN of the last record.
@@ -60,6 +58,4 @@ pub struct SkTimelineInfo {
    /// A connection string to use for WAL receiving.
    #[serde(default)]
    pub safekeeper_connstr: Option<String>,
-    #[serde(default)]
-    pub http_connstr: Option<String>,
 }
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -38,7 +38,6 @@ url.workspace = true
 uuid.workspace = true

 pq_proto.workspace = true
-postgres_connection.workspace = true
 metrics.workspace = true
 workspace_hack.workspace = true

--- a/libs/utils/scripts/restore_from_wal.sh
+++ b/libs/utils/scripts/restore_from_wal.sh
@@ -9,12 +9,11 @@ PORT=$4
 SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
 rm -fr "$DATA_DIR"
 env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
-echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
-echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
+echo port="$PORT" >> "$DATA_DIR"/postgresql.conf
 REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
 declare -i WAL_SIZE=$REDO_POS+114
-"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
-"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate
 cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
 cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
 for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -1,138 +0,0 @@
-use std::fmt::Debug;
-
-use serde::{Deserialize, Serialize};
-
-/// Tenant generations are used to provide split-brain safety and allow
-/// multiple pageservers to attach the same tenant concurrently.
-///
-/// See docs/rfcs/025-generation-numbers.md for detail on how generation
-/// numbers are used.
-#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
-pub enum Generation {
-    // Generations with this magic value will not add a suffix to S3 keys, and will not
-    // be included in persisted index_part.json.  This value is only to be used
-    // during migration from pre-generation metadata to generation-aware metadata,
-    // and should eventually go away.
-    //
-    // A special Generation is used rather than always wrapping Generation in an Option,
-    // so that code handling generations doesn't have to be aware of the legacy
-    // case everywhere it touches a generation.
-    None,
-    // Generations with this magic value may never be used to construct S3 keys:
-    // we will panic if someone tries to.  This is for Tenants in the "Broken" state,
-    // so that we can satisfy their constructor with a Generation without risking
-    // a code bug using it in an S3 write (broken tenants should never write)
-    Broken,
-    Valid(u32),
-}
-
-/// The Generation type represents a number associated with a Tenant, which
-/// increments every time the tenant is attached to a new pageserver, or
-/// an attached pageserver restarts.
-///
-/// It is included as a suffix in S3 keys, as a protection against split-brain
-/// scenarios where pageservers might otherwise issue conflicting writes to
-/// remote storage
-impl Generation {
-    /// Create a new Generation that represents a legacy key format with
-    /// no generation suffix
-    pub fn none() -> Self {
-        Self::None
-    }
-
-    // Create a new generation that will panic if you try to use get_suffix
-    pub fn broken() -> Self {
-        Self::Broken
-    }
-
-    pub fn new(v: u32) -> Self {
-        Self::Valid(v)
-    }
-
-    pub fn is_none(&self) -> bool {
-        matches!(self, Self::None)
-    }
-
-    #[track_caller]
-    pub fn get_suffix(&self) -> String {
-        match self {
-            Self::Valid(v) => {
-                format!("-{:08x}", v)
-            }
-            Self::None => "".into(),
-            Self::Broken => {
-                panic!("Tried to use a broken generation");
-            }
-        }
-    }
-
-    /// `suffix` is the part after "-" in a key
-    ///
-    /// Returns None if parsing was unsuccessful
-    pub fn parse_suffix(suffix: &str) -> Option<Generation> {
-        u32::from_str_radix(suffix, 16).map(Generation::new).ok()
-    }
-
-    #[track_caller]
-    pub fn previous(&self) -> Generation {
-        match self {
-            Self::Valid(n) => {
-                if *n == 0 {
-                    // Since a tenant may be upgraded from a pre-generations state, interpret the "previous" generation
-                    // to 0 as being "no generation".
-                    Self::None
-                } else {
-                    Self::Valid(n - 1)
-                }
-            }
-            Self::None => Self::None,
-            Self::Broken => panic!("Attempted to use a broken generation"),
-        }
-    }
-}
-
-impl Serialize for Generation {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if let Self::Valid(v) = self {
-            v.serialize(serializer)
-        } else {
-            // We should never be asked to serialize a None or Broken.  Structures
-            // that include an optional generation should convert None to an
-            // Option<Generation>::None
-            Err(serde::ser::Error::custom(
-                "Tried to serialize invalid generation ({self})",
-            ))
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for Generation {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        Ok(Self::Valid(u32::deserialize(deserializer)?))
-    }
-}
-
-// We intentionally do not implement Display for Generation, to reduce the
-// risk of a bug where the generation is used in a format!() string directly
-// instead of using get_suffix().
-impl Debug for Generation {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            Self::Valid(v) => {
-                write!(f, "{:08x}", v)
-            }
-            Self::None => {
-                write!(f, "<none>")
-            }
-            Self::Broken => {
-                write!(f, "<broken>")
-            }
-        }
-    }
-}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -27,9 +27,6 @@ pub mod id;
 // http endpoint utils
 pub mod http;

-// definition of the Generation type for pageserver attachment APIs
-pub mod generation;
-
 // common log initialisation routine
 pub mod logging;

@@ -61,8 +58,6 @@ pub mod serde_regex;

 pub mod pageserver_feedback;

-pub mod postgres_client;
-
 pub mod tracing_span_assert;

 pub mod rate_limit;
@@ -73,6 +68,8 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

+pub mod sync;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/postgres_client.rs
+++ b/libs/utils/src/postgres_client.rs
@@ -1,37 +0,0 @@
-//! Postgres client connection code common to other crates (safekeeper and
-//! pageserver) which depends on tenant/timeline ids and thus not fitting into
-//! postgres_connection crate.
-
-use anyhow::Context;
-use postgres_connection::{parse_host_port, PgConnectionConfig};
-
-use crate::id::TenantTimelineId;
-
-/// Create client config for fetching WAL from safekeeper on particular timeline.
-/// listen_pg_addr_str is in form host:\[port\].
-pub fn wal_stream_connection_config(
-    TenantTimelineId {
-        tenant_id,
-        timeline_id,
-    }: TenantTimelineId,
-    listen_pg_addr_str: &str,
-    auth_token: Option<&str>,
-    availability_zone: Option<&str>,
-) -> anyhow::Result<PgConnectionConfig> {
-    let (host, port) =
-        parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
-    let port = port.unwrap_or(5432);
-    let mut connstr = PgConnectionConfig::new_host_port(host, port)
-        .extend_options([
-            "-c".to_owned(),
-            format!("timeline_id={}", timeline_id),
-            format!("tenant_id={}", tenant_id),
-        ])
-        .set_password(auth_token.map(|s| s.to_owned()));
-
-    if let Some(availability_zone) = availability_zone {
-        connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
-    }
-
-    Ok(connstr)
-}
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -0,0 +1 @@
+pub mod heavier_once_cell;
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -0,0 +1,306 @@
+use std::sync::{Arc, Mutex, MutexGuard};
+use tokio::sync::Semaphore;
+
+/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
+/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
+/// for the duration of initialization.
+///
+/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
+///
+/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
+pub struct OnceCell<T> {
+    inner: Mutex<Inner<T>>,
+}
+
+impl<T> Default for OnceCell<T> {
+    /// Create new uninitialized [`OnceCell`].
+    fn default() -> Self {
+        Self {
+            inner: Default::default(),
+        }
+    }
+}
+
+/// Semaphore is the current state:
+/// - open semaphore means the value is `None`, not yet initialized
+/// - closed semaphore means the value has been initialized
+#[derive(Debug)]
+struct Inner<T> {
+    init_semaphore: Arc<Semaphore>,
+    value: Option<T>,
+}
+
+impl<T> Default for Inner<T> {
+    fn default() -> Self {
+        Self {
+            init_semaphore: Arc::new(Semaphore::new(1)),
+            value: None,
+        }
+    }
+}
+
+impl<T> OnceCell<T> {
+    /// Creates an already initialized `OnceCell` with the given value.
+    pub fn new(value: T) -> Self {
+        let sem = Semaphore::new(1);
+        sem.close();
+        Self {
+            inner: Mutex::new(Inner {
+                init_semaphore: Arc::new(sem),
+                value: Some(value),
+            }),
+        }
+    }
+
+    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
+    /// returning the guard.
+    ///
+    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
+    ///
+    /// Initialization is panic-safe and cancellation-safe.
+    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
+    where
+        F: FnOnce() -> Fut,
+        Fut: std::future::Future<Output = Result<T, E>>,
+    {
+        let sem = {
+            let guard = self.inner.lock().unwrap();
+            if guard.value.is_some() {
+                return Ok(Guard(guard));
+            }
+            guard.init_semaphore.clone()
+        };
+
+        let permit = sem.acquire_owned().await;
+        if permit.is_err() {
+            let guard = self.inner.lock().unwrap();
+            assert!(
+                guard.value.is_some(),
+                "semaphore got closed, must be initialized"
+            );
+            return Ok(Guard(guard));
+        } else {
+            // now we try
+            let value = factory().await?;
+
+            let mut guard = self.inner.lock().unwrap();
+            assert!(
+                guard.value.is_none(),
+                "we won permit, must not be initialized"
+            );
+            guard.value = Some(value);
+            guard.init_semaphore.close();
+            Ok(Guard(guard))
+        }
+    }
+
+    /// Returns a guard to an existing initialized value, if any.
+    pub fn get(&self) -> Option<Guard<'_, T>> {
+        let guard = self.inner.lock().unwrap();
+        if guard.value.is_some() {
+            Some(Guard(guard))
+        } else {
+            None
+        }
+    }
+}
+
+/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
+/// initialized value.
+#[derive(Debug)]
+pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
+
+impl<T> std::ops::Deref for Guard<'_, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.0
+            .value
+            .as_ref()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<T> std::ops::DerefMut for Guard<'_, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.0
+            .value
+            .as_mut()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<'a, T> Guard<'a, T> {
+    /// Take the current value, and a new permit for its deinitialization.
+    ///
+    /// The permit will be on a semaphore part of the new internal value, and any following
+    /// [`OnceCell::get_or_init`] will wait on it to complete.
+    pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
+        let mut swapped = Inner::default();
+        let permit = swapped
+            .init_semaphore
+            .clone()
+            .try_acquire_owned()
+            .expect("we just created this");
+        std::mem::swap(&mut *self.0, &mut swapped);
+        swapped
+            .value
+            .map(|v| (v, permit))
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::{
+        convert::Infallible,
+        sync::atomic::{AtomicUsize, Ordering},
+        time::Duration,
+    };
+
+    #[tokio::test]
+    async fn many_initializers() {
+        #[derive(Default, Debug)]
+        struct Counters {
+            factory_got_to_run: AtomicUsize,
+            future_polled: AtomicUsize,
+            winners: AtomicUsize,
+        }
+
+        let initializers = 100;
+
+        let cell = Arc::new(OnceCell::default());
+        let counters = Arc::new(Counters::default());
+        let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
+
+        let mut js = tokio::task::JoinSet::new();
+        for i in 0..initializers {
+            js.spawn({
+                let cell = cell.clone();
+                let counters = counters.clone();
+                let barrier = barrier.clone();
+
+                async move {
+                    barrier.wait().await;
+                    let won = {
+                        let g = cell
+                            .get_or_init(|| {
+                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
+                                async {
+                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
+                                    Ok::<_, Infallible>(i)
+                                }
+                            })
+                            .await
+                            .unwrap();
+
+                        *g == i
+                    };
+
+                    if won {
+                        counters.winners.fetch_add(1, Ordering::Relaxed);
+                    }
+                }
+            });
+        }
+
+        barrier.wait().await;
+
+        while let Some(next) = js.join_next().await {
+            next.expect("no panics expected");
+        }
+
+        let mut counters = Arc::try_unwrap(counters).unwrap();
+
+        assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
+        assert_eq!(*counters.future_polled.get_mut(), 1);
+        assert_eq!(*counters.winners.get_mut(), 1);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn reinit_waits_for_deinit() {
+        // with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization
+        let sleep_for = Duration::from_secs(1);
+        let initial = 42;
+        let reinit = 1;
+        let cell = Arc::new(OnceCell::new(initial));
+
+        let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
+
+        let jh = tokio::spawn({
+            let cell = cell.clone();
+            let deinitialization_started = deinitialization_started.clone();
+            async move {
+                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
+                assert_eq!(answer, initial);
+
+                deinitialization_started.wait().await;
+                tokio::time::sleep(sleep_for).await;
+            }
+        });
+
+        deinitialization_started.wait().await;
+
+        let started_at = tokio::time::Instant::now();
+        cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
+            .await
+            .unwrap();
+
+        let elapsed = started_at.elapsed();
+        assert!(
+            elapsed >= sleep_for,
+            "initialization should had taken at least the time time slept with permit"
+        );
+
+        jh.await.unwrap();
+
+        assert_eq!(*cell.get().unwrap(), reinit);
+    }
+
+    #[tokio::test]
+    async fn initialization_attemptable_until_ok() {
+        let cell = OnceCell::default();
+
+        for _ in 0..10 {
+            cell.get_or_init(|| async { Err("whatever error") })
+                .await
+                .unwrap_err();
+        }
+
+        let g = cell
+            .get_or_init(|| async { Ok::<_, Infallible>("finally success") })
+            .await
+            .unwrap();
+        assert_eq!(*g, "finally success");
+    }
+
+    #[tokio::test]
+    async fn initialization_is_cancellation_safe() {
+        let cell = OnceCell::default();
+
+        let barrier = tokio::sync::Barrier::new(2);
+
+        let initializer = cell.get_or_init(|| async {
+            barrier.wait().await;
+            futures::future::pending::<()>().await;
+
+            Ok::<_, Infallible>("never reached")
+        });
+
+        tokio::select! {
+            _ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
+            _ = barrier.wait() => {}
+        };
+
+        // now initializer is dropped
+
+        assert!(cell.get().is_none());
+
+        let g = cell
+            .get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
+            .await
+            .unwrap();
+        assert_eq!(*g, "now initialized");
+    }
+}
--- a/libs/vm_monitor/README.md
+++ b/libs/vm_monitor/README.md
@@ -16,19 +16,3 @@ in the `neon-postgres` cgroup and set its `memory.{max,high}`.
 * See also: [`neondatabase/vm-monitor`](https://github.com/neondatabase/vm-monitor/),
 where initial development of the monitor happened. The repository is no longer
 maintained but the commit history may be useful for debugging.
-
-## Structure
-
-The `vm-monitor` is loosely comprised of a few systems. These are:
-* the server: this is just a simple `axum` server that accepts requests and
-upgrades them to websocket connections. The server only allows one connection at
-a time. This means that upon receiving a new connection, the server will terminate
-and old one if it exists.
-* the filecache: a struct that allows communication with the Postgres file cache.
-On startup, we connect to the filecache and hold on to the connection for the
-entire monitor lifetime.
-* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
-listening for `memory.high` events and setting its `memory.{high,max}` values.
-* the runner: the runner marries the filecache and cgroup watcher together,
-communicating with the agent throught the `Dispatcher`, and then calling filecache
-and cgroup watcher functions as needed to upscale and downscale
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -315,8 +315,12 @@ impl CgroupWatcher {
    where
        E: Stream<Item = Sequenced<u64>>,
    {
+        // There are several actions we might take on receiving a `memory.high` event,
+        // such as freezing the cgroup, or increasing its `memory.high`. We don't
+        // want to do these things too often (because postgres needs to run, and
+        // we only have so much memory). These timers serve as rate limits for this.
        let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
-        let mut last_memory_high_increase_at: Option<Instant> = None;
+        let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO));
        let mut events = pin!(events);

        // Are we waiting to be upscaled? Could be true if we request upscale due
@@ -328,8 +332,6 @@ impl CgroupWatcher {
                upscale = upscales.recv() => {
                    let Sequenced { seqnum, data } = upscale
                        .context("failed to listen on upscale notification channel")?;
-                    waiting_on_upscale = false;
-                    last_memory_high_increase_at = None;
                    self.last_upscale_seqnum.store(seqnum, Ordering::Release);
                    info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
                }
@@ -394,17 +396,12 @@ impl CgroupWatcher {
                            .send(())
                            .await
                            .context("failed to request upscale")?;
-                        waiting_on_upscale = true;
                        continue;
                    }

                    // Shoot, we can't freeze or and we're still waiting on upscale,
                    // increase memory.high to reduce throttling
-                    let can_increase_memory_high = match last_memory_high_increase_at {
-                        None => true,
-                        Some(t) => t.elapsed() > self.config.memory_high_increase_every,
-                    };
-                    if can_increase_memory_high {
+                    if wait_to_increase_memory_high.is_elapsed() {
                        info!(
                            "received memory.high event, \
                            but too soon to refreeze and already requested upscale \
@@ -440,11 +437,12 @@ impl CgroupWatcher {
                        );
                        self.set_high_bytes(new_high)
                            .context("failed to set memory.high")?;
-                        last_memory_high_increase_at = Some(Instant::now());
-                        continue;
+                        wait_to_increase_memory_high
+                            .as_mut()
+                            .reset(Instant::now() + self.config.memory_high_increase_every)
                    }

-                    info!("received memory.high event, but can't do anything");
+                    // we can't do anything
                }
            };
        }
--- a/libs/vm_monitor/src/dispatcher.rs
+++ b/libs/vm_monitor/src/dispatcher.rs
@@ -1,7 +1,7 @@
 //! Managing the websocket connection and other signals in the monitor.
 //!
 //! Contains types that manage the interaction (not data interchange, see `protocol`)
-//! between agent and monitor, allowing us to to process and send messages in a
+//! between informant and monitor, allowing us to process and send messages in a
 //! straightforward way. The dispatcher also manages that signals that come from
 //! the cgroup (requesting upscale), and the signals that go to the cgroup
 //! (notifying it of upscale).
@@ -24,16 +24,16 @@ use crate::protocol::{
 /// The central handler for all communications in the monitor.
 ///
 /// The dispatcher has two purposes:
-/// 1. Manage the connection to the agent, sending and receiving messages.
+/// 1. Manage the connection to the informant, sending and receiving messages.
 /// 2. Communicate with the cgroup manager, notifying it when upscale is received,
-///    and sending a message to the agent when the cgroup manager requests
+///    and sending a message to the informant when the cgroup manager requests
 ///    upscale.
 #[derive(Debug)]
 pub struct Dispatcher {
-    /// We read agent messages of of `source`
+    /// We read informant messages off of `source`
    pub(crate) source: SplitStream<WebSocket>,

-    /// We send messages to the agent through `sink`
+    /// We send messages to the informant through `sink`
    sink: SplitSink<WebSocket, Message>,

    /// Used to notify the cgroup when we are upscaled.
@@ -43,7 +43,7 @@ pub struct Dispatcher {
    /// we send an `UpscaleRequst` to the agent.
    pub(crate) request_upscale_events: mpsc::Receiver<()>,

-    /// The protocol version we have agreed to use with the agent. This is negotiated
+    /// The protocol version we have agreed to use with the informant. This is negotiated
    /// during the creation of the dispatcher, and should be the highest shared protocol
    /// version.
    ///
@@ -56,9 +56,9 @@ pub struct Dispatcher {
 impl Dispatcher {
    /// Creates a new dispatcher using the passed-in connection.
    ///
-    /// Performs a negotiation with the agent to determine the highest protocol
+    /// Performs a negotiation with the informant to determine the highest protocol
    /// version that both support. This consists of two steps:
-    /// 1. Wait for the agent to sent the range of protocols it supports.
+    /// 1. Wait for the informant to send the range of protocols it supports.
    /// 2. Send a protocol version that works for us as well, or an error if there
    ///    is no compatible version.
    pub async fn new(
@@ -69,7 +69,7 @@ impl Dispatcher {
        let (mut sink, mut source) = stream.split();

        // Figure out the highest protocol version we both support
-        info!("waiting for agent to send protocol version range");
+        info!("waiting for informant to send protocol version range");
        let Some(message) = source.next().await else {
            bail!("websocket connection closed while performing protocol handshake")
        };
@@ -79,7 +79,7 @@ impl Dispatcher {
        let Message::Text(message_text) = message else {
            // All messages should be in text form, since we don't do any
            // pinging/ponging. See nhooyr/websocket's implementation and the
-            // agent for more info
+            // informant/agent for more info
            bail!("received non-text message during proocol handshake: {message:?}")
        };

@@ -88,30 +88,32 @@ impl Dispatcher {
            max: PROTOCOL_MAX_VERSION,
        };

-        let agent_range: ProtocolRange = serde_json::from_str(&message_text)
+        let informant_range: ProtocolRange = serde_json::from_str(&message_text)
            .context("failed to deserialize protocol version range")?;

-        info!(range = ?agent_range, "received protocol version range");
+        info!(range = ?informant_range, "received protocol version range");

-        let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) {
+        let highest_shared_version = match monitor_range.highest_shared_version(&informant_range) {
            Ok(version) => {
                sink.send(Message::Text(
                    serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(),
                ))
                .await
-                .context("failed to notify agent of negotiated protocol version")?;
+                .context("failed to notify informant of negotiated protocol version")?;
                version
            }
            Err(e) => {
                sink.send(Message::Text(
                    serde_json::to_string(&ProtocolResponse::Error(format!(
                        "Received protocol version range {} which does not overlap with {}",
-                        agent_range, monitor_range
+                        informant_range, monitor_range
                    )))
                    .unwrap(),
                ))
                .await
-                .context("failed to notify agent of no overlap between protocol version ranges")?;
+                .context(
+                    "failed to notify informant of no overlap between protocol version ranges",
+                )?;
                Err(e).context("error determining suitable protocol version range")?
            }
        };
@@ -135,7 +137,7 @@ impl Dispatcher {
            .context("failed to send resources and oneshot sender across channel")
    }

-    /// Send a message to the agent.
+    /// Send a message to the informant.
    ///
    /// Although this function is small, it has one major benefit: it is the only
    /// way to send data accross the connection, and you can only pass in a proper
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -59,8 +59,8 @@ pub struct FileCacheConfig {
    spread_factor: f64,
 }

-impl FileCacheConfig {
-    pub fn default_in_memory() -> Self {
+impl Default for FileCacheConfig {
+    fn default() -> Self {
        Self {
            in_memory: true,
            // 75 %
@@ -71,19 +71,9 @@ impl FileCacheConfig {
            spread_factor: 0.1,
        }
    }
+}

-    pub fn default_on_disk() -> Self {
-        Self {
-            in_memory: false,
-            resource_multiplier: 0.75,
-            // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
-            // memory, the kernel will just evict from its page cache, rather than e.g. killing
-            // everything.
-            min_remaining_after_cache: NonZeroU64::new(256 * MiB).unwrap(),
-            spread_factor: 0.1,
-        }
-    }
-
+impl FileCacheConfig {
    /// Make sure fields of the config are consistent.
    pub fn validate(&self) -> anyhow::Result<()> {
        // Single field validity
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -39,16 +39,6 @@ pub struct Args {
    #[arg(short, long)]
    pub pgconnstr: Option<String>,

-    /// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
-    /// kernel's page cache), and therefore should not count against available memory.
-    //
-    // NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
-    // than a roundabout way, via whether it's on disk), but in order to be backwards compatible
-    // during the switch away from an in-memory file cache, we had to default to the previous
-    // behavior.
-    #[arg(long)]
-    pub file_cache_on_disk: bool,
-
    /// The address we should listen on for connection requests. For the
    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
    #[arg(short, long)]
@@ -156,7 +146,7 @@ pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Res

 /// Handles incoming websocket connections.
 ///
-/// If we are already to connected to an agent, we kill that old connection
+/// If we are already connected to an informant, we kill that old connection
 /// and accept the new one.
 #[tracing::instrument(name = "/monitor", skip_all, fields(?args))]
 pub async fn ws_handler(
@@ -178,17 +168,14 @@ pub async fn ws_handler(

 /// Starts the monitor. If startup fails or the monitor exits, an error will
 /// be logged and our internal state will be reset to allow for new connections.
-#[tracing::instrument(skip_all)]
+#[tracing::instrument(skip_all, fields(?args))]
 async fn start_monitor(
    ws: WebSocket,
    args: &Args,
    kill: broadcast::Receiver<()>,
    token: CancellationToken,
 ) {
-    info!(
-        ?args,
-        "accepted new websocket connection -> starting monitor"
-    );
+    info!("accepted new websocket connection -> starting monitor");
    let timeout = Duration::from_secs(4);
    let monitor = tokio::time::timeout(
        timeout,
@@ -209,7 +196,7 @@ async fn start_monitor(
            return;
        }
    };
-    info!("connected to agent");
+    info!("connected to informant");

    match monitor.run().await {
        Ok(()) => info!("monitor was killed due to new connection"),
--- a/libs/vm_monitor/src/protocol.rs
+++ b/libs/vm_monitor/src/protocol.rs
@@ -1,13 +1,13 @@
-//! Types representing protocols and actual agent-monitor messages.
+//! Types representing protocols and actual informant-monitor messages.
 //!
 //! The pervasive use of serde modifiers throughout this module is to ease
 //! serialization on the go side. Because go does not have enums (which model
 //! messages well), it is harder to model messages, and we accomodate that with
 //! serde.
 //!
-//! *Note*: the agent sends and receives messages in different ways.
+//! *Note*: the informant sends and receives messages in different ways.
 //!
-//! The agent serializes messages in the form and then sends them. The use
+//! The informant serializes messages in the form and then sends them. The use
 //! of `#[serde(tag = "type", content = "content")]` allows us to use `Type`
 //! to determine how to deserialize `Content`.
 //! ```ignore
@@ -25,9 +25,9 @@
 //!     Id   uint64
 //! }
 //! ```
-//! After reading the type field, the agent will decode the entire message
+//! After reading the type field, the informant will decode the entire message
 //! again, this time into the correct type using the embedded fields.
-//! Because the agent cannot just extract the json contained in a certain field
+//! Because the informant cannot just extract the json contained in a certain field
 //! (it initially deserializes to `map[string]interface{}`), we keep the fields
 //! at the top level, so the entire piece of json can be deserialized into a struct,
 //! such as a `DownscaleResult`, with the `Type` and `Id` fields ignored.
@@ -37,7 +37,7 @@ use std::cmp;

 use serde::{de::Error, Deserialize, Serialize};

-/// A Message we send to the agent.
+/// A Message we send to the informant.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct OutboundMsg {
    #[serde(flatten)]
@@ -51,31 +51,31 @@ impl OutboundMsg {
    }
 }

-/// The different underlying message types we can send to the agent.
+/// The different underlying message types we can send to the informant.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 #[serde(tag = "type")]
 pub enum OutboundMsgKind {
-    /// Indicates that the agent sent an invalid message, i.e, we couldn't
+    /// Indicates that the informant sent an invalid message, i.e, we couldn't
    /// properly deserialize it.
    InvalidMessage { error: String },
    /// Indicates that we experienced an internal error while processing a message.
    /// For example, if a cgroup operation fails while trying to handle an upscale,
    /// we return `InternalError`.
    InternalError { error: String },
-    /// Returned to the agent once we have finished handling an upscale. If the
+    /// Returned to the informant once we have finished handling an upscale. If the
    /// handling was unsuccessful, an `InternalError` will get returned instead.
    /// *Note*: this is a struct variant because of the way go serializes struct{}
    UpscaleConfirmation {},
    /// Indicates to the monitor that we are urgently requesting resources.
    /// *Note*: this is a struct variant because of the way go serializes struct{}
    UpscaleRequest {},
-    /// Returned to the agent once we have finished attempting to downscale. If
+    /// Returned to the informant once we have finished attempting to downscale. If
    /// an error occured trying to do so, an `InternalError` will get returned instead.
    /// However, if we are simply unsuccessful (for example, do to needing the resources),
    /// that gets included in the `DownscaleResult`.
    DownscaleResult {
        // FIXME for the future (once the informant is deprecated)
-        // As of the time of writing, the agent/informant version of this struct is
+        // As of the time of writing, the informant/agent version of this struct is
        // called api.DownscaleResult. This struct has uppercase fields which are
        // serialized as such. Thus, we serialize using uppercase names so we don't
        // have to make a breaking change to the agent<->informant protocol. Once
@@ -88,12 +88,12 @@ pub enum OutboundMsgKind {
        status: String,
    },
    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
-    /// agent.
+    /// informant.
    /// *Note*: this is a struct variant because of the way go serializes struct{}
    HealthCheck {},
 }

-/// A message received form the agent.
+/// A message received from the informant.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct InboundMsg {
    #[serde(flatten)]
@@ -101,7 +101,7 @@ pub struct InboundMsg {
    pub(crate) id: usize,
 }

-/// The different underlying message types we can receive from the agent.
+/// The different underlying message types we can receive from the informant.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 #[serde(tag = "type", content = "content")]
 pub enum InboundMsgKind {
@@ -120,14 +120,14 @@ pub enum InboundMsgKind {
    /// when done.
    DownscaleRequest { target: Resources },
    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
-    /// agent.
+    /// informant.
    /// *Note*: this is a struct variant because of the way go serializes struct{}
    HealthCheck {},
 }

 /// Represents the resources granted to a VM.
 #[derive(Serialize, Deserialize, Debug, Clone, Copy)]
-// Renamed because the agent has multiple resources types:
+// Renamed because the agent/informant has multiple resources types:
 // `Resources` (milliCPU/memory slots)
 // `Allocation` (vCPU/bytes) <- what we correspond to
 #[serde(rename(serialize = "Allocation", deserialize = "Allocation"))]
@@ -151,7 +151,7 @@ pub const PROTOCOL_MAX_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
 pub struct ProtocolVersion(u8);

 impl ProtocolVersion {
-    /// Represents v1.0 of the agent<-> monitor protocol - the initial version
+    /// Represents v1.0 of the informant<-> monitor protocol - the initial version
    ///
    /// Currently the latest version.
    const V1_0: ProtocolVersion = ProtocolVersion(1);
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -1,11 +1,10 @@
-//! Exposes the `Runner`, which handles messages received from agent and
+//! Exposes the `Runner`, which handles messages received from informant and
 //! sends upscale requests.
 //!
 //! This is the "Monitor" part of the monitor binary and is the main entrypoint for
 //! all functionality.

 use std::sync::Arc;
-use std::time::{Duration, Instant};
 use std::{fmt::Debug, mem};

 use anyhow::{bail, Context};
@@ -22,8 +21,8 @@ use crate::filecache::{FileCacheConfig, FileCacheState};
 use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
 use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB};

-/// Central struct that interacts with agent, dispatcher, and cgroup to handle
-/// signals from the agent.
+/// Central struct that interacts with informant, dispatcher, and cgroup to handle
+/// signals from the informant.
 #[derive(Debug)]
 pub struct Runner {
    config: Config,
@@ -37,8 +36,6 @@ pub struct Runner {
    /// by us vs the autoscaler-agent.
    counter: usize,

-    last_upscale_request_at: Option<Instant>,
-
    /// A signal to kill the main thread produced by `self.run()`. This is triggered
    /// when the server receives a new connection. When the thread receives the
    /// signal off this channel, it will gracefully shutdown.
@@ -102,7 +99,6 @@ impl Runner {
            cgroup: None,
            dispatcher,
            counter: 1, // NB: must be odd, see the comment about the field for more.
-            last_upscale_request_at: None,
            kill,
        };

@@ -114,10 +110,10 @@ impl Runner {
        // memory limits.
        if let Some(connstr) = &args.pgconnstr {
            info!("initializing file cache");
-            let config = match args.file_cache_on_disk {
-                true => FileCacheConfig::default_on_disk(),
-                false => FileCacheConfig::default_in_memory(),
-            };
+            let config: FileCacheConfig = Default::default();
+            if !config.in_memory {
+                panic!("file cache not in-memory implemented")
+            }

            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await
@@ -144,10 +140,7 @@ impl Runner {
            if actual_size != new_size {
                info!("file cache size actually got set to {actual_size}")
            }
-            // Mark the resources given to the file cache as reserved, but only if it's in memory.
-            if !args.file_cache_on_disk {
-                file_cache_reserved_bytes = actual_size;
-            }
+            file_cache_reserved_bytes = actual_size;

            state.filecache = Some(file_cache);
        }
@@ -234,17 +227,18 @@ impl Runner {
        let mut status = vec![];
        let mut file_cache_mem_usage = 0;
        if let Some(file_cache) = &mut self.filecache {
+            if !file_cache.config.in_memory {
+                panic!("file cache not in-memory unimplemented")
+            }
+
            let actual_usage = file_cache
                .set_file_cache_size(expected_file_cache_mem_usage)
                .await
                .context("failed to set file cache size")?;
-            if file_cache.config.in_memory {
-                file_cache_mem_usage = actual_usage;
-            }
+            file_cache_mem_usage = actual_usage;
            let message = format!(
-                "set file cache size to {} MiB (in memory = {})",
-                bytes_to_mebibytes(actual_usage),
-                file_cache.config.in_memory,
+                "set file cache size to {} MiB",
+                bytes_to_mebibytes(actual_usage)
            );
            info!("downscale: {message}");
            status.push(message);
@@ -295,6 +289,10 @@ impl Runner {
        // Get the file cache's expected contribution to the memory usage
        let mut file_cache_mem_usage = 0;
        if let Some(file_cache) = &mut self.filecache {
+            if !file_cache.config.in_memory {
+                panic!("file cache not in-memory unimplemented");
+            }
+
            let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
            info!(
                target = bytes_to_mebibytes(expected_usage),
@@ -306,9 +304,6 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
-            if file_cache.config.in_memory {
-                file_cache_mem_usage = actual_usage;
-            }

            if actual_usage != expected_usage {
                warn!(
@@ -317,6 +312,7 @@ impl Runner {
                    bytes_to_mebibytes(actual_usage)
                )
            }
+            file_cache_mem_usage = actual_usage;
        }

        if let Some(cgroup) = &self.cgroup {
@@ -375,7 +371,7 @@ impl Runner {
                Ok(None)
            }
            InboundMsgKind::InternalError { error } => {
-                warn!(error, id, "agent experienced an internal error");
+                warn!(error, id, "informant experienced an internal error");
                Ok(None)
            }
            InboundMsgKind::HealthCheck {} => {
@@ -401,20 +397,6 @@ impl Runner {
                    if request.is_none() {
                        bail!("failed to listen for upscale event from cgroup")
                    }
-
-                    // If it's been less than 1 second since the last time we requested upscaling,
-                    // ignore the event, to avoid spamming the agent (otherwise, this can happen
-                    // ~1k times per second).
-                    if let Some(t) = self.last_upscale_request_at {
-                        let elapsed = t.elapsed();
-                        if elapsed < Duration::from_secs(1) {
-                            info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring");
-                            continue;
-                        }
-                    }
-
-                    self.last_upscale_request_at = Some(Instant::now());
-
                    info!("cgroup asking for upscale; forwarding request");
                    self.counter += 2; // Increment, preserving parity (i.e. keep the
                                       // counter odd). See the field comment for more.
@@ -423,7 +405,7 @@ impl Runner {
                        .await
                        .context("failed to send message")?;
                }
-                // there is a message from the agent
+                // there is a message from the informant
                msg = self.dispatcher.source.next() => {
                    if let Some(msg) = msg {
                        // Don't use 'message' as a key as the string also uses
@@ -440,7 +422,7 @@ impl Runner {
                                            // Don't use 'message' as a key as the
                                            // string also uses that for its key
                                            msg = ?other,
-                                            "agent should only send text messages but received different type"
+                                            "informant should only send text messages but received different type"
                                        );
                                        continue
                                    },
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -3,7 +3,6 @@
 //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.

 use anyhow::Result;
-use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::ops::Range;
@@ -97,8 +96,8 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
 async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
-    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0).await?;
+    let file = FileBlockReader::new(VirtualFile::open(path)?);
+    let summary_blk = file.read_blk(0)?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
@@ -143,12 +142,12 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let mut total_delta_layers = 0usize;
    let mut total_image_layers = 0usize;
    let mut total_excess_layers = 0usize;
-    for tenant in fs::read_dir(storage_path.join(TENANTS_SEGMENT_NAME))? {
+    for tenant in fs::read_dir(storage_path.join("tenants"))? {
        let tenant = tenant?;
        if !tenant.file_type()?.is_dir() {
            continue;
        }
-        for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? {
+        for timeline in fs::read_dir(tenant.path().join("timelines"))? {
            let timeline = timeline?;
            if !timeline.file_type()?.is_dir() {
                continue;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -5,7 +5,6 @@ use clap::Subcommand;
 use pageserver::tenant::block_io::BlockCursor;
 use pageserver::tenant::disk_btree::DiskBtreeReader;
 use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
-use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use pageserver::{page_cache, virtual_file};
 use pageserver::{
    repository::{Key, KEY_SIZE},
@@ -48,8 +47,8 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    let path = path.as_ref();
    virtual_file::init(10);
    page_cache::init(100);
-    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0).await?;
+    let file = FileBlockReader::new(VirtualFile::open(path)?);
+    let summary_blk = file.read_blk(0)?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
@@ -69,7 +68,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
            },
        )
        .await?;
-    let cursor = BlockCursor::new_fileblockreader(&file);
+    let cursor = BlockCursor::new_fileblockreader_virtual(&file);
    for (k, v) in all {
        let value = cursor.read_blob(v.pos()).await?;
        println!("key:{} value_len:{}", k, value.len());
@@ -81,13 +80,13 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
 pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
    match cmd {
        LayerCmd::List { path } => {
-            for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? {
+            for tenant in fs::read_dir(path.join("tenants"))? {
                let tenant = tenant?;
                if !tenant.file_type()?.is_dir() {
                    continue;
                }
                println!("tenant {}", tenant.file_name().to_string_lossy());
-                for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? {
+                for timeline in fs::read_dir(tenant.path().join("timelines"))? {
                    let timeline = timeline?;
                    if !timeline.file_type()?.is_dir() {
                        continue;
@@ -102,9 +101,9 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            timeline,
        } => {
            let timeline_path = path
-                .join(TENANTS_SEGMENT_NAME)
+                .join("tenants")
                .join(tenant)
-                .join(TIMELINES_SEGMENT_NAME)
+                .join("timelines")
                .join(timeline);
            let mut idx = 0;
            for layer in fs::read_dir(timeline_path)? {
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -25,7 +25,6 @@ use crate::context::RequestContext;
 use crate::tenant::Timeline;
 use pageserver_api::reltag::{RelTag, SlruKind};

-use postgres_ffi::dispatch_pgversion;
 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
 use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
 use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
@@ -324,25 +323,14 @@ where
                .timeline
                .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
                .await?;
-
-            ensure!(
-                img.len()
-                    == dispatch_pgversion!(
-                        self.timeline.pg_version,
-                        pgv::bindings::SIZEOF_RELMAPFILE
-                    )
-            );
-
+            ensure!(img.len() == 512);
            Some(img)
        } else {
            None
        };

        if spcnode == GLOBALTABLESPACE_OID {
-            let pg_version_str = match self.timeline.pg_version {
-                14 | 15 => self.timeline.pg_version.to_string(),
-                ver => format!("{ver}\x0A"),
-            };
+            let pg_version_str = self.timeline.pg_version.to_string();
            let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
            self.ar.append(&header, pg_version_str.as_bytes()).await?;

@@ -386,10 +374,7 @@ where
            if let Some(img) = relmap_img {
                let dst_path = format!("base/{}/PG_VERSION", dbnode);

-                let pg_version_str = match self.timeline.pg_version {
-                    14 | 15 => self.timeline.pg_version.to_string(),
-                    ver => format!("{ver}\x0A"),
-                };
+                let pg_version_str = self.timeline.pg_version.to_string();
                let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
                self.ar.append(&header, pg_version_str.as_bytes()).await?;

--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -388,7 +388,6 @@ fn start_pageserver(
            remote_storage: remote_storage.clone(),
        },
        order,
-        shutdown_pageserver.clone(),
    ))?;

    BACKGROUND_RUNTIME.spawn({
@@ -477,19 +476,16 @@ fn start_pageserver(
    {
        let _rt_guard = MGMT_REQUEST_RUNTIME.enter();

-        let router_state = Arc::new(
-            http::routes::State::new(
-                conf,
-                http_auth.clone(),
-                remote_storage,
-                broker_client.clone(),
-                disk_usage_eviction_state,
-            )
-            .context("Failed to initialize router state")?,
-        );
-        let router = http::make_router(router_state, launch_ts, http_auth.clone())?
-            .build()
-            .map_err(|err| anyhow!(err))?;
+        let router = http::make_router(
+            conf,
+            launch_ts,
+            http_auth,
+            broker_client.clone(),
+            remote_storage,
+            disk_usage_eviction_state,
+        )?
+        .build()
+        .map_err(|err| anyhow!(err))?;
        let service = utils::http::RouterService::new(router).unwrap();
        let server = hyper::Server::from_tcp(http_listener)?
            .serve(service)
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -32,8 +32,7 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{
-    TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
-    TIMELINES_SEGMENT_NAME,
+    TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
 use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
@@ -73,7 +72,7 @@ pub mod defaults {
    /// Default built-in configuration file.
    ///
    pub const DEFAULT_CONFIG_FILE: &str = formatcp!(
-        r#"
+        r###"
 # Initial configuration file created by 'pageserver --init'
 #listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
 #listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
@@ -118,7 +117,7 @@ pub mod defaults {

 [remote_storage]

-"#
+"###
    );
 }

@@ -205,8 +204,6 @@ pub struct PageServerConf {
    /// has it's initial logical size calculated. Not running background tasks for some seconds is
    /// not terrible.
    pub background_task_maximum_delay: Duration,
-
-    pub control_plane_api: Option<Url>,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -281,8 +278,6 @@ struct PageServerConfigBuilder {
    ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,

    background_task_maximum_delay: BuilderValue<Duration>,
-
-    control_plane_api: BuilderValue<Option<Url>>,
 }

 impl Default for PageServerConfigBuilder {
@@ -345,8 +340,6 @@ impl Default for PageServerConfigBuilder {
                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
            )
            .unwrap()),
-
-            control_plane_api: Set(None),
        }
    }
 }
@@ -475,10 +468,6 @@ impl PageServerConfigBuilder {
        self.background_task_maximum_delay = BuilderValue::Set(delay);
    }

-    pub fn control_plane_api(&mut self, api: Url) {
-        self.control_plane_api = BuilderValue::Set(Some(api))
-    }
-
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
@@ -564,9 +553,6 @@ impl PageServerConfigBuilder {
            background_task_maximum_delay: self
                .background_task_maximum_delay
                .ok_or(anyhow!("missing background_task_maximum_delay"))?,
-            control_plane_api: self
-                .control_plane_api
-                .ok_or(anyhow!("missing control_plane_api"))?,
        })
    }
 }
@@ -577,7 +563,7 @@ impl PageServerConf {
    //

    pub fn tenants_path(&self) -> PathBuf {
-        self.workdir.join(TENANTS_SEGMENT_NAME)
+        self.workdir.join("tenants")
    }

    pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
@@ -657,6 +643,23 @@ impl PageServerConf {
            .join(METADATA_FILE_NAME)
    }

+    /// Maps a local file path to its remote storage path by stripping the workdir prefix.
+    /// The relative part is expected to contain both tenant and timeline ids, making it unique.
+    ///
+    /// Errors if `local_path` is not under the workdir or `RemotePath::new` rejects the remainder.
+    pub fn remote_path(&self, local_path: &Path) -> anyhow::Result<RemotePath> {
+        local_path
+            .strip_prefix(&self.workdir)
+            .context("Failed to strip workdir prefix")
+            .and_then(RemotePath::new)
+            .with_context(|| {
+                format!(
+                    "Failed to resolve remote part of path {:?} for base {:?}",
+                    local_path, self.workdir
+                )
+            })
+    }
+
    /// Turns storage remote path of a file into its local path.
    pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf {
        remote_path.with_base(&self.workdir)
@@ -668,18 +671,26 @@ impl PageServerConf {
    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

-        #[allow(clippy::manual_range_patterns)]
        match pg_version {
-            14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
+            14 => Ok(path.join(format!("v{pg_version}"))),
+            15 => Ok(path.join(format!("v{pg_version}"))),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }

    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
+        match pg_version {
+            14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
+            15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
+            _ => bail!("Unsupported postgres version: {}", pg_version),
+        }
    }
    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
+        match pg_version {
+            14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
+            15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
+            _ => bail!("Unsupported postgres version: {}", pg_version),
+        }
    }

    /// Parse a configuration file (pageserver.toml) into a PageServerConf struct,
@@ -747,7 +758,6 @@ impl PageServerConf {
                },
                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
                "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
-                "control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -916,7 +926,6 @@ impl PageServerConf {
            test_remote_failures: 0,
            ondemand_download_behavior_treat_error_as_warn: false,
            background_task_maximum_delay: Duration::ZERO,
-            control_plane_api: None,
        }
    }
 }
@@ -1140,7 +1149,6 @@ background_task_maximum_delay = '334 s'
                background_task_maximum_delay: humantime::parse_duration(
                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
                )?,
-                control_plane_api: None
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1196,7 +1204,6 @@ background_task_maximum_delay = '334 s'
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
                background_task_maximum_delay: Duration::from_secs(334),
-                control_plane_api: None
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -1,119 +0,0 @@
-use std::collections::HashMap;
-
-use hyper::StatusCode;
-use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
-use tokio_util::sync::CancellationToken;
-use url::Url;
-use utils::{
-    backoff,
-    generation::Generation,
-    id::{NodeId, TenantId},
-};
-
-use crate::config::PageServerConf;
-
-// Backoffs when control plane requests do not succeed: compromise between reducing load
-// on control plane, and retrying frequently when we are blocked on a control plane
-// response to make progress.
-const BACKOFF_INCREMENT: f64 = 0.1;
-const BACKOFF_MAX: f64 = 10.0;
-
-/// The Pageserver's client for using the control plane API: this is a small subset
-/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
-pub(crate) struct ControlPlaneClient {
-    http_client: reqwest::Client,
-    base_url: Url,
-    node_id: NodeId,
-    cancel: CancellationToken,
-}
-
-impl ControlPlaneClient {
-    /// A None return value indicates that the input `conf` object does not have control
-    /// plane API enabled.
-    pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
-        let mut url = match conf.control_plane_api.as_ref() {
-            Some(u) => u.clone(),
-            None => return None,
-        };
-
-        if let Ok(mut segs) = url.path_segments_mut() {
-            // This ensures that `url` ends with a slash if it doesn't already.
-            // That way, we can subsequently use join() to safely attach extra path elements.
-            segs.pop_if_empty().push("");
-        }
-
-        let client = reqwest::ClientBuilder::new()
-            .build()
-            .expect("Failed to construct http client");
-
-        Some(Self {
-            http_client: client,
-            base_url: url,
-            node_id: conf.id,
-            cancel: cancel.clone(),
-        })
-    }
-
-    async fn try_re_attach(
-        &self,
-        url: Url,
-        request: &ReAttachRequest,
-    ) -> anyhow::Result<ReAttachResponse> {
-        match self.http_client.post(url).json(request).send().await {
-            Err(e) => Err(anyhow::Error::from(e)),
-            Ok(r) => {
-                if r.status() == StatusCode::OK {
-                    r.json::<ReAttachResponse>()
-                        .await
-                        .map_err(anyhow::Error::from)
-                } else {
-                    Err(anyhow::anyhow!("Unexpected status {}", r.status()))
-                }
-            }
-        }
-    }
-
-    /// Block until we get a successful response
-    pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> {
-        let re_attach_path = self
-            .base_url
-            .join("re-attach")
-            .expect("Failed to build re-attach path");
-        let request = ReAttachRequest {
-            node_id: self.node_id,
-        };
-
-        let mut attempt = 0;
-        loop {
-            let result = self.try_re_attach(re_attach_path.clone(), &request).await;
-            match result {
-                Ok(res) => {
-                    tracing::info!(
-                        "Received re-attach response with {} tenants",
-                        res.tenants.len()
-                    );
-
-                    return Ok(res
-                        .tenants
-                        .into_iter()
-                        .map(|t| (t.id, Generation::new(t.generation)))
-                        .collect::<HashMap<_, _>>());
-                }
-                Err(e) => {
-                    tracing::error!("Error re-attaching tenants, retrying: {e:#}");
-                    backoff::exponential_backoff(
-                        attempt,
-                        BACKOFF_INCREMENT,
-                        BACKOFF_MAX,
-                        &self.cancel,
-                    )
-                    .await;
-                    if self.cancel.is_cancelled() {
-                        return Err(anyhow::anyhow!("Shutting down"));
-                    }
-                    attempt += 1;
-                }
-            }
-        }
-    }
-}
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -60,7 +60,11 @@ use utils::serde_percent::Percent;
 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
+    tenant::{
+        self,
+        storage_layer::{AsLayerDesc, EvictionError, Layer},
+        Timeline,
+    },
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -108,7 +112,7 @@ pub fn launch_disk_usage_global_eviction_task(
                _ = background_jobs_barrier.wait() => { }
            };

-            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
+            disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
                .await;
            Ok(())
        },
@@ -121,7 +125,7 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: GenericRemoteStorage,
+    _storage: &GenericRemoteStorage,
    tenants_dir: &Path,
    cancel: CancellationToken,
 ) {
@@ -145,14 +149,8 @@ async fn disk_usage_eviction_task(
        let start = Instant::now();

        async {
-            let res = disk_usage_eviction_task_iteration(
-                state,
-                task_config,
-                &storage,
-                tenants_dir,
-                &cancel,
-            )
-            .await;
+            let res =
+                disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;

            match res {
                Ok(()) => {}
@@ -183,13 +181,12 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: &GenericRemoteStorage,
    tenants_dir: &Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -273,7 +270,6 @@ struct LayerCount {

 pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
-    storage: &GenericRemoteStorage,
    usage_pre: U,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -330,9 +326,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-    let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
+    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
    let mut warned = None;
    let mut usage_planned = usage_pre;
+    let mut max_batch_size = 0;
    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
        if !usage_planned.has_pressure() {
            debug!(
@@ -349,10 +346,15 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);

-        batched
-            .entry(TimelineKey(candidate.timeline))
-            .or_default()
-            .push(candidate.layer);
+        let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
+
+        // A semaphore is later used to limit eviction concurrency, and a single acquire can
+        // request at most u32::MAX permits. It is unlikely that we would ever have u32::MAX
+        // layers to evict, but fail gracefully by not growing a batch beyond that.
+        if batch.len() < u32::MAX as usize {
+            batch.push(candidate.layer);
+            max_batch_size = max_batch_size.max(batch.len());
+        }
    }

    let usage_planned = match warned {
@@ -369,64 +371,101 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

    // phase2: evict victims batched by timeline

-    // After the loop, `usage_assumed` is the post-eviction usage,
-    // according to internal accounting.
-    let mut usage_assumed = usage_pre;
-    let mut evictions_failed = LayerCount::default();
+    let mut js = tokio::task::JoinSet::new();
+
+    // ratelimit to 1k files or any higher max batch size
+    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
+
    for (timeline, batch) in batched {
        let tenant_id = timeline.tenant_id;
        let timeline_id = timeline.timeline_id;
-        let batch_size = batch.len();
+        let batch_size =
+            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
+
+        // NOTE(review): `available_permits` counts permits that are currently free, not the total,
+        // so this can fire once earlier batches hold permits; confirm before relying on it
+        assert!(batch_size as usize <= limit.available_permits());

        debug!(%timeline_id, "evicting batch for timeline");

-        async {
-            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
+        let evict = {
+            let limit = limit.clone();
+            let cancel = cancel.clone();
+            async move {
+                let mut evicted_bytes = 0;
+                let mut evictions_failed = LayerCount::default();

-            match results {
-                Err(e) => {
-                    warn!("failed to evict batch: {:#}", e);
-                }
-                Ok(results) => {
-                    assert_eq!(results.len(), batch.len());
-                    for (result, layer) in results.into_iter().zip(batch.iter()) {
-                        let file_size = layer.layer_desc().file_size;
-                        match result {
-                            Some(Ok(())) => {
-                                usage_assumed.add_available_bytes(file_size);
-                            }
-                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
-                            }
-                            Some(Err(EvictionError::FileNotFound)) => {
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            Some(Err(
-                                e @ EvictionError::LayerNotFound(_)
-                                | e @ EvictionError::StatFailed(_),
-                            )) => {
-                                let e = utils::error::report_compact_sources(&e);
-                                warn!(%layer, "failed to evict layer: {e}");
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            None => {
-                                assert!(cancel.is_cancelled());
-                                return;
+                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
+                    // semaphore closing means cancelled
+                    return (evicted_bytes, evictions_failed);
+                };
+
+                let results = timeline.evict_layers(&batch, &cancel).await;
+
+                match results {
+                    Ok(results) => {
+                        assert_eq!(results.len(), batch.len());
+                        for (result, layer) in results.into_iter().zip(batch.iter()) {
+                            let file_size = layer.layer_desc().file_size;
+                            match result {
+                                Some(Ok(())) => {
+                                    evicted_bytes += file_size;
+                                }
+                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                                    evictions_failed.file_sizes += file_size;
+                                    evictions_failed.count += 1;
+                                }
+                                None => {
+                                    assert!(cancel.is_cancelled());
+                                }
                            }
                        }
                    }
+                    Err(e) => {
+                        warn!("failed to evict batch: {:#}", e);
+                    }
                }
+                (evicted_bytes, evictions_failed)
            }
        }
-        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
-        .await;
+        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));

-        if cancel.is_cancelled() {
+        js.spawn(evict);
+
+        // spawning multiple thousands of these is essentially blocking, so give the already
+        // spawned tasks a chance to make progress
+        tokio::task::yield_now().await;
+    }
+
+    let join_all = async move {
+        // After the evictions, `usage_assumed` is the post-eviction usage,
+        // according to internal accounting.
+        let mut usage_assumed = usage_pre;
+        let mut evictions_failed = LayerCount::default();
+
+        while let Some(res) = js.join_next().await {
+            match res {
+                Ok((evicted_bytes, failed)) => {
+                    usage_assumed.add_available_bytes(evicted_bytes);
+                    evictions_failed.file_sizes += failed.file_sizes;
+                    evictions_failed.count += failed.count;
+                }
+                Err(je) if je.is_cancelled() => unreachable!("not used"),
+                Err(je) if je.is_panic() => { /* already logged */ }
+                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+            }
+        }
+        (usage_assumed, evictions_failed)
+    };
+
+    let (usage_assumed, evictions_failed) = tokio::select! {
+        tuple = join_all => { tuple },
+        _ = cancel.cancelled() => {
+            // close the semaphore to stop any pending acquires
+            limit.close();
            return Ok(IterationOutcome::Cancelled);
        }
-    }
+    };

    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
        before: usage_pre,
@@ -441,7 +480,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 #[derive(Clone)]
 struct EvictionCandidate {
    timeline: Arc<Timeline>,
-    layer: Arc<dyn PersistentLayer>,
+    layer: Layer,
    last_activity_ts: SystemTime,
 }

--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -383,6 +383,7 @@ paths:
        schema:
          type: string
          format: hex
+
    post:
      description: |
        Schedules attach operation to happen in the background for the given tenant.
@@ -1019,9 +1020,6 @@ components:
      properties:
        config:
          $ref: '#/components/schemas/TenantConfig'
-        generation:
-          type: integer
-          description: Attachment generation number.
    TenantConfigRequest:
      allOf:
        - $ref: '#/components/schemas/TenantConfig'
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -8,10 +8,9 @@ use anyhow::{anyhow, Context, Result};
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
-use pageserver_api::models::{
-    DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest,
-};
+use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest};
 use remote_storage::GenericRemoteStorage;
+use storage_broker::BrokerClientChannel;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -33,13 +32,11 @@ use crate::tenant::mgr::{
 };
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
-use crate::tenant::timeline::Timeline;
-use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
    auth::JwtAuth,
-    generation::Generation,
    http::{
        endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
        error::{ApiError, HttpErrorBody},
@@ -54,7 +51,7 @@ use utils::{
 // Imports only used for testing APIs
 use super::models::ConfigureFailpointsRequest;

-pub struct State {
+struct State {
    conf: &'static PageServerConf,
    auth: Option<Arc<JwtAuth>>,
    allowlist_routes: Vec<Uri>,
@@ -64,7 +61,7 @@ pub struct State {
 }

 impl State {
-    pub fn new(
+    fn new(
        conf: &'static PageServerConf,
        auth: Option<Arc<JwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
@@ -285,8 +282,6 @@ async fn build_timeline_info_common(
    let state = timeline.current_state();
    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));

-    let walreceiver_status = timeline.walreceiver_status();
-
    let info = TimelineInfo {
        tenant_id: timeline.tenant_id,
        timeline_id: timeline.timeline_id,
@@ -307,8 +302,6 @@ async fn build_timeline_info_common(
        pg_version: timeline.pg_version,

        state,
-
-        walreceiver_status,
    };
    Ok(info)
 }
@@ -479,7 +472,7 @@ async fn tenant_attach_handler(
    check_permission(&request, Some(tenant_id))?;

    let maybe_body: Option<TenantAttachRequest> = json_request_or_empty_body(&mut request).await?;
-    let tenant_conf = match &maybe_body {
+    let tenant_conf = match maybe_body {
        Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?,
        None => TenantConfOpt::default(),
    };
@@ -490,13 +483,10 @@ async fn tenant_attach_handler(

    let state = get_state(&request);

-    let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
-
    if let Some(remote_storage) = &state.remote_storage {
        mgr::attach_tenant(
            state.conf,
            tenant_id,
-            generation,
            tenant_conf,
            state.broker_client.clone(),
            remote_storage.clone(),
@@ -548,7 +538,7 @@ async fn tenant_detach_handler(
 }

 async fn tenant_load_handler(
-    mut request: Request<Body>,
+    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
@@ -556,18 +546,10 @@ async fn tenant_load_handler(

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

-    let maybe_body: Option<TenantLoadRequest> = json_request_or_empty_body(&mut request).await?;
-
    let state = get_state(&request);
-
-    // The /load request is only usable when control_plane_api is not set.  Once it is set, callers
-    // should always use /attach instead.
-    let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
-
    mgr::load_tenant(
        state.conf,
        tenant_id,
-        generation,
        state.broker_client.clone(),
        state.remote_storage.clone(),
        &ctx,
@@ -869,21 +851,6 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
    Ok(response)
 }

-/// Helper for requests that may take a generation, which is mandatory
-/// when control_plane_api is set, but otherwise defaults to Generation::none()
-fn get_request_generation(state: &State, req_gen: Option<u32>) -> Result<Generation, ApiError> {
-    if state.conf.control_plane_api.is_some() {
-        req_gen
-            .map(Generation::new)
-            .ok_or(ApiError::BadRequest(anyhow!(
-                "generation attribute missing"
-            )))
-    } else {
-        // Legacy mode: all tenants operate with no generation
-        Ok(Generation::none())
-    }
-}
-
 async fn tenant_create_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -900,17 +867,14 @@ async fn tenant_create_handler(
    let tenant_conf =
        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    let state = get_state(&request);
-
-    let generation = get_request_generation(state, request_data.generation)?;
-
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

+    let state = get_state(&request);
+
    let new_tenant = mgr::create_tenant(
        state.conf,
        tenant_conf,
        target_tenant_id,
-        generation,
        state.broker_client.clone(),
        state.remote_storage.clone(),
        &ctx,
@@ -1064,7 +1028,7 @@ async fn timeline_compact_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
@@ -1089,7 +1053,7 @@ async fn timeline_checkpoint_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        json_response(StatusCode::OK, ())
    }
@@ -1196,11 +1160,11 @@ async fn disk_usage_eviction_run(

    let state = get_state(&r);

-    let Some(storage) = state.remote_storage.clone() else {
+    if state.remote_storage.as_ref().is_none() {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
        )));
-    };
+    }

    let state = state.disk_usage_eviction_state.clone();

@@ -1218,7 +1182,6 @@ async fn disk_usage_eviction_run(
        async move {
            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
                &state,
-                &storage,
                usage,
                &child_cancel,
            )
@@ -1357,9 +1320,12 @@ where
 }

 pub fn make_router(
-    state: Arc<State>,
+    conf: &'static PageServerConf,
    launch_ts: &'static LaunchTimestamp,
    auth: Option<Arc<JwtAuth>>,
+    broker_client: BrokerClientChannel,
+    remote_storage: Option<GenericRemoteStorage>,
+    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
    let spec = include_bytes!("openapi_spec.yml");
    let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1383,7 +1349,16 @@ pub fn make_router(
    );

    Ok(router
-        .data(state)
+        .data(Arc::new(
+            State::new(
+                conf,
+                auth,
+                remote_storage,
+                broker_client,
+                disk_usage_eviction_state,
+            )
+            .context("Failed to initialize router state")?,
+        ))
        .get("/v1/status", |r| api_handler(r, status_handler))
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -3,7 +3,6 @@ pub mod basebackup;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
-mod control_plane_client;
 pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -537,7 +537,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
    30.000,   // 30000 ms
 ];

-/// VirtualFile fs operation variants.
+/// Tracks time taken by fs operations near VirtualFile.
 ///
 /// Operations:
 /// - open ([`std::fs::OpenOptions::open`])
@@ -548,66 +548,15 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
 /// - seek (modify internal position or file length query)
 /// - fsync ([`std::fs::File::sync_all`])
 /// - metadata ([`std::fs::File::metadata`])
-#[derive(
-    Debug, Clone, Copy, strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr,
-)]
-pub(crate) enum StorageIoOperation {
-    Open,
-    Close,
-    CloseByReplace,
-    Read,
-    Write,
-    Seek,
-    Fsync,
-    Metadata,
-}
-
-impl StorageIoOperation {
-    pub fn as_str(&self) -> &'static str {
-        match self {
-            StorageIoOperation::Open => "open",
-            StorageIoOperation::Close => "close",
-            StorageIoOperation::CloseByReplace => "close-by-replace",
-            StorageIoOperation::Read => "read",
-            StorageIoOperation::Write => "write",
-            StorageIoOperation::Seek => "seek",
-            StorageIoOperation::Fsync => "fsync",
-            StorageIoOperation::Metadata => "metadata",
-        }
-    }
-}
-
-/// Tracks time taken by fs operations near VirtualFile.
-#[derive(Debug)]
-pub(crate) struct StorageIoTime {
-    metrics: [Histogram; StorageIoOperation::COUNT],
-}
-
-impl StorageIoTime {
-    fn new() -> Self {
-        let storage_io_histogram_vec = register_histogram_vec!(
-            "pageserver_io_operations_seconds",
-            "Time spent in IO operations",
-            &["operation"],
-            STORAGE_IO_TIME_BUCKETS.into()
-        )
-        .expect("failed to define a metric");
-        let metrics = std::array::from_fn(|i| {
-            let op = StorageIoOperation::from_repr(i).unwrap();
-            let metric = storage_io_histogram_vec
-                .get_metric_with_label_values(&[op.as_str()])
-                .unwrap();
-            metric
-        });
-        Self { metrics }
-    }
-
-    pub(crate) fn get(&self, op: StorageIoOperation) -> &Histogram {
-        &self.metrics[op as usize]
-    }
-}
-
-pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);
+pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_io_operations_seconds",
+        "Time spent in IO operations",
+        &["operation"],
+        STORAGE_IO_TIME_BUCKETS.into()
+    )
+    .expect("failed to define a metric")
+});

 const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];

@@ -1216,12 +1165,6 @@ impl TimelineMetrics {
            ),
        }
    }
-
-    pub fn record_new_file_metrics(&self, sz: u64) {
-        self.resident_physical_size_gauge.add(sz);
-        self.num_persistent_files_created.inc_by(1);
-        self.persistent_bytes_written.inc_by(sz);
-    }
 }

 impl Drop for TimelineMetrics {
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -75,7 +75,10 @@
 use std::{
    collections::{hash_map::Entry, HashMap},
    convert::TryInto,
-    sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+    sync::{
+        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+        RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError,
+    },
 };

 use anyhow::Context;
@@ -159,7 +162,7 @@ struct Version {
 }

 struct Slot {
-    inner: tokio::sync::RwLock<SlotInner>,
+    inner: RwLock<SlotInner>,
    usage_count: AtomicU8,
 }

@@ -200,11 +203,6 @@ impl Slot {
            Err(usage_count) => usage_count,
        }
    }
-
-    /// Sets the usage count to a specific value.
-    fn set_usage_count(&self, count: u8) {
-        self.usage_count.store(count, Ordering::Relaxed);
-    }
 }

 pub struct PageCache {
@@ -217,9 +215,9 @@ pub struct PageCache {
    ///
    /// If you add support for caching different kinds of objects, each object kind
    /// can have a separate mapping map, next to this field.
-    materialized_page_map: std::sync::RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
+    materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,

-    immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,
+    immutable_page_map: RwLock<HashMap<(FileId, u32), usize>>,

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -235,7 +233,7 @@ pub struct PageCache {
 /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
 /// until the guard is dropped.
 ///
-pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);
+pub struct PageReadGuard<'i>(RwLockReadGuard<'i, SlotInner>);

 impl std::ops::Deref for PageReadGuard<'_> {
    type Target = [u8; PAGE_SZ];
@@ -262,10 +260,9 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
 /// to initialize.
 ///
 pub struct PageWriteGuard<'i> {
-    inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
+    inner: RwLockWriteGuard<'i, SlotInner>,

    // Are the page contents currently valid?
-    // Used to mark pages as invalid that are assigned but not yet filled with data.
    valid: bool,
 }

@@ -340,7 +337,7 @@ impl PageCache {
    /// The 'lsn' is an upper bound, this will return the latest version of
    /// the given block, but not newer than 'lsn'. Returns the actual LSN of the
    /// returned page.
-    pub async fn lookup_materialized_page(
+    pub fn lookup_materialized_page(
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
@@ -360,7 +357,7 @@ impl PageCache {
            lsn,
        };

-        if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
+        if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
            if let CacheKey::MaterializedPage {
                hash_key: _,
                lsn: available_lsn,
@@ -387,7 +384,7 @@ impl PageCache {
    ///
    /// Store an image of the given page in the cache.
    ///
-    pub async fn memorize_materialized_page(
+    pub fn memorize_materialized_page(
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
@@ -404,7 +401,7 @@ impl PageCache {
            lsn,
        };

-        match self.lock_for_write(&cache_key).await? {
+        match self.lock_for_write(&cache_key)? {
            WriteBufResult::Found(write_guard) => {
                // We already had it in cache. Another thread must've put it there
                // concurrently. Check that it had the same contents that we
@@ -422,14 +419,31 @@ impl PageCache {

    // Section 1.2: Public interface functions for working with immutable file pages.

-    pub async fn read_immutable_buf(
-        &self,
-        file_id: FileId,
-        blkno: u32,
-    ) -> anyhow::Result<ReadBufResult> {
+    pub fn read_immutable_buf(&self, file_id: FileId, blkno: u32) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };

-        self.lock_for_read(&mut cache_key).await
+        self.lock_for_read(&mut cache_key)
+    }
+
+    /// Immediately evicts all cached pages of the given file, write-locking each slot in turn.
+    pub fn drop_buffers_for_immutable(&self, drop_file_id: FileId) {
+        for slot_idx in 0..self.slots.len() {
+            let slot = &self.slots[slot_idx];
+
+            let mut inner = slot.inner.write().unwrap();
+            if let Some(key) = &inner.key {
+                match key {
+                    CacheKey::ImmutableFilePage { file_id, blkno: _ }
+                        if *file_id == drop_file_id =>
+                    {
+                        // drop the mapping entry for this page, then clear the slot's key
+                        self.remove_mapping(key);
+                        inner.key = None;
+                    }
+                    _ => {}
+                }
+            }
+        }
     }

    //
@@ -449,14 +463,14 @@ impl PageCache {
    ///
    /// If no page is found, returns None and *cache_key is left unmodified.
    ///
-    async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
+    fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
        let cache_key_orig = cache_key.clone();
        if let Some(slot_idx) = self.search_mapping(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
            // that it's still what we expected (because we released the mapping
            // lock already, another thread could have evicted the page)
            let slot = &self.slots[slot_idx];
-            let inner = slot.inner.read().await;
+            let inner = slot.inner.read().unwrap();
            if inner.key.as_ref() == Some(cache_key) {
                slot.inc_usage_count();
                return Some(PageReadGuard(inner));
@@ -497,7 +511,7 @@ impl PageCache {
    /// }
    /// ```
    ///
-    async fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
+    fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
        let (read_access, hit) = match cache_key {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
@@ -512,7 +526,7 @@ impl PageCache {
        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
-            if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
+            if let Some(read_guard) = self.try_lock_for_read(cache_key) {
                if is_first_iteration {
                    hit.inc();
                }
@@ -542,7 +556,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            slot.set_usage_count(1);
+            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(ReadBufResult::NotFound(PageWriteGuard {
                inner,
@@ -555,13 +569,13 @@ impl PageCache {
    /// found, returns None.
    ///
    /// When locking a page for writing, the search criteria is always "exact".
-    async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
+    fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
        if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
            // that it's still what we expected (because we don't released the mapping
            // lock already, another thread could have evicted the page)
            let slot = &self.slots[slot_idx];
-            let inner = slot.inner.write().await;
+            let inner = slot.inner.write().unwrap();
            if inner.key.as_ref() == Some(cache_key) {
                slot.inc_usage_count();
                return Some(PageWriteGuard { inner, valid: true });
@@ -574,10 +588,10 @@ impl PageCache {
    ///
    /// Similar to lock_for_read(), but the returned buffer is write-locked and
    /// may be modified by the caller even if it's already found in the cache.
-    async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
+    fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
        loop {
            // First check if the key already exists in the cache.
-            if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
+            if let Some(write_guard) = self.try_lock_for_write(cache_key) {
                return Ok(WriteBufResult::Found(write_guard));
            }

@@ -603,7 +617,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            slot.set_usage_count(1);
+            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(WriteBufResult::NotFound(PageWriteGuard {
                inner,
@@ -758,7 +772,7 @@ impl PageCache {
    /// Find a slot to evict.
    ///
    /// On return, the slot is empty and write-locked.
-    fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
+    fn find_victim(&self) -> anyhow::Result<(usize, RwLockWriteGuard<SlotInner>)> {
        let iter_limit = self.slots.len() * 10;
        let mut iters = 0;
        loop {
@@ -770,7 +784,10 @@ impl PageCache {
            if slot.dec_usage_count() == 0 {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
-                    Err(_err) => {
+                    Err(TryLockError::Poisoned(err)) => {
+                        anyhow::bail!("buffer lock was poisoned: {err:?}")
+                    }
+                    Err(TryLockError::WouldBlock) => {
                        // If we have looped through the whole buffer pool 10 times
                        // and still haven't found a victim buffer, something's wrong.
                        // Maybe all the buffers were in locked. That could happen in
@@ -799,9 +816,6 @@ impl PageCache {
    fn new(num_pages: usize) -> Self {
        assert!(num_pages > 0, "page cache size must be > 0");

-        // We could use Vec::leak here, but that potentially also leaks
-        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
-        // this is avoided.
        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
@@ -815,7 +829,7 @@ impl PageCache {
                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();

                Slot {
-                    inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
+                    inner: RwLock::new(SlotInner { key: None, buf }),
                    usage_count: AtomicU8::new(0),
                }
            })
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -469,9 +469,7 @@ impl PageServerHandler {
        // Create empty timeline
        info!("creating new timeline");
        let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
-        let timeline = tenant
-            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
-            .await?;
+        let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?;

        // TODO mark timeline as not ready until it reaches end_lsn.
        // We might have some wal to import as well, and we should prevent compute
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -32,7 +32,9 @@ use std::fmt::Debug;
 use std::fmt::Display;
 use std::fs;
 use std::fs::File;
+use std::fs::OpenOptions;
 use std::io;
+use std::io::Write;
 use std::ops::Bound::Included;
 use std::path::Path;
 use std::path::PathBuf;
@@ -66,7 +68,7 @@ use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
-pub use crate::tenant::remote_timeline_client::index::IndexPart;
+use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
@@ -83,7 +85,6 @@ pub use pageserver_api::models::TenantState;
 use toml_edit;
 use utils::{
    crashsafe,
-    generation::Generation,
    id::{TenantId, TimelineId},
    lsn::{Lsn, RecordLsn},
 };
@@ -113,6 +114,7 @@ pub mod block_io;
 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
+pub mod manifest;
 mod span;

 pub mod metadata;
@@ -131,9 +133,7 @@ pub(crate) mod timeline;
 pub mod size;

 pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-pub use timeline::{
-    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
-};
+pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};

 // re-export for use in remote_timeline_client.rs
 pub use crate::tenant::metadata::save_metadata;
@@ -141,9 +141,6 @@ pub use crate::tenant::metadata::save_metadata;
 // re-export for use in walreceiver
 pub use crate::tenant::timeline::WalReceiverInfo;

-/// The "tenants" part of `tenants/<tenant>/timelines...`
-pub const TENANTS_SEGMENT_NAME: &str = "tenants";
-
 /// Parts of the `.neon/tenants/<tenant_id>/timelines/<timeline_id>` directory prefix.
 pub const TIMELINES_SEGMENT_NAME: &str = "timelines";

@@ -179,11 +176,6 @@ pub struct Tenant {
    tenant_conf: Arc<RwLock<TenantConfOpt>>,

    tenant_id: TenantId,
-
-    /// The remote storage generation, used to protect S3 objects from split-brain.
-    /// Does not change over the lifetime of the [`Tenant`] object.
-    generation: Generation,
-
    timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
    // This mutex prevents creation of new timelines during GC.
    // Adding yet another mutex (in addition to `timelines`) is needed because holding
@@ -195,7 +187,7 @@ pub struct Tenant {
    walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,

    // provides access to timeline data sitting in the remote storage
-    pub(crate) remote_storage: Option<GenericRemoteStorage>,
+    remote_storage: Option<GenericRemoteStorage>,

    /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
@@ -407,6 +399,7 @@ impl Tenant {
        remote_startup_data: Option<RemoteStartupData>,
        local_metadata: Option<TimelineMetadata>,
        ancestor: Option<Arc<Timeline>>,
+        first_save: bool,
        init_order: Option<&InitializationOrder>,
        _ctx: &RequestContext,
    ) -> anyhow::Result<()> {
@@ -440,9 +433,14 @@ impl Tenant {

        // Save the metadata file to local disk.
        if !picked_local {
-            save_metadata(self.conf, &tenant_id, &timeline_id, up_to_date_metadata)
-                .await
-                .context("save_metadata")?;
+            save_metadata(
+                self.conf,
+                &tenant_id,
+                &timeline_id,
+                up_to_date_metadata,
+                first_save,
+            )
+            .context("save_metadata")?;
        }

        let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
@@ -522,7 +520,6 @@ impl Tenant {
    pub(crate) fn spawn_attach(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
-        generation: Generation,
        broker_client: storage_broker::BrokerClientChannel,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        remote_storage: GenericRemoteStorage,
@@ -539,7 +536,6 @@ impl Tenant {
            tenant_conf,
            wal_redo_manager,
            tenant_id,
-            generation,
            Some(remote_storage.clone()),
        ));

@@ -650,8 +646,12 @@ impl Tenant {
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?;

-        let remote_timeline_ids =
-            remote_timeline_client::list_remote_timelines(remote_storage, self.tenant_id).await?;
+        let remote_timeline_ids = remote_timeline_client::list_remote_timelines(
+            remote_storage,
+            self.conf,
+            self.tenant_id,
+        )
+        .await?;

        info!("found {} timelines", remote_timeline_ids.len());

@@ -663,7 +663,6 @@ impl Tenant {
                self.conf,
                self.tenant_id,
                timeline_id,
-                self.generation,
            );
            part_downloads.spawn(
                async move {
@@ -827,6 +826,7 @@ impl Tenant {
            }),
            local_metadata,
            ancestor,
+            true,
            None,
            ctx,
        )
@@ -849,7 +849,6 @@ impl Tenant {
            TenantConfOpt::default(),
            wal_redo_manager,
            tenant_id,
-            Generation::broken(),
            None,
        ))
    }
@@ -867,7 +866,6 @@ impl Tenant {
    pub(crate) fn spawn_load(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
-        generation: Generation,
        resources: TenantSharedResources,
        init_order: Option<InitializationOrder>,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
@@ -893,7 +891,6 @@ impl Tenant {
            tenant_conf,
            wal_redo_manager,
            tenant_id,
-            generation,
            remote_storage.clone(),
        );
        let tenant = Arc::new(tenant);
@@ -1379,6 +1376,7 @@ impl Tenant {
            remote_startup_data,
            Some(local_metadata),
            ancestor,
+            false,
            init_order,
            ctx,
        )
@@ -1442,7 +1440,7 @@ impl Tenant {
    /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
    /// minimum amount of keys required to get a writable timeline.
    /// (Without it, `put` might fail due to `repartition` failing.)
-    pub async fn create_empty_timeline(
+    pub fn create_empty_timeline(
        &self,
        new_timeline_id: TimelineId,
        initdb_lsn: Lsn,
@@ -1454,10 +1452,10 @@ impl Tenant {
            "Cannot create empty timelines on inactive tenant"
        );

-        let timeline_uninit_mark = {
-            let timelines = self.timelines.lock().unwrap();
-            self.create_timeline_uninit_mark(new_timeline_id, &timelines)?
-        };
+        let timelines = self.timelines.lock().unwrap();
+        let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?;
+        drop(timelines);
+
        let new_metadata = TimelineMetadata::new(
            // Initialize disk_consistent LSN to 0, The caller must import some data to
            // make it valid, before calling finish_creation()
@@ -1476,7 +1474,6 @@ impl Tenant {
            initdb_lsn,
            None,
        )
-        .await
    }

    /// Helper for unit tests to create an empty timeline.
@@ -1492,9 +1489,7 @@ impl Tenant {
        pg_version: u32,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
-        let uninit_tl = self
-            .create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
-            .await?;
+        let uninit_tl = self.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)?;
        let tline = uninit_tl.raw_timeline().expect("we just created it");
        assert_eq!(tline.get_last_record_lsn(), Lsn(0));

@@ -1512,15 +1507,6 @@ impl Tenant {
        tline.maybe_spawn_flush_loop();
        tline.freeze_and_flush().await.context("freeze_and_flush")?;

-        // Make sure the freeze_and_flush reaches remote storage.
-        tline
-            .remote_client
-            .as_ref()
-            .unwrap()
-            .wait_completion()
-            .await
-            .unwrap();
-
        let tl = uninit_tl.finish_creation()?;
        // The non-test code would call tl.activate() here.
        tl.set_state(TimelineState::Active);
@@ -1697,6 +1683,65 @@ impl Tenant {
        Ok(())
    }

+    /// Freezes and flushes every timeline concurrently, then waits for its remote uploads, if any.
+    ///
+    /// Used at graceful shutdown. Failures are logged with `warn!` and never propagated.
+    async fn freeze_and_flush_on_shutdown(&self) {
+        let mut js = tokio::task::JoinSet::new();
+
+        // one future per timeline; they are spawned on the JoinSet below and joined at the end.
+        let per_timeline = |timeline_id: TimelineId, timeline: Arc<Timeline>| {
+            async move {
+                debug_assert_current_span_has_tenant_and_timeline_id();
+
+                match timeline.freeze_and_flush().await {
+                    Ok(()) => {}
+                    Err(e) => {
+                        warn!("failed to freeze and flush: {e:#}");
+                        return;
+                    }
+                }
+
+                let res = if let Some(client) = timeline.remote_client.as_ref() {
+                    // If we did not wait for completion here, the shutdown process might not
+                    // wait for remote uploads to complete at all, because new upload tasks can
+                    // keep being spawned.
+                    //
+                    // The open problem is shutting down RemoteTimelineClient itself: it makes
+                    // no sense to stop it while we are waiting on it, but then what about
+                    // corner cases such as S3 suddenly hanging?
+                    client.wait_completion().await
+                } else {
+                    Ok(())
+                };
+
+                if let Err(e) = res {
+                    warn!("failed to await for frozen and flushed uploads: {e:#}");
+                }
+            }
+            .instrument(tracing::info_span!("freeze_and_flush_on_shutdown", %timeline_id))
+        };
+
+        {
+            let timelines = self.timelines.lock().unwrap();
+            timelines
+                .iter()
+                .map(|(id, tl)| (*id, Arc::clone(tl)))
+                .for_each(|(timeline_id, timeline)| {
+                    js.spawn(per_timeline(timeline_id, timeline));
+                })
+        };
+
+        while let Some(res) = js.join_next().await {
+            match res {
+                Ok(()) => {}
+                Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
+                Err(je) if je.is_panic() => { /* logged already */ }
+                Err(je) => warn!("unexpected JoinError: {je:?}"),
+            }
+        }
+    }
+
    pub fn current_state(&self) -> TenantState {
        self.state.borrow().clone()
    }
@@ -1827,22 +1872,19 @@ impl Tenant {
            }
        };

-        let mut js = tokio::task::JoinSet::new();
-        {
-            let timelines = self.timelines.lock().unwrap();
-            timelines.values().for_each(|timeline| {
-                let timeline = Arc::clone(timeline);
-                let span = Span::current();
-                js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
-            })
-        };
-        while let Some(res) = js.join_next().await {
-            match res {
-                Ok(()) => {}
-                Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
-                Err(je) if je.is_panic() => { /* logged already */ }
-                Err(je) => warn!("unexpected JoinError: {je:?}"),
-            }
+        if freeze_and_flush {
+            // walreceivers have already begun to shut down with TenantState::Stopping, but we
+            // still need to wait for them to stop.
+            task_mgr::shutdown_tasks(
+                Some(TaskKind::WalReceiverManager),
+                Some(self.tenant_id),
+                None,
+            )
+            .await;
+
+            // this will wait for uploads to complete; in the past, it was done outside tenant
+            // shutdown in pageserver::shutdown_pageserver.
+            self.freeze_and_flush_on_shutdown().await;
        }

        // shutdown all tenant and timeline tasks: gc, compaction, page service
@@ -2230,7 +2272,6 @@ impl Tenant {
            ancestor,
            new_timeline_id,
            self.tenant_id,
-            self.generation,
            Arc::clone(&self.walredo_mgr),
            resources,
            pg_version,
@@ -2248,7 +2289,6 @@ impl Tenant {
        tenant_conf: TenantConfOpt,
        walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
        tenant_id: TenantId,
-        generation: Generation,
        remote_storage: Option<GenericRemoteStorage>,
    ) -> Tenant {
        let (state, mut rx) = watch::channel(state);
@@ -2307,7 +2347,6 @@ impl Tenant {

        Tenant {
            tenant_id,
-            generation,
            conf,
            // using now here is good enough approximation to catch tenants with really long
            // activation times.
@@ -2369,37 +2408,72 @@ impl Tenant {
        Ok(tenant_conf)
    }

-    #[tracing::instrument(skip_all, fields(%tenant_id))]
-    pub(super) async fn persist_tenant_config(
+    pub(super) fn persist_tenant_config(
        tenant_id: &TenantId,
        target_config_path: &Path,
        tenant_conf: TenantConfOpt,
+        creating_tenant: bool,
    ) -> anyhow::Result<()> {
-        // imitate a try-block with a closure
-        info!("persisting tenantconf to {}", target_config_path.display());
+        let _enter = info_span!("saving tenantconf").entered();

-        let mut conf_content = r#"# This file contains a specific per-tenant's config.
+        // imitate a try-block with a closure
+        let do_persist = |target_config_path: &Path| -> anyhow::Result<()> {
+            let target_config_parent = target_config_path.parent().with_context(|| {
+                format!(
+                    "Config path does not have a parent: {}",
+                    target_config_path.display()
+                )
+            })?;
+
+            info!("persisting tenantconf to {}", target_config_path.display());
+
+            let mut conf_content = r#"# This file contains a specific per-tenant's config.
 #  It is read in case of pageserver restart.

 [tenant_config]
 "#
-        .to_string();
+            .to_string();

-        // Convert the config to a toml file.
-        conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
+            // Convert the config to a toml file.
+            conf_content += &toml_edit::ser::to_string(&tenant_conf)?;

-        let conf_content = conf_content.as_bytes();
+            let mut target_config_file = VirtualFile::open_with_options(
+                target_config_path,
+                OpenOptions::new()
+                    .truncate(true) // needed when overwriting with a smaller config file
+                    .write(true)
+                    .create_new(creating_tenant)
+                    // when creating a new tenant, `creating_tenant` is true, so `.create_new(true)`
+                    // wins and `.create(true)` / `.truncate(true)` are ignored (per rust std docs).
+                    //
+                    // later when updating the config of created tenant, or persisting config for the
+                    // first time for attached tenant, the `.create(true)` is used.
+                    .create(true),
+            )?;

-        let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX);
-        VirtualFile::crashsafe_overwrite(target_config_path, &temp_path, conf_content)
-            .await
-            .with_context(|| {
-                format!(
-                    "write tenant {tenant_id} config to {}",
-                    target_config_path.display()
-                )
-            })?;
-        Ok(())
+            // `write_all`, not `write`: a single `write` may be short and truncate the config.
+            target_config_file
+                .write_all(conf_content.as_bytes())
+                .context("write toml bytes into file")
+                .and_then(|_| target_config_file.sync_all().context("fsync config file"))
+                .context("write config file")?;
+
+            // fsync the parent directory to ensure the directory entry is durable.
+            // Previously this was done only when `creating_tenant` was set, but these
+            // management actions are rare enough to just fsync it always.
+            crashsafe::fsync(target_config_parent)?;
+
+            Ok(())
+        };
+
+        // this function is called from creating the tenant and updating the tenant config, which
+        // would otherwise share this context, so keep it here in one place.
+        do_persist(target_config_path).with_context(|| {
+            format!(
+                "write tenant {tenant_id} config to {}",
+                target_config_path.display()
+            )
+        })
    }

    //
@@ -2710,15 +2784,13 @@ impl Tenant {
            src_timeline.pg_version,
        );

-        let uninitialized_timeline = self
-            .prepare_new_timeline(
-                dst_id,
-                &metadata,
-                timeline_uninit_mark,
-                start_lsn + 1,
-                Some(Arc::clone(src_timeline)),
-            )
-            .await?;
+        let uninitialized_timeline = self.prepare_new_timeline(
+            dst_id,
+            &metadata,
+            timeline_uninit_mark,
+            start_lsn + 1,
+            Some(Arc::clone(src_timeline)),
+        )?;

        let new_timeline = uninitialized_timeline.finish_creation()?;

@@ -2796,15 +2868,13 @@ impl Tenant {
            pgdata_lsn,
            pg_version,
        );
-        let raw_timeline = self
-            .prepare_new_timeline(
-                timeline_id,
-                &new_metadata,
-                timeline_uninit_mark,
-                pgdata_lsn,
-                None,
-            )
-            .await?;
+        let raw_timeline = self.prepare_new_timeline(
+            timeline_id,
+            &new_metadata,
+            timeline_uninit_mark,
+            pgdata_lsn,
+            None,
+        )?;

        let tenant_id = raw_timeline.owning_tenant.tenant_id;
        let unfinished_timeline = raw_timeline.raw_timeline()?;
@@ -2859,7 +2929,6 @@ impl Tenant {
                self.conf,
                self.tenant_id,
                timeline_id,
-                self.generation,
            );
            Some(remote_client)
        } else {
@@ -2875,7 +2944,7 @@ impl Tenant {
    /// at 'disk_consistent_lsn'. After any initial data has been imported, call
    /// `finish_creation` to insert the Timeline into the timelines map and to remove the
    /// uninit mark file.
-    async fn prepare_new_timeline(
+    fn prepare_new_timeline(
        &self,
        new_timeline_id: TimelineId,
        new_metadata: &TimelineMetadata,
@@ -2903,9 +2972,8 @@ impl Tenant {

        timeline_struct.init_empty_layer_map(start_lsn);

-        if let Err(e) = self
-            .create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
-            .await
+        if let Err(e) =
+            self.create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
        {
            error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}");
            cleanup_timeline_directory(uninit_mark);
@@ -2921,7 +2989,7 @@ impl Tenant {
        ))
    }

-    async fn create_timeline_files(
+    fn create_timeline_files(
        &self,
        timeline_path: &Path,
        new_timeline_id: &TimelineId,
@@ -2933,9 +3001,14 @@ impl Tenant {
            anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
        });

-        save_metadata(self.conf, &self.tenant_id, new_timeline_id, new_metadata)
-            .await
-            .context("Failed to create timeline metadata")?;
+        save_metadata(
+            self.conf,
+            &self.tenant_id,
+            new_timeline_id,
+            new_metadata,
+            true,
+        )
+        .context("Failed to create timeline metadata")?;
        Ok(())
    }

@@ -3082,7 +3155,7 @@ pub(crate) enum CreateTenantFilesMode {
    Attach,
 }

-pub(crate) async fn create_tenant_files(
+pub(crate) fn create_tenant_files(
    conf: &'static PageServerConf,
    tenant_conf: TenantConfOpt,
    tenant_id: &TenantId,
@@ -3118,8 +3191,7 @@ pub(crate) async fn create_tenant_files(
        mode,
        &temporary_tenant_dir,
        &target_tenant_directory,
-    )
-    .await;
+    );

    if creation_result.is_err() {
        error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data");
@@ -3137,7 +3209,7 @@ pub(crate) async fn create_tenant_files(
    Ok(target_tenant_directory)
 }

-async fn try_create_target_tenant_dir(
+fn try_create_target_tenant_dir(
    conf: &'static PageServerConf,
    tenant_conf: TenantConfOpt,
    tenant_id: &TenantId,
@@ -3176,7 +3248,7 @@ async fn try_create_target_tenant_dir(
    )
    .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;

-    Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf).await?;
+    Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf, true)?;

    crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
        format!(
@@ -3380,9 +3452,6 @@ pub mod harness {
        pub conf: &'static PageServerConf,
        pub tenant_conf: TenantConf,
        pub tenant_id: TenantId,
-        pub generation: Generation,
-        pub remote_storage: GenericRemoteStorage,
-        pub remote_fs_dir: PathBuf,
    }

    static LOG_HANDLE: OnceCell<()> = OnceCell::new();
@@ -3420,39 +3489,28 @@ pub mod harness {
            fs::create_dir_all(conf.tenant_path(&tenant_id))?;
            fs::create_dir_all(conf.timelines_path(&tenant_id))?;

-            use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
-            let remote_fs_dir = conf.workdir.join("localfs");
-            std::fs::create_dir_all(&remote_fs_dir).unwrap();
-            let config = RemoteStorageConfig {
-                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-                max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
-                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-                max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
-                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
-            };
-            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
-
            Ok(Self {
                conf,
                tenant_conf,
                tenant_id,
-                generation: Generation::new(0xdeadbeef),
-                remote_storage,
-                remote_fs_dir,
            })
        }

        pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
            (
-                self.try_load(&ctx)
+                self.try_load(&ctx, None)
                    .await
                    .expect("failed to load test tenant"),
                ctx,
            )
        }

-        pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
+        pub async fn try_load(
+            &self,
+            ctx: &RequestContext,
+            remote_storage: Option<remote_storage::GenericRemoteStorage>,
+        ) -> anyhow::Result<Arc<Tenant>> {
            let walredo_mgr = Arc::new(TestRedoManager);

            let tenant = Arc::new(Tenant::new(
@@ -3461,8 +3519,7 @@ pub mod harness {
                TenantConfOpt::from(self.tenant_conf),
                walredo_mgr,
                self.tenant_id,
-                self.generation,
-                Some(self.remote_storage.clone()),
+                remote_storage,
            ));
            tenant
                .load(None, ctx)
@@ -3575,10 +3632,7 @@ mod tests {
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;

-        match tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-        {
+        match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) {
            Ok(_) => panic!("duplicate timeline creation should fail"),
            Err(e) => assert_eq!(
                e.to_string(),
@@ -3933,13 +3987,6 @@ mod tests {
                .create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
                .await?;
            make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
-            // so that all uploads finish & we can call harness.load() below again
-            tenant
-                .shutdown(Default::default(), true)
-                .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
-                .await
-                .ok()
-                .unwrap();
        }

        let (tenant, _ctx) = harness.load().await;
@@ -3973,14 +4020,6 @@ mod tests {
                .expect("Should have a local timeline");

            make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
-
-            // so that all uploads finish & we can call harness.load() below again
-            tenant
-                .shutdown(Default::default(), true)
-                .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
-                .await
-                .ok()
-                .unwrap();
        }

        // check that both of them are initially unloaded
@@ -4000,6 +4039,7 @@ mod tests {

    #[tokio::test]
    async fn delta_layer_dumping() -> anyhow::Result<()> {
+        use storage_layer::AsLayerDesc;
        let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -4007,16 +4047,18 @@ mod tests {
        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

        let layer_map = tline.layers.read().await;
-        let level0_deltas = layer_map.layer_map().get_level0_deltas()?;
+        let level0_deltas = layer_map
+            .layer_map()
+            .get_level0_deltas()?
+            .into_iter()
+            .map(|desc| layer_map.get_from_desc(&desc))
+            .collect::<Vec<_>>();

        assert!(!level0_deltas.is_empty());

        for delta in level0_deltas {
-            let delta = layer_map.get_from_desc(&delta);
            // Ensure we are dumping a delta layer here
-            let delta = delta.downcast_delta_layer().unwrap();
-
-            delta.dump(false, &ctx).await.unwrap();
+            assert!(delta.layer_desc().is_delta);
            delta.dump(true, &ctx).await.unwrap();
        }

@@ -4033,13 +4075,6 @@ mod tests {
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
        drop(tline);
-        // so that all uploads finish & we can call harness.try_load() below again
-        tenant
-            .shutdown(Default::default(), true)
-            .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
-            .await
-            .ok()
-            .unwrap();
        drop(tenant);

        let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
@@ -4051,7 +4086,11 @@ mod tests {
        metadata_bytes[8] ^= 1;
        std::fs::write(metadata_path, metadata_bytes)?;

-        let err = harness.try_load(&ctx).await.err().expect("should fail");
+        let err = harness
+            .try_load(&ctx, None)
+            .await
+            .err()
+            .expect("should fail");
        // get all the stack with all .context, not only the last one
        let message = format!("{err:#}");
        let expected = "failed to load metadata";
@@ -4436,9 +4475,8 @@ mod tests {
            .await;

        let initdb_lsn = Lsn(0x20);
-        let utline = tenant
-            .create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
-            .await?;
+        let utline =
+            tenant.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)?;
        let tline = utline.raw_timeline().unwrap();

        // Spawn flush loop now so that we can set the `expect_initdb_optimization`
@@ -4503,15 +4541,9 @@ mod tests {
        let harness = TenantHarness::create(name)?;
        {
            let (tenant, ctx) = harness.load().await;
-            let tline = tenant
-                .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
-                .await?;
+            let tline =
+                tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
            // Keeps uninit mark in place
-            let raw_tline = tline.raw_timeline().unwrap();
-            raw_tline
-                .shutdown(false)
-                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
-                .await;
            std::mem::forget(tline);
        }

--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -13,7 +13,6 @@
 //!
 use crate::page_cache::PAGE_SZ;
 use crate::tenant::block_io::BlockCursor;
-use crate::virtual_file::VirtualFile;
 use std::cmp::min;
 use std::io::{Error, ErrorKind};

@@ -34,7 +33,7 @@ impl<'a> BlockCursor<'a> {
        let mut blknum = (offset / PAGE_SZ as u64) as u32;
        let mut off = (offset % PAGE_SZ as u64) as usize;

-        let mut buf = self.read_blk(blknum).await?;
+        let mut buf = self.read_blk(blknum)?;

        // peek at the first byte, to determine if it's a 1- or 4-byte length
        let first_len_byte = buf[off];
@@ -50,7 +49,7 @@ impl<'a> BlockCursor<'a> {
                // it is split across two pages
                len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
                blknum += 1;
-                buf = self.read_blk(blknum).await?;
+                buf = self.read_blk(blknum)?;
                len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
                off = 4 - thislen;
            } else {
@@ -71,7 +70,7 @@ impl<'a> BlockCursor<'a> {
            if page_remain == 0 {
                // continue on next page
                blknum += 1;
-                buf = self.read_blk(blknum).await?;
+                buf = self.read_blk(blknum)?;
                off = 0;
                page_remain = PAGE_SZ;
            }
@@ -84,24 +83,35 @@ impl<'a> BlockCursor<'a> {
    }
 }

-/// A wrapper of `VirtualFile` that allows users to write blobs.
 ///
-/// If a `BlobWriter` is dropped, the internal buffer will be
-/// discarded. You need to call [`flush_buffer`](Self::flush_buffer)
-/// manually before dropping.
-pub struct BlobWriter<const BUFFERED: bool> {
-    inner: VirtualFile,
-    offset: u64,
-    /// A buffer to save on write calls, only used if BUFFERED=true
-    buf: Vec<u8>,
+/// Abstract trait for a data sink that you can write blobs to.
+///
+pub trait BlobWriter {
+    /// Write a blob of data. Returns the offset that it was written to,
+    /// which can be used to retrieve the data later.
+    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error>;
 }

-impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
-    pub fn new(inner: VirtualFile, start_offset: u64) -> Self {
-        Self {
+///
+/// An implementation of BlobWriter to write blobs to anything that
+/// implements std::io::Write.
+///
+pub struct WriteBlobWriter<W>
+where
+    W: std::io::Write,
+{
+    inner: W,
+    offset: u64,
+}
+
+impl<W> WriteBlobWriter<W>
+where
+    W: std::io::Write,
+{
+    pub fn new(inner: W, start_offset: u64) -> Self {
+        WriteBlobWriter {
            inner,
            offset: start_offset,
-            buf: Vec::with_capacity(Self::CAPACITY),
        }
    }

@@ -109,79 +119,28 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        self.offset
    }

-    const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
-
-    #[inline(always)]
-    /// Writes the given buffer directly to the underlying `VirtualFile`.
-    /// You need to make sure that the internal buffer is empty, otherwise
-    /// data will be written in wrong order.
-    async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> {
-        self.inner.write_all(src_buf).await?;
-        self.offset += src_buf.len() as u64;
-        Ok(())
+    /// Access the underlying Write object.
+    ///
+    /// NOTE: WriteBlobWriter keeps track of the current write offset. If
+    /// you write something directly to the inner Write object, it makes the
+    /// internally tracked 'offset' to go out of sync. So don't do that.
+    pub fn into_inner(self) -> W {
+        self.inner
    }
+}

-    #[inline(always)]
-    /// Flushes the internal buffer to the underlying `VirtualFile`.
-    pub async fn flush_buffer(&mut self) -> Result<(), Error> {
-        self.inner.write_all(&self.buf).await?;
-        self.buf.clear();
-        Ok(())
-    }
-
-    #[inline(always)]
-    /// Writes as much of `src_buf` into the internal buffer as it fits
-    fn write_into_buffer(&mut self, src_buf: &[u8]) -> usize {
-        let remaining = Self::CAPACITY - self.buf.len();
-        let to_copy = src_buf.len().min(remaining);
-        self.buf.extend_from_slice(&src_buf[..to_copy]);
-        self.offset += to_copy as u64;
-        to_copy
-    }
-
-    /// Internal, possibly buffered, write function
-    async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> {
-        if !BUFFERED {
-            assert!(self.buf.is_empty());
-            self.write_all_unbuffered(src_buf).await?;
-            return Ok(());
-        }
-        let remaining = Self::CAPACITY - self.buf.len();
-        // First try to copy as much as we can into the buffer
-        if remaining > 0 {
-            let copied = self.write_into_buffer(src_buf);
-            src_buf = &src_buf[copied..];
-        }
-        // Then, if the buffer is full, flush it out
-        if self.buf.len() == Self::CAPACITY {
-            self.flush_buffer().await?;
-        }
-        // Finally, write the tail of src_buf:
-        // If it wholly fits into the buffer without
-        // completely filling it, then put it there.
-        // If not, write it out directly.
-        if !src_buf.is_empty() {
-            assert_eq!(self.buf.len(), 0);
-            if src_buf.len() < Self::CAPACITY {
-                let copied = self.write_into_buffer(src_buf);
-                // We just verified above that src_buf fits into our internal buffer.
-                assert_eq!(copied, src_buf.len());
-            } else {
-                self.write_all_unbuffered(src_buf).await?;
-            }
-        }
-        Ok(())
-    }
-
-    /// Write a blob of data. Returns the offset that it was written to,
-    /// which can be used to retrieve the data later.
-    pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
+impl<W> BlobWriter for WriteBlobWriter<W>
+where
+    W: std::io::Write,
+{
+    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
        let offset = self.offset;

        if srcbuf.len() < 128 {
            // Short blob. Write a 1-byte length header
            let len_buf = srcbuf.len() as u8;
-            self.write_all(&[len_buf]).await?;
+            self.inner.write_all(&[len_buf])?;
+            self.offset += 1;
        } else {
            // Write a 4-byte length header
            if srcbuf.len() > 0x7fff_ffff {
@@ -192,153 +151,11 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
            }
            let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
            len_buf[0] |= 0x80;
-            self.write_all(&len_buf).await?;
+            self.inner.write_all(&len_buf)?;
+            self.offset += 4;
        }
-        self.write_all(srcbuf).await?;
+        self.inner.write_all(srcbuf)?;
+        self.offset += srcbuf.len() as u64;
        Ok(offset)
    }
 }
-
-impl BlobWriter<true> {
-    /// Access the underlying `VirtualFile`.
-    ///
-    /// This function flushes the internal buffer before giving access
-    /// to the underlying `VirtualFile`.
-    pub async fn into_inner(mut self) -> Result<VirtualFile, Error> {
-        self.flush_buffer().await?;
-        Ok(self.inner)
-    }
-
-    /// Access the underlying `VirtualFile`.
-    ///
-    /// Unlike [`into_inner`](Self::into_inner), this doesn't flush
-    /// the internal buffer before giving access.
-    pub fn into_inner_no_flush(self) -> VirtualFile {
-        self.inner
-    }
-}
-
-impl BlobWriter<false> {
-    /// Access the underlying `VirtualFile`.
-    pub fn into_inner(self) -> VirtualFile {
-        self.inner
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::tenant::block_io::BlockReaderRef;
-    use rand::{Rng, SeedableRng};
-
-    async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
-        let temp_dir = tempfile::tempdir()?;
-        let path = temp_dir.path().join("file");
-
-        // Write part (in block to drop the file)
-        let mut offsets = Vec::new();
-        {
-            let file = VirtualFile::create(&path).await?;
-            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
-            for blob in blobs.iter() {
-                let offs = wtr.write_blob(blob).await?;
-                offsets.push(offs);
-            }
-            // Write out one page worth of zeros so that we can
-            // read again with read_blk
-            let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?;
-            println!("Writing final blob at offs={offs}");
-            wtr.flush_buffer().await?;
-        }
-
-        let file = VirtualFile::open(&path).await?;
-        let rdr = BlockReaderRef::VirtualFile(&file);
-        let rdr = BlockCursor::new(rdr);
-        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
-            let blob_read = rdr.read_blob(*offset).await?;
-            assert_eq!(
-                blob, &blob_read,
-                "mismatch for idx={idx} at offset={offset}"
-            );
-        }
-        Ok(())
-    }
-
-    fn random_array(len: usize) -> Vec<u8> {
-        let mut rng = rand::thread_rng();
-        (0..len).map(|_| rng.gen()).collect::<_>()
-    }
-
-    #[tokio::test]
-    async fn test_one() -> Result<(), Error> {
-        let blobs = &[vec![12, 21, 22]];
-        round_trip_test::<false>(blobs).await?;
-        round_trip_test::<true>(blobs).await?;
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_hello_simple() -> Result<(), Error> {
-        let blobs = &[
-            vec![0, 1, 2, 3],
-            b"Hello, World!".to_vec(),
-            Vec::new(),
-            b"foobar".to_vec(),
-        ];
-        round_trip_test::<false>(blobs).await?;
-        round_trip_test::<true>(blobs).await?;
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_really_big_array() -> Result<(), Error> {
-        let blobs = &[
-            b"test".to_vec(),
-            random_array(10 * PAGE_SZ),
-            b"foobar".to_vec(),
-        ];
-        round_trip_test::<false>(blobs).await?;
-        round_trip_test::<true>(blobs).await?;
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_arrays_inc() -> Result<(), Error> {
-        let blobs = (0..PAGE_SZ / 8)
-            .map(|v| random_array(v * 16))
-            .collect::<Vec<_>>();
-        round_trip_test::<false>(&blobs).await?;
-        round_trip_test::<true>(&blobs).await?;
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_arrays_random_size() -> Result<(), Error> {
-        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
-        let blobs = (0..1024)
-            .map(|_| {
-                let mut sz: u16 = rng.gen();
-                // Make 50% of the arrays small
-                if rng.gen() {
-                    sz |= 63;
-                }
-                random_array(sz.into())
-            })
-            .collect::<Vec<_>>();
-        round_trip_test::<false>(&blobs).await?;
-        round_trip_test::<true>(&blobs).await?;
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_arrays_page_boundary() -> Result<(), Error> {
-        let blobs = &[
-            random_array(PAGE_SZ - 4),
-            random_array(PAGE_SZ - 4),
-            random_array(PAGE_SZ - 4),
-        ];
-        round_trip_test::<false>(blobs).await?;
-        round_trip_test::<true>(blobs).await?;
-        Ok(())
-    }
-}
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -7,7 +7,9 @@ use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
 use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
+use std::fs::File;
 use std::ops::{Deref, DerefMut};
+use std::os::unix::fs::FileExt;

 /// This is implemented by anything that can read 8 kB (PAGE_SZ)
 /// blocks, using the page cache
@@ -37,7 +39,7 @@ pub enum BlockLease<'a> {
    PageReadGuard(PageReadGuard<'static>),
    EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
    #[cfg(test)]
-    Arc(std::sync::Arc<[u8; PAGE_SZ]>),
+    Rc(std::rc::Rc<[u8; PAGE_SZ]>),
 }

 impl From<PageReadGuard<'static>> for BlockLease<'static> {
@@ -47,9 +49,9 @@ impl From<PageReadGuard<'static>> for BlockLease<'static> {
 }

 #[cfg(test)]
-impl<'a> From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'a> {
-    fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self {
-        BlockLease::Arc(value)
+impl<'a> From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease<'a> {
+    fn from(value: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
+        BlockLease::Rc(value)
    }
 }

@@ -61,7 +63,7 @@ impl<'a> Deref for BlockLease<'a> {
            BlockLease::PageReadGuard(v) => v.deref(),
            BlockLease::EphemeralFileMutableTail(v) => v,
            #[cfg(test)]
-            BlockLease::Arc(v) => v.deref(),
+            BlockLease::Rc(v) => v.deref(),
        }
    }
 }
@@ -71,27 +73,25 @@ impl<'a> Deref for BlockLease<'a> {
 ///
 /// Unlike traits, we also support the read function to be async though.
 pub(crate) enum BlockReaderRef<'a> {
-    FileBlockReader(&'a FileBlockReader),
+    FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
+    FileBlockReaderFile(&'a FileBlockReader<std::fs::File>),
    EphemeralFile(&'a EphemeralFile),
    Adapter(Adapter<&'a DeltaLayerInner>),
    #[cfg(test)]
    TestDisk(&'a super::disk_btree::tests::TestDisk),
-    #[cfg(test)]
-    VirtualFile(&'a VirtualFile),
 }

 impl<'a> BlockReaderRef<'a> {
    #[inline(always)]
-    async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        use BlockReaderRef::*;
        match self {
-            FileBlockReader(r) => r.read_blk(blknum).await,
-            EphemeralFile(r) => r.read_blk(blknum).await,
-            Adapter(r) => r.read_blk(blknum).await,
+            FileBlockReaderVirtual(r) => r.read_blk(blknum),
+            FileBlockReaderFile(r) => r.read_blk(blknum),
+            EphemeralFile(r) => r.read_blk(blknum),
+            Adapter(r) => r.read_blk(blknum),
            #[cfg(test)]
            TestDisk(r) => r.read_blk(blknum),
-            #[cfg(test)]
-            VirtualFile(r) => r.read_blk(blknum).await,
        }
    }
 }
@@ -105,7 +105,7 @@ impl<'a> BlockReaderRef<'a> {
 ///
 /// ```no_run
 /// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
-/// # let reader: FileBlockReader = unimplemented!("stub");
+/// # let reader: FileBlockReader<std::fs::File> = unimplemented!("stub");
 /// let cursor = reader.block_cursor();
 /// let buf = cursor.read_blk(1);
 /// // do stuff with 'buf'
@@ -122,9 +122,9 @@ impl<'a> BlockCursor<'a> {
        BlockCursor { reader }
    }
    // Needed by cli
-    pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
+    pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader<VirtualFile>) -> Self {
        BlockCursor {
-            reader: BlockReaderRef::FileBlockReader(reader),
+            reader: BlockReaderRef::FileBlockReaderVirtual(reader),
        }
    }

@@ -134,8 +134,8 @@ impl<'a> BlockCursor<'a> {
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
    #[inline(always)]
-    pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        self.reader.read_blk(blknum).await
+    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+        self.reader.read_blk(blknum)
    }
 }

@@ -143,38 +143,38 @@ impl<'a> BlockCursor<'a> {
 ///
 /// The file is assumed to be immutable. This doesn't provide any functions
 /// for modifying the file, nor for invalidating the cache if it is modified.
-pub struct FileBlockReader {
-    pub file: VirtualFile,
+pub struct FileBlockReader<F> {
+    pub file: F,

    /// Unique ID of this file, used as key in the page cache.
    file_id: page_cache::FileId,
 }

-impl FileBlockReader {
-    pub fn new(file: VirtualFile) -> Self {
+impl<F> FileBlockReader<F>
+where
+    F: FileExt,
+{
+    pub fn new(file: F) -> Self {
        let file_id = page_cache::next_file_id();

        FileBlockReader { file_id, file }
    }

    /// Read a page from the underlying file into given buffer.
-    async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
+    fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
        assert!(buf.len() == PAGE_SZ);
-        self.file
-            .read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
-            .await
+        self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
    }
    /// Read a block.
    ///
    /// Returns a "lease" object that can be used to
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
-    pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        let cache = page_cache::get();
        loop {
            match cache
                .read_immutable_buf(self.file_id, blknum)
-                .await
                .map_err(|e| {
                    std::io::Error::new(
                        std::io::ErrorKind::Other,
@@ -184,7 +184,7 @@ impl FileBlockReader {
                ReadBufResult::Found(guard) => break Ok(guard.into()),
                ReadBufResult::NotFound(mut write_guard) => {
                    // Read the page from disk into the buffer
-                    self.fill_buffer(write_guard.deref_mut(), blknum).await?;
+                    self.fill_buffer(write_guard.deref_mut(), blknum)?;
                    write_guard.mark_valid();

                    // Swap for read lock
@@ -195,9 +195,15 @@ impl FileBlockReader {
    }
 }

-impl BlockReader for FileBlockReader {
+impl BlockReader for FileBlockReader<File> {
    fn block_cursor(&self) -> BlockCursor<'_> {
-        BlockCursor::new(BlockReaderRef::FileBlockReader(self))
+        BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
+    }
+}
+
+impl BlockReader for FileBlockReader<VirtualFile> {
+    fn block_cursor(&self) -> BlockCursor<'_> {
+        BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
    }
 }

--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -262,7 +262,7 @@ where
        let block_cursor = self.reader.block_cursor();
        while let Some((node_blknum, opt_iter)) = stack.pop() {
            // Locate the node.
-            let node_buf = block_cursor.read_blk(self.start_blk + node_blknum).await?;
+            let node_buf = block_cursor.read_blk(self.start_blk + node_blknum)?;

            let node = OnDiskNode::deparse(node_buf.as_ref())?;
            let prefix_len = node.prefix_len as usize;
@@ -357,7 +357,7 @@ where
        let block_cursor = self.reader.block_cursor();

        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
-            let blk = block_cursor.read_blk(self.start_blk + blknum).await?;
+            let blk = block_cursor.read_blk(self.start_blk + blknum)?;
            let buf: &[u8] = blk.as_ref();
            let node = OnDiskNode::<L>::deparse(buf)?;

@@ -704,7 +704,7 @@ pub(crate) mod tests {
        pub(crate) fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
            let mut buf = [0u8; PAGE_SZ];
            buf.copy_from_slice(&self.blocks[blknum as usize]);
-            Ok(std::sync::Arc::new(buf).into())
+            Ok(std::rc::Rc::new(buf).into())
        }
    }
    impl BlockReader for TestDisk {
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -9,6 +9,7 @@ use std::cmp::min;
 use std::fs::OpenOptions;
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
+use std::os::unix::prelude::FileExt;
 use std::path::PathBuf;
 use std::sync::atomic::AtomicU64;
 use tracing::*;
@@ -28,7 +29,7 @@ pub struct EphemeralFile {
 }

 impl EphemeralFile {
-    pub async fn create(
+    pub fn create(
        conf: &PageServerConf,
        tenant_id: TenantId,
        timeline_id: TimelineId,
@@ -44,8 +45,7 @@ impl EphemeralFile {
        let file = VirtualFile::open_with_options(
            &filename,
            OpenOptions::new().read(true).write(true).create(true),
-        )
-        .await?;
+        )?;

        Ok(EphemeralFile {
            page_cache_file_id: page_cache::next_file_id(),
@@ -61,14 +61,13 @@ impl EphemeralFile {
        self.len
    }

-    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
+    pub(crate) fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
        if flushed_blknums.contains(&(blknum as u64)) {
            let cache = page_cache::get();
            loop {
                match cache
                    .read_immutable_buf(self.page_cache_file_id, blknum)
-                    .await
                    .map_err(|e| {
                        std::io::Error::new(
                            std::io::ErrorKind::Other,
@@ -88,8 +87,7 @@ impl EphemeralFile {
                        let buf: &mut [u8] = write_guard.deref_mut();
                        debug_assert_eq!(buf.len(), PAGE_SZ);
                        self.file
-                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
-                            .await?;
+                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
                        write_guard.mark_valid();

                        // Swap for read lock
@@ -129,26 +127,18 @@ impl EphemeralFile {
                    self.off += n;
                    src_remaining = &src_remaining[n..];
                    if self.off == PAGE_SZ {
-                        match self
-                            .ephemeral_file
-                            .file
-                            .write_all_at(
-                                &self.ephemeral_file.mutable_tail,
-                                self.blknum as u64 * PAGE_SZ as u64,
-                            )
-                            .await
-                        {
+                        match self.ephemeral_file.file.write_all_at(
+                            &self.ephemeral_file.mutable_tail,
+                            self.blknum as u64 * PAGE_SZ as u64,
+                        ) {
                            Ok(_) => {
                                // Pre-warm the page cache with what we just wrote.
                                // This isn't necessary for coherency/correctness, but it's how we've always done it.
                                let cache = page_cache::get();
-                                match cache
-                                    .read_immutable_buf(
-                                        self.ephemeral_file.page_cache_file_id,
-                                        self.blknum,
-                                    )
-                                    .await
-                                {
+                                match cache.read_immutable_buf(
+                                    self.ephemeral_file.page_cache_file_id,
+                                    self.blknum,
+                                ) {
                                    Ok(page_cache::ReadBufResult::Found(_guard)) => {
                                        // This function takes &mut self, so, it shouldn't be possible to reach this point.
                                        unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
@@ -231,8 +221,9 @@ pub fn is_ephemeral_file(filename: &str) -> bool {

 impl Drop for EphemeralFile {
    fn drop(&mut self) {
-        // There might still be pages in the [`crate::page_cache`] for this file.
-        // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
+        // drop all pages from page cache
+        let cache = page_cache::get();
+        cache.drop_buffers_for_immutable(self.page_cache_file_id);

        // unlink the file
        let res = std::fs::remove_file(&self.file.path);
@@ -287,7 +278,7 @@ mod tests {
    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
        let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;

-        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;

        let pos_foo = file.write_blob(b"foo").await?;
        assert_eq!(
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -639,147 +639,10 @@ impl LayerMap {
        }

        println!("historic_layers:");
-        for layer in self.iter_historic_layers() {
-            layer.dump(verbose, ctx)?;
+        for desc in self.iter_historic_layers() {
+            desc.dump();
        }
        println!("End dump LayerMap");
        Ok(())
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::LayerMap;
-    use crate::tenant::storage_layer::LayerFileName;
-    use std::str::FromStr;
-    use std::sync::Arc;
-
-    mod l0_delta_layers_updated {
-
-        use crate::tenant::{
-            storage_layer::{AsLayerDesc, PersistentLayerDesc},
-            timeline::layer_manager::LayerFileManager,
-        };
-
-        use super::*;
-
-        struct LayerObject(PersistentLayerDesc);
-
-        impl AsLayerDesc for LayerObject {
-            fn layer_desc(&self) -> &PersistentLayerDesc {
-                &self.0
-            }
-        }
-
-        impl LayerObject {
-            fn new(desc: PersistentLayerDesc) -> Self {
-                LayerObject(desc)
-            }
-        }
-
-        type TestLayerFileManager = LayerFileManager<LayerObject>;
-
-        #[test]
-        fn for_full_range_delta() {
-            // l0_delta_layers are used by compaction, and should observe all buffered updates
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
-                 true
-             )
-        }
-
-        #[test]
-        fn for_non_full_range_delta() {
-            // has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
-                 // because not full range
-                 false
-             )
-        }
-
-        #[test]
-        fn for_image() {
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
-                 // code only checks if it is a full range layer, doesn't care about images, which must
-                 // mean we should in practice never have full range images
-                 false
-             )
-        }
-
-        #[test]
-        fn replacing_missing_l0_is_notfound() {
-            // original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
-            // however only happen for precondition failures.
-
-            let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
-            let layer = LayerFileName::from_str(layer).unwrap();
-            let layer = PersistentLayerDesc::from(layer);
-
-            // same skeletan construction; see scenario below
-            let not_found = Arc::new(LayerObject::new(layer.clone()));
-            let new_version = Arc::new(LayerObject::new(layer));
-
-            // after the immutable storage state refactor, the replace operation
-            // will not use layer map any more. We keep it here for consistency in test cases
-            // and can remove it in the future.
-            let _map = LayerMap::default();
-
-            let mut mapping = TestLayerFileManager::new();
-
-            mapping
-                .replace_and_verify(not_found, new_version)
-                .unwrap_err();
-        }
-
-        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
-            let name = LayerFileName::from_str(layer_name).unwrap();
-            let skeleton = PersistentLayerDesc::from(name);
-
-            let remote = Arc::new(LayerObject::new(skeleton.clone()));
-            let downloaded = Arc::new(LayerObject::new(skeleton));
-
-            let mut map = LayerMap::default();
-            let mut mapping = LayerFileManager::new();
-
-            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
-            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
-            assert_eq!(remote.layer_desc(), downloaded.layer_desc());
-
-            let expected_in_counts = (1, usize::from(expected_l0));
-
-            map.batch_update()
-                .insert_historic(remote.layer_desc().clone());
-            mapping.insert(remote.clone());
-            assert_eq!(
-                count_layer_in(&map, remote.layer_desc()),
-                expected_in_counts
-            );
-
-            mapping
-                .replace_and_verify(remote, downloaded.clone())
-                .expect("name derived attributes are the same");
-            assert_eq!(
-                count_layer_in(&map, downloaded.layer_desc()),
-                expected_in_counts
-            );
-
-            map.batch_update().remove_historic(downloaded.layer_desc());
-            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
-        }
-
-        fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
-            let historic = map
-                .iter_historic_layers()
-                .filter(|x| x.key() == layer.key())
-                .count();
-            let l0s = map
-                .get_level0_deltas()
-                .expect("why does this return a result");
-            let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
-
-            (historic, l0)
-        }
-    }
-}
--- a/pageserver/src/tenant/manifest.rs
+++ b/pageserver/src/tenant/manifest.rs
@@ -0,0 +1,325 @@
+//! This module contains the encoding and decoding of the local manifest file.
+//!
+//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
+//! records the state of the storage engine. It contains a snapshot of the
+//! state and all operations following that snapshot. The file begins with a
+//! header recording MANIFEST version number. After that, it contains a snapshot.
+//! The snapshot is followed by a list of operations. Each operation is a list
+//! of records. Each record is either an addition or a removal of a layer.
+//!
+//! With MANIFEST, we can:
+//!
+//! 1. recover state quickly by reading the file, potentially boosting the
+//!    startup speed.
+//! 2. ensure all operations are atomic and avoid corruption, solving issues
+//!    like redundant image layers and preparing us for future compaction
+//!    strategies.
+//!
+//! There is also a format for storing all layer files on S3, called
+//! `index_part.json`. Compared with index_part, MANIFEST is a WAL which
+//! records all operations as logs, and therefore we can easily replay the
+//! operations when recovering from crash, while ensuring those operations
+//! are atomic upon restart.
+//!
+//! Currently, this is not used in the system. Future refactors will ensure
+//! the storage state will be recorded in this file, and the system can be
+//! recovered from this file. This is tracked in
+//! <https://github.com/neondatabase/neon/issues/4418>
+
+use std::io::{self, Read, Write};
+
+use crate::virtual_file::VirtualFile;
+use anyhow::Result;
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+use crc32c::crc32c;
+use serde::{Deserialize, Serialize};
+use tracing::log::warn;
+use utils::lsn::Lsn;
+
+use super::storage_layer::PersistentLayerDesc;
+
+pub struct Manifest {
+    file: VirtualFile,
+}
+
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub struct Snapshot {
+    pub layers: Vec<PersistentLayerDesc>,
+}
+
+/// By default serde encodes this as an externally tagged enum, so a record looks
+/// like `{ "AddLayer": { ... } }`.
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub enum Record {
+    AddLayer(PersistentLayerDesc),
+    RemoveLayer(PersistentLayerDesc),
+}
+
+/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
+const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
+const MANIFEST_VERSION: u64 = 1;
+
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub struct ManifestHeader {
+    magic_number: u64,
+    version: u64,
+}
+
+const MANIFEST_HEADER_LEN: usize = 16;
+
+impl ManifestHeader {
+    fn encode(&self) -> BytesMut {
+        let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
+        buf.put_u64(self.magic_number);
+        buf.put_u64(self.version);
+        buf
+    }
+
+    fn decode(mut buf: &[u8]) -> Self {
+        assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
+        Self {
+            magic_number: buf.get_u64(),
+            version: buf.get_u64(),
+        }
+    }
+}
+
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub enum Operation {
+    /// A snapshot of the current state.
+    ///
+    /// Lsn field represents the LSN that is persisted to disk for this snapshot.
+    Snapshot(Snapshot, Lsn),
+    /// An atomic operation that changes the state.
+    ///
+    /// Lsn field represents the LSN that is persisted to disk after the operation is done.
+    /// This will only change when new L0 is flushed to the disk.
+    Operation(Vec<Record>, Lsn),
+}
+
+struct RecordHeader {
+    size: u32,
+    checksum: u32,
+}
+
+const RECORD_HEADER_LEN: usize = 8;
+
+impl RecordHeader {
+    fn encode(&self) -> BytesMut {
+        let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN);
+        buf.put_u32(self.size);
+        buf.put_u32(self.checksum);
+        buf
+    }
+
+    fn decode(mut buf: &[u8]) -> Self {
+        assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
+        Self {
+            size: buf.get_u32(),
+            checksum: buf.get_u32(),
+        }
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum ManifestLoadError {
+    #[error("manifest header is corrupted")]
+    CorruptedManifestHeader,
+    #[error("unsupported manifest version: got {0}, expected {1}")]
+    UnsupportedVersion(u64, u64),
+    #[error("error when decoding record: {0}")]
+    DecodeRecord(serde_json::Error),
+    #[error("I/O error: {0}")]
+    Io(io::Error),
+}
+
+#[must_use = "Should check if the manifest is partially corrupted"]
+pub struct ManifestPartiallyCorrupted(bool);
+
+impl Manifest {
+    /// Create a new manifest by writing the manifest header and a snapshot record to the given file.
+    pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
+        let mut manifest = Self { file };
+        manifest.append_manifest_header(ManifestHeader {
+            magic_number: MANIFEST_MAGIC_NUMBER,
+            version: MANIFEST_VERSION,
+        })?;
+        manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
+        Ok(manifest)
+    }
+
+    /// Load a manifest. Returns the manifest and a list of operations. If the manifest is partially
+    /// corrupted, the returned flag is set to true and the caller is responsible for reconstructing
+    /// a new manifest and backing up the current one.
+    pub fn load(
+        mut file: VirtualFile,
+    ) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
+        let mut buf = vec![];
+        file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
+
+        // Read manifest header
+        let mut buf = Bytes::from(buf);
+        if buf.remaining() < MANIFEST_HEADER_LEN {
+            return Err(ManifestLoadError::CorruptedManifestHeader);
+        }
+        let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
+        buf.advance(MANIFEST_HEADER_LEN);
+        if header.version != MANIFEST_VERSION {
+            return Err(ManifestLoadError::UnsupportedVersion(
+                header.version,
+                MANIFEST_VERSION,
+            ));
+        }
+
+        // Read operations
+        let mut operations = Vec::new();
+        let corrupted = loop {
+            if buf.remaining() == 0 {
+                break false;
+            }
+            if buf.remaining() < RECORD_HEADER_LEN {
+                warn!("incomplete header when decoding manifest, could be corrupted");
+                break true;
+            }
+            let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
+            let size = size as usize;
+            buf.advance(RECORD_HEADER_LEN);
+            if buf.remaining() < size {
+                warn!("incomplete data when decoding manifest, could be corrupted");
+                break true;
+            }
+            let data = &buf[..size];
+            if crc32c(data) != checksum {
+                warn!("checksum mismatch when decoding manifest, could be corrupted");
+                break true;
+            }
+            // if the following decode fails, we cannot use the manifest or safely ignore any record.
+            operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
+            buf.advance(size);
+        };
+        Ok((
+            Self { file },
+            operations,
+            ManifestPartiallyCorrupted(corrupted),
+        ))
+    }
+
+    fn append_data(&mut self, data: &[u8]) -> Result<()> {
+        if data.len() >= u32::MAX as usize {
+            panic!("data too large");
+        }
+        let header = RecordHeader {
+            size: data.len() as u32,
+            checksum: crc32c(data),
+        };
+        let header = header.encode();
+        self.file.write_all(&header)?;
+        self.file.write_all(data)?;
+        self.file.sync_all()?;
+        Ok(())
+    }
+
+    fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
+        let encoded = header.encode();
+        self.file.write_all(&encoded)?;
+        Ok(())
+    }
+
+    /// Add an operation to the manifest. The operation is appended to the end of the file,
+    /// and the file is then fsynced.
+    pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
+        let encoded = Vec::from(serde_json::to_string(&operation)?);
+        self.append_data(&encoded)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fs::OpenOptions;
+
+    use crate::repository::Key;
+
+    use super::*;
+
+    #[test]
+    fn test_read_manifest() {
+        let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
+        std::fs::create_dir_all(&testdir).unwrap();
+        let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
+        let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
+        let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
+        let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
+        let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
+
+        // Write a manifest with a snapshot and some operations
+        let snapshot = Snapshot {
+            layers: vec![layer1, layer2],
+        };
+        let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
+        manifest
+            .append_operation(Operation::Operation(
+                vec![Record::AddLayer(layer3.clone())],
+                Lsn::from(1),
+            ))
+            .unwrap();
+        drop(manifest);
+
+        // Open the second time and write
+        let file = VirtualFile::open_with_options(
+            &testdir.join("MANIFEST"),
+            OpenOptions::new()
+                .read(true)
+                .write(true)
+                .create_new(false)
+                .truncate(false),
+        )
+        .unwrap();
+        let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
+        assert!(!corrupted.0);
+        assert_eq!(operations.len(), 2);
+        assert_eq!(
+            &operations[0],
+            &Operation::Snapshot(snapshot.clone(), Lsn::from(0))
+        );
+        assert_eq!(
+            &operations[1],
+            &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
+        );
+        manifest
+            .append_operation(Operation::Operation(
+                vec![
+                    Record::RemoveLayer(layer3.clone()),
+                    Record::AddLayer(layer4.clone()),
+                ],
+                Lsn::from(2),
+            ))
+            .unwrap();
+        drop(manifest);
+
+        // Open the third time and verify
+        let file = VirtualFile::open_with_options(
+            &testdir.join("MANIFEST"),
+            OpenOptions::new()
+                .read(true)
+                .write(true)
+                .create_new(false)
+                .truncate(false),
+        )
+        .unwrap();
+        let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
+        assert!(!corrupted.0);
+        assert_eq!(operations.len(), 3);
+        assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
+        assert_eq!(
+            &operations[1],
+            &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
+        );
+        assert_eq!(
+            &operations[2],
+            &Operation::Operation(
+                vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
+                Lsn::from(2)
+            )
+        );
+    }
+}
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -8,13 +8,14 @@
 //!
 //! [`remote_timeline_client`]: super::remote_timeline_client

-use std::io::{self};
+use std::fs::{File, OpenOptions};
+use std::io::{self, Write};

-use anyhow::{ensure, Context};
+use anyhow::{bail, ensure, Context};
 use serde::{de::Error, Deserialize, Serialize, Serializer};
 use thiserror::Error;
+use tracing::info_span;
 use utils::bin_ser::SerializeError;
-use utils::crashsafe::path_with_suffix_extension;
 use utils::{
    bin_ser::BeSer,
    id::{TenantId, TimelineId},
@@ -23,7 +24,6 @@ use utils::{

 use crate::config::PageServerConf;
 use crate::virtual_file::VirtualFile;
-use crate::TEMP_FILE_SUFFIX;

 /// Use special format number to enable backward compatibility.
 const METADATA_FORMAT_VERSION: u16 = 4;
@@ -230,23 +230,6 @@ impl TimelineMetadata {
    pub fn pg_version(&self) -> u32 {
        self.body.pg_version
    }
-
-    // Checksums make it awkward to build a valid instance by hand.  This helper
-    // provides a TimelineMetadata with a valid checksum in its header.
-    #[cfg(test)]
-    pub fn example() -> Self {
-        let instance = Self::new(
-            "0/16960E8".parse::<Lsn>().unwrap(),
-            None,
-            None,
-            Lsn::from_hex("00000000").unwrap(),
-            Lsn::from_hex("00000000").unwrap(),
-            Lsn::from_hex("00000000").unwrap(),
-            0,
-        );
-        let bytes = instance.to_bytes().unwrap();
-        Self::from_bytes(&bytes).unwrap()
-    }
 }

 impl<'de> Deserialize<'de> for TimelineMetadata {
@@ -272,19 +255,38 @@ impl Serialize for TimelineMetadata {
 }

 /// Save timeline metadata to file
-#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))]
-pub async fn save_metadata(
+pub fn save_metadata(
    conf: &'static PageServerConf,
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    data: &TimelineMetadata,
+    first_save: bool,
 ) -> anyhow::Result<()> {
+    let _enter = info_span!("saving metadata").entered();
    let path = conf.metadata_path(tenant_id, timeline_id);
-    let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
-    let metadata_bytes = data.to_bytes().context("serialize metadata")?;
-    VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
-        .await
-        .context("write metadata")?;
+    // use OpenOptions to ensure file presence is consistent with first_save
+    let mut file = VirtualFile::open_with_options(
+        &path,
+        OpenOptions::new().write(true).create_new(first_save),
+    )
+    .context("open_with_options")?;
+
+    let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
+
+    if file.write(&metadata_bytes)? != metadata_bytes.len() {
+        bail!("Could not write all the metadata bytes in a single call");
+    }
+    file.sync_all()?;
+
+    // fsync the parent directory to ensure the directory entry is durable
+    if first_save {
+        let timeline_dir = File::open(
+            path.parent()
+                .expect("Metadata should always have a parent dir"),
+        )?;
+        timeline_dir.sync_all()?;
+    }
+
    Ok(())
 }

--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1,10 +1,9 @@
 //! This module acts as a switchboard to access different repositories managed by this
 //! page server.

-use rand::{distributions::Alphanumeric, Rng};
 use std::collections::{hash_map, HashMap};
 use std::ffi::OsStr;
-use std::path::{Path, PathBuf};
+use std::path::Path;
 use std::sync::Arc;
 use tokio::fs;

@@ -12,7 +11,6 @@ use anyhow::Context;
 use once_cell::sync::Lazy;
 use tokio::sync::RwLock;
 use tokio::task::JoinSet;
-use tokio_util::sync::CancellationToken;
 use tracing::*;

 use remote_storage::GenericRemoteStorage;
@@ -20,16 +18,13 @@ use utils::crashsafe;

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::control_plane_client::ControlPlaneClient;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
-use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};

-use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext::PathExt;
-use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};

 use super::delete::DeleteTenantError;
@@ -64,39 +59,6 @@ impl TenantsMap {
    }
 }

-/// This is "safe" in that that it won't leave behind a partially deleted directory
-/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
-/// the contents.
-///
-/// This is pageserver-specific, as it relies on future processes after a crash to check
-/// for TEMP_FILE_SUFFIX when loading things.
-async fn safe_remove_tenant_dir_all(path: impl AsRef<Path>) -> std::io::Result<()> {
-    let tmp_path = safe_rename_tenant_dir(path).await?;
-    fs::remove_dir_all(tmp_path).await
-}
-
-async fn safe_rename_tenant_dir(path: impl AsRef<Path>) -> std::io::Result<PathBuf> {
-    let parent = path
-        .as_ref()
-        .parent()
-        // It is invalid to call this function with a relative path.  Tenant directories
-        // should always have a parent.
-        .ok_or(std::io::Error::new(
-            std::io::ErrorKind::InvalidInput,
-            "Path must be absolute",
-        ))?;
-    let rand_suffix = rand::thread_rng()
-        .sample_iter(&Alphanumeric)
-        .take(8)
-        .map(char::from)
-        .collect::<String>()
-        + TEMP_FILE_SUFFIX;
-    let tmp_path = path_with_suffix_extension(&path, &rand_suffix);
-    fs::rename(&path, &tmp_path).await?;
-    fs::File::open(parent).await?.sync_all().await?;
-    Ok(tmp_path)
-}
-
 static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));

 /// Initialize repositories with locally available timelines.
@@ -107,21 +69,12 @@ pub async fn init_tenant_mgr(
    conf: &'static PageServerConf,
    resources: TenantSharedResources,
    init_order: InitializationOrder,
-    cancel: CancellationToken,
 ) -> anyhow::Result<()> {
    // Scan local filesystem for attached tenants
    let tenants_dir = conf.tenants_path();

    let mut tenants = HashMap::new();

-    // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
-    let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
-        Some(client.re_attach().await?)
-    } else {
-        info!("Control plane API not configured, tenant generations are disabled");
-        None
-    };
-
    let mut dir_entries = fs::read_dir(&tenants_dir)
        .await
        .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
@@ -138,8 +91,6 @@ pub async fn init_tenant_mgr(
                        "Found temporary tenant directory, removing: {}",
                        tenant_dir_path.display()
                    );
-                    // No need to use safe_remove_tenant_dir_all because this is already
-                    // a temporary path
                    if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
                        error!(
                            "Failed to remove temporary directory '{}': {:?}",
@@ -171,53 +122,9 @@ pub async fn init_tenant_mgr(
                        continue;
                    }

-                    let tenant_id = match tenant_dir_path
-                        .file_name()
-                        .and_then(OsStr::to_str)
-                        .unwrap_or_default()
-                        .parse::<TenantId>()
-                    {
-                        Ok(id) => id,
-                        Err(_) => {
-                            warn!(
-                                "Invalid tenant path (garbage in our repo directory?): {}",
-                                tenant_dir_path.display()
-                            );
-                            continue;
-                        }
-                    };
-
-                    let generation = if let Some(generations) = &tenant_generations {
-                        // We have a generation map: treat it as the authority for whether
-                        // this tenant is really attached.
-                        if let Some(gen) = generations.get(&tenant_id) {
-                            *gen
-                        } else {
-                            info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response");
-                            if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
-                                error!(
-                                    "Failed to remove detached tenant directory '{}': {:?}",
-                                    tenant_dir_path.display(),
-                                    e
-                                );
-                            }
-                            continue;
-                        }
-                    } else {
-                        // Legacy mode: no generation information, any tenant present
-                        // on local disk may activate
-                        info!(
-                            "Starting tenant {} in legacy mode, no generation",
-                            tenant_dir_path.display()
-                        );
-                        Generation::none()
-                    };
-
                    match schedule_local_tenant_processing(
                        conf,
-                        tenant_id,
                        &tenant_dir_path,
-                        generation,
                        resources.clone(),
                        Some(init_order.clone()),
                        &TENANTS,
@@ -251,12 +158,9 @@ pub async fn init_tenant_mgr(
    Ok(())
 }

-#[allow(clippy::too_many_arguments)]
 pub(crate) fn schedule_local_tenant_processing(
    conf: &'static PageServerConf,
-    tenant_id: TenantId,
    tenant_path: &Path,
-    generation: Generation,
    resources: TenantSharedResources,
    init_order: Option<InitializationOrder>,
    tenants: &'static tokio::sync::RwLock<TenantsMap>,
@@ -277,6 +181,15 @@ pub(crate) fn schedule_local_tenant_processing(
        "Cannot load tenant from empty directory {tenant_path:?}"
    );

+    let tenant_id = tenant_path
+        .file_name()
+        .and_then(OsStr::to_str)
+        .unwrap_or_default()
+        .parse::<TenantId>()
+        .with_context(|| {
+            format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}")
+        })?;
+
    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
    anyhow::ensure!(
        !conf.tenant_ignore_mark_file_path(&tenant_id).exists(),
@@ -289,7 +202,6 @@ pub(crate) fn schedule_local_tenant_processing(
            match Tenant::spawn_attach(
                conf,
                tenant_id,
-                generation,
                resources.broker_client,
                tenants,
                remote_storage,
@@ -312,9 +224,7 @@ pub(crate) fn schedule_local_tenant_processing(
    } else {
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
        // Start loading the tenant into memory. It will initially be in Loading state.
-        Tenant::spawn_load(
-            conf, tenant_id, generation, resources, init_order, tenants, ctx,
-        )
+        Tenant::spawn_load(conf, tenant_id, resources, init_order, tenants, ctx)
    };
    Ok(tenant)
 }
@@ -437,16 +347,15 @@ pub async fn create_tenant(
    conf: &'static PageServerConf,
    tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
-    generation: Generation,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
    ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
-    tenant_map_insert(tenant_id, || async {
+    tenant_map_insert(tenant_id, || {
        // We're holding the tenants lock in write mode while doing local IO.
        // If this section ever becomes contentious, introduce a new `TenantState::Creating`
        // and do the work in that state.
-        let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
+        let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create)?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

@@ -455,8 +364,7 @@ pub async fn create_tenant(
            remote_storage,
        };
        let created_tenant =
-            schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
-                generation, tenant_resources, None, &TENANTS, ctx)?;
+            schedule_local_tenant_processing(conf, &tenant_directory, tenant_resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -486,8 +394,7 @@ pub async fn set_new_tenant_config(
    let tenant = get_tenant(tenant_id, true).await?;

    let tenant_config_path = conf.tenant_config_path(&tenant_id);
-    Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf)
-        .await
+    Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf, false)
        .map_err(SetNewTenantConfigError::Persist)?;
    tenant.set_new_tenant_config(new_tenant_conf);
    Ok(())
@@ -503,8 +410,6 @@ pub enum GetTenantError {

 /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
 /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
-///
-/// This method is cancel-safe.
 pub async fn get_tenant(
    tenant_id: TenantId,
    active_only: bool,
@@ -564,24 +469,7 @@ pub async fn detach_tenant(
    tenant_id: TenantId,
    detach_ignored: bool,
 ) -> Result<(), TenantStateError> {
-    let tmp_path = detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await?;
-    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
-    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
-    let task_tenant_id = None;
-    task_mgr::spawn(
-        task_mgr::BACKGROUND_RUNTIME.handle(),
-        TaskKind::MgmtRequest,
-        task_tenant_id,
-        None,
-        "tenant_files_delete",
-        false,
-        async move {
-            fs::remove_dir_all(tmp_path.as_path())
-                .await
-                .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
-        },
-    );
-    Ok(())
+    detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await
 }

 async fn detach_tenant0(
@@ -589,16 +477,20 @@ async fn detach_tenant0(
    tenants: &tokio::sync::RwLock<TenantsMap>,
    tenant_id: TenantId,
    detach_ignored: bool,
-) -> Result<PathBuf, TenantStateError> {
-    let tenant_dir_rename_operation = |tenant_id_to_clean| async move {
+) -> Result<(), TenantStateError> {
+    let local_files_cleanup_operation = |tenant_id_to_clean| async move {
        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
-        safe_rename_tenant_dir(&local_tenant_directory)
+        fs::remove_dir_all(&local_tenant_directory)
            .await
-            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
+            .with_context(|| {
+                format!("local tenant directory {local_tenant_directory:?} removal")
+            })?;
+        Ok(())
    };

    let removal_result =
-        remove_tenant_from_memory(tenants, tenant_id, tenant_dir_rename_operation(tenant_id)).await;
+        remove_tenant_from_memory(tenants, tenant_id, local_files_cleanup_operation(tenant_id))
+            .await;

    // Ignored tenants are not present in memory and will bail the removal from memory operation.
    // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
@@ -606,10 +498,10 @@ async fn detach_tenant0(
        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
        if tenant_ignore_mark.exists() {
            info!("Detaching an ignored tenant");
-            let tmp_path = tenant_dir_rename_operation(tenant_id)
+            local_files_cleanup_operation(tenant_id)
                .await
-                .with_context(|| format!("Ignored tenant {tenant_id} local directory rename"))?;
-            return Ok(tmp_path);
+                .with_context(|| format!("Ignored tenant {tenant_id} local files cleanup"))?;
+            return Ok(());
        }
    }

@@ -619,12 +511,11 @@ async fn detach_tenant0(
 pub async fn load_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
-    generation: Generation,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
-    tenant_map_insert(tenant_id, || async {
+    tenant_map_insert(tenant_id, || {
        let tenant_path = conf.tenant_path(&tenant_id);
        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
        if tenant_ignore_mark.exists() {
@@ -636,7 +527,7 @@ pub async fn load_tenant(
            broker_client,
            remote_storage,
        };
-        let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None,  &TENANTS, ctx)
+        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path,  resources, None,  &TENANTS, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;
@@ -700,14 +591,13 @@ pub async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapLis
 pub async fn attach_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
-    generation: Generation,
    tenant_conf: TenantConfOpt,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: GenericRemoteStorage,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
-    tenant_map_insert(tenant_id, || async {
-        let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
+    tenant_map_insert(tenant_id, || {
+        let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

@@ -722,7 +612,7 @@ pub async fn attach_tenant(
            broker_client,
            remote_storage: Some(remote_storage),
        };
-        let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -755,13 +645,12 @@ pub enum TenantMapInsertError {
 ///
 /// NB: the closure should return quickly because the current implementation of tenants map
 /// serializes access through an `RwLock`.
-async fn tenant_map_insert<F, R>(
+async fn tenant_map_insert<F>(
    tenant_id: TenantId,
    insert_fn: F,
 ) -> Result<Arc<Tenant>, TenantMapInsertError>
 where
-    F: FnOnce() -> R,
-    R: std::future::Future<Output = anyhow::Result<Arc<Tenant>>>,
+    F: FnOnce() -> anyhow::Result<Arc<Tenant>>,
 {
    let mut guard = TENANTS.write().await;
    let m = match &mut *guard {
@@ -774,7 +663,7 @@ where
            tenant_id,
            e.get().current_state(),
        )),
-        hash_map::Entry::Vacant(v) => match insert_fn().await {
+        hash_map::Entry::Vacant(v) => match insert_fn() {
            Ok(tenant) => {
                v.insert(tenant.clone());
                Ok(tenant)
--- a/pageserver/src/tenant/par_fsync.rs
+++ b/pageserver/src/tenant/par_fsync.rs
@@ -4,9 +4,10 @@ use std::{
    sync::atomic::{AtomicUsize, Ordering},
 };

+use crate::virtual_file::VirtualFile;
+
 fn fsync_path(path: &Path) -> io::Result<()> {
-    // TODO use VirtualFile::fsync_all once we fully go async.
-    let file = std::fs::File::open(path)?;
+    let file = VirtualFile::open(path)?;
    file.sync_all()
 }

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -163,8 +163,6 @@
 //!   - download their remote [`IndexPart`]s
 //!   - create `Timeline` struct and a `RemoteTimelineClient`
 //!   - initialize the client's upload queue with its `IndexPart`
-//!   - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
-//!     for layers that are referenced by `IndexPart` but not present locally
 //!   - schedule uploads for layers that are only present locally.
 //!   - if the remote `IndexPart`'s metadata was newer than the metadata in
 //!     the local filesystem, write the remote metadata to the local filesystem
@@ -216,7 +214,7 @@ use utils::backoff::{
 };

 use std::collections::{HashMap, VecDeque};
-use std::path::{Path, PathBuf};
+use std::path::Path;
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};

@@ -233,9 +231,9 @@ use crate::metrics::{
 };
 use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+pub(crate) use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::storage_layer::AsLayerDesc;
 use crate::tenant::upload_queue::Delete;
-use crate::tenant::TIMELINES_SEGMENT_NAME;
 use crate::{
    config::PageServerConf,
    task_mgr,
@@ -251,9 +249,8 @@ use utils::id::{TenantId, TimelineId};

 use self::index::IndexPart;

-use super::storage_layer::LayerFileName;
+use super::storage_layer::{LayerFileName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;
-use super::Generation;

 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
@@ -317,7 +314,6 @@ pub struct RemoteTimelineClient {

    tenant_id: TenantId,
    timeline_id: TimelineId,
-    generation: Generation,

    upload_queue: Mutex<UploadQueue>,

@@ -338,19 +334,12 @@ impl RemoteTimelineClient {
        conf: &'static PageServerConf,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-        generation: Generation,
    ) -> RemoteTimelineClient {
        RemoteTimelineClient {
            conf,
-            runtime: if cfg!(test) {
-                // remote_timeline_client.rs tests rely on current-thread runtime
-                tokio::runtime::Handle::current()
-            } else {
-                BACKGROUND_RUNTIME.handle().clone()
-            },
+            runtime: BACKGROUND_RUNTIME.handle().to_owned(),
            tenant_id,
            timeline_id,
-            generation,
            storage_impl: remote_storage,
            upload_queue: Mutex::new(UploadQueue::Uninitialized),
            metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
@@ -459,10 +448,10 @@ impl RemoteTimelineClient {
        );

        let index_part = download::download_index_part(
+            self.conf,
            &self.storage_impl,
            &self.tenant_id,
            &self.timeline_id,
-            self.generation,
        )
        .measure_remote_op(
            self.tenant_id,
@@ -608,25 +597,25 @@ impl RemoteTimelineClient {
    ///
    /// Launch an upload operation in the background.
    ///
-    pub fn schedule_layer_file_upload(
+    pub(crate) fn schedule_layer_file_upload(
        self: &Arc<Self>,
-        layer_file_name: &LayerFileName,
-        layer_metadata: &LayerFileMetadata,
+        layer: ResidentLayer,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

+        let metadata = LayerFileMetadata::new(layer.layer_desc().file_size);
+
        upload_queue
            .latest_files
-            .insert(layer_file_name.clone(), layer_metadata.clone());
+            .insert(layer.layer_desc().filename(), metadata.clone());
        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

-        let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
+        info!("scheduled layer file upload {layer}");
+        let op = UploadOp::UploadLayer(layer, metadata);
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);

-        info!("scheduled layer file upload {layer_file_name}");
-
        // Launch the task immediately, if possible
        self.launch_queued_tasks(upload_queue);
        Ok(())
@@ -660,41 +649,22 @@ impl RemoteTimelineClient {
        // from latest_files, but not yet scheduled for deletion. Use a closure
        // to syntactically forbid ? or bail! calls here.
        let no_bail_here = || {
-            // Decorate our list of names with each name's generation, dropping
-            // makes that are unexpectedly missing from our metadata.
-            let with_generations: Vec<_> = names
-                .iter()
-                .filter_map(|name| {
-                    // Remove from latest_files, learning the file's remote generation in the process
-                    let meta = upload_queue.latest_files.remove(name);
-
-                    if let Some(meta) = meta {
-                        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-                        Some((name, meta.generation))
-                    } else {
-                        // This can only happen if we forgot to to schedule the file upload
-                        // before scheduling the delete. Log it because it is a rare/strange
-                        // situation, and in case something is misbehaving, we'd like to know which
-                        // layers experienced this.
-                        info!(
-                            "Deleting layer {name} not found in latest_files list, never uploaded?"
-                        );
-                        None
-                    }
-                })
-                .collect();
+            for name in names {
+                if upload_queue.latest_files.remove(name).is_some() {
+                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                }
+            }

            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
                self.schedule_index_upload(upload_queue, metadata);
            }

            // schedule the actual deletions
-            for (name, generation) in with_generations {
+            for name in names {
                let op = UploadOp::Delete(Delete {
                    file_kind: RemoteOpFileKind::Layer,
                    layer_file_name: name.clone(),
                    scheduled_from_timeline_delete: false,
-                    generation,
                });
                self.calls_unfinished_metric_begin(&op);
                upload_queue.queued_operations.push_back(op);
@@ -790,10 +760,10 @@ impl RemoteTimelineClient {
        backoff::retry(
            || {
                upload::upload_index_part(
+                    self.conf,
                    &self.storage_impl,
                    &self.tenant_id,
                    &self.timeline_id,
-                    self.generation,
                    &index_part_with_deleted_at,
                )
            },
@@ -851,14 +821,12 @@ impl RemoteTimelineClient {
                .reserve(stopped.upload_queue_for_deletion.latest_files.len());

            // schedule the actual deletions
-            for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
+            for name in stopped.upload_queue_for_deletion.latest_files.keys() {
                let op = UploadOp::Delete(Delete {
                    file_kind: RemoteOpFileKind::Layer,
                    layer_file_name: name.clone(),
                    scheduled_from_timeline_delete: true,
-                    generation: meta.generation,
                });
-
                self.calls_unfinished_metric_begin(&op);
                stopped
                    .upload_queue_for_deletion
@@ -881,7 +849,8 @@ impl RemoteTimelineClient {

        // Do not delete index part yet, it is needed for possible retry. If we remove it first
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
-        let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
+        let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
+        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;

        let remaining = backoff::retry(
            || async {
@@ -1084,18 +1053,13 @@ impl RemoteTimelineClient {
            }

            let upload_result: anyhow::Result<()> = match &task.op {
-                UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
-                    let path = self
-                        .conf
-                        .timeline_path(&self.tenant_id, &self.timeline_id)
-                        .join(layer_file_name.file_name());
-
+                UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
+                    let path = layer.local_path();
                    upload::upload_timeline_layer(
                        self.conf,
                        &self.storage_impl,
-                        &path,
+                        path,
                        layer_metadata,
-                        self.generation,
                    )
                    .measure_remote_op(
                        self.tenant_id,
@@ -1117,10 +1081,10 @@ impl RemoteTimelineClient {
                    };

                    let res = upload::upload_index_part(
+                        self.conf,
                        &self.storage_impl,
                        &self.tenant_id,
                        &self.timeline_id,
-                        self.generation,
                        index_part,
                    )
                    .measure_remote_op(
@@ -1145,7 +1109,7 @@ impl RemoteTimelineClient {
                        .conf
                        .timeline_path(&self.tenant_id, &self.timeline_id)
                        .join(delete.layer_file_name.file_name());
-                    delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
+                    delete::delete_layer(self.conf, &self.storage_impl, path)
                        .measure_remote_op(
                            self.tenant_id,
                            self.timeline_id,
@@ -1392,95 +1356,6 @@ impl RemoteTimelineClient {
    }
 }

-pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
-    let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}");
-    RemotePath::from_string(&path).expect("Failed to construct path")
-}
-
-pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
-    remote_timelines_path(tenant_id).join(&PathBuf::from(timeline_id.to_string()))
-}
-
-pub fn remote_layer_path(
-    tenant_id: &TenantId,
-    timeline_id: &TimelineId,
-    layer_file_name: &LayerFileName,
-    layer_meta: &LayerFileMetadata,
-) -> RemotePath {
-    // Generation-aware key format
-    let path = format!(
-        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
-        layer_file_name.file_name(),
-        layer_meta.generation.get_suffix()
-    );
-
-    RemotePath::from_string(&path).expect("Failed to construct path")
-}
-
-pub fn remote_index_path(
-    tenant_id: &TenantId,
-    timeline_id: &TimelineId,
-    generation: Generation,
-) -> RemotePath {
-    RemotePath::from_string(&format!(
-        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
-        IndexPart::FILE_NAME,
-        generation.get_suffix()
-    ))
-    .expect("Failed to construct path")
-}
-
-/// Given the key of an index, parse out the generation part of the name
-pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
-    let file_name = match path.get_path().file_name() {
-        Some(f) => f,
-        None => {
-            // Unexpected: we should be seeing index_part.json paths only
-            tracing::warn!("Malformed index key {}", path);
-            return None;
-        }
-    };
-
-    let file_name_str = match file_name.to_str() {
-        Some(s) => s,
-        None => {
-            tracing::warn!("Malformed index key {:?}", path);
-            return None;
-        }
-    };
-    match file_name_str.split_once('-') {
-        Some((_, gen_suffix)) => Generation::parse_suffix(gen_suffix),
-        None => None,
-    }
-}
-
-/// Files on the remote storage are stored with paths, relative to the workdir.
-/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path.
-///
-/// Errors if the path provided does not start from pageserver's workdir.
-pub fn remote_path(
-    conf: &PageServerConf,
-    local_path: &Path,
-    generation: Generation,
-) -> anyhow::Result<RemotePath> {
-    let stripped = local_path
-        .strip_prefix(&conf.workdir)
-        .context("Failed to strip workdir prefix")?;
-
-    let suffixed = format!(
-        "{0}{1}",
-        stripped.to_string_lossy(),
-        generation.get_suffix()
-    );
-
-    RemotePath::new(&PathBuf::from(suffixed)).with_context(|| {
-        format!(
-            "to resolve remote part of path {:?} for base {:?}",
-            local_path, conf.workdir
-        )
-    })
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -1488,12 +1363,16 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
-            Generation, Tenant, Timeline,
+            storage_layer::Layer,
+            Tenant, Timeline,
        },
        DEFAULT_PG_VERSION,
    };
-
-    use std::{collections::HashSet, path::Path};
+    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
+    use std::{
+        collections::HashSet,
+        path::{Path, PathBuf},
+    };
    use utils::lsn::Lsn;

    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1527,11 +1406,8 @@ mod tests {
        assert_eq!(avec, bvec);
    }

-    fn assert_remote_files(expected: &[&str], remote_path: &Path, generation: Generation) {
-        let mut expected: Vec<String> = expected
-            .iter()
-            .map(|x| format!("{}{}", x, generation.get_suffix()))
-            .collect();
+    fn assert_remote_files(expected: &[&str], remote_path: &Path) {
+        let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect();
        expected.sort();

        let mut found: Vec<String> = Vec::new();
@@ -1550,6 +1426,8 @@ mod tests {
        tenant: Arc<Tenant>,
        timeline: Arc<Timeline>,
        tenant_ctx: RequestContext,
+        remote_fs_dir: PathBuf,
+        client: Arc<RemoteTimelineClient>,
    }

    impl TestSetup {
@@ -1559,44 +1437,51 @@ mod tests {
            let harness = TenantHarness::create(test_name)?;
            let (tenant, ctx) = harness.load().await;

+            // create an empty timeline directory
            let timeline = tenant
                .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
                .await?;

+            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
+            std::fs::create_dir_all(remote_fs_dir)?;
+            let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
+
+            let storage_config = RemoteStorageConfig {
+                max_concurrent_syncs: std::num::NonZeroUsize::new(
+                    remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
+                )
+                .unwrap(),
+                max_sync_errors: std::num::NonZeroU32::new(
+                    remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
+                )
+                .unwrap(),
+                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+            };
+
+            let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
+
+            let client = Arc::new(RemoteTimelineClient {
+                conf: harness.conf,
+                runtime: tokio::runtime::Handle::current(),
+                tenant_id: harness.tenant_id,
+                timeline_id: TIMELINE_ID,
+                storage_impl: storage,
+                upload_queue: Mutex::new(UploadQueue::Uninitialized),
+                metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                    &harness.tenant_id,
+                    &TIMELINE_ID,
+                )),
+            });
+
            Ok(Self {
                harness,
                tenant,
                timeline,
                tenant_ctx: ctx,
+                remote_fs_dir,
+                client,
            })
        }
-
-        /// Construct a RemoteTimelineClient in an arbitrary generation
-        fn build_client(&self, generation: Generation) -> Arc<RemoteTimelineClient> {
-            Arc::new(RemoteTimelineClient {
-                conf: self.harness.conf,
-                runtime: tokio::runtime::Handle::current(),
-                tenant_id: self.harness.tenant_id,
-                timeline_id: TIMELINE_ID,
-                generation,
-                storage_impl: self.harness.remote_storage.clone(),
-                upload_queue: Mutex::new(UploadQueue::Uninitialized),
-                metrics: Arc::new(RemoteTimelineClientMetrics::new(
-                    &self.harness.tenant_id,
-                    &TIMELINE_ID,
-                )),
-            })
-        }
-
-        /// A tracing::Span that satisfies remote_timeline_client methods that assert tenant_id
-        /// and timeline_id are present.
-        fn span(&self) -> tracing::Span {
-            tracing::info_span!(
-                "test",
-                tenant_id = %self.harness.tenant_id,
-                timeline_id = %TIMELINE_ID
-            )
-        }
    }

    // Test scheduling
@@ -1616,72 +1501,52 @@ mod tests {
        // Schedule another deletion. Check that it's launched immediately.
        // Schedule index upload. Check that it's queued

-        let test_setup = TestSetup::new("upload_scheduling").await.unwrap();
-        let span = test_setup.span();
-        let _guard = span.enter();
-
        let TestSetup {
            harness,
            tenant: _tenant,
            timeline,
            tenant_ctx: _tenant_ctx,
-        } = test_setup;
-
-        let client = timeline.remote_client.as_ref().unwrap();
-
-        // Download back the index.json, and check that the list of files is correct
-        let initial_index_part = match client.download_index_file().await.unwrap() {
-            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
-            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
-        };
-        let initial_layers = initial_index_part
-            .layer_metadata
-            .keys()
-            .map(|f| f.to_owned())
-            .collect::<HashSet<LayerFileName>>();
-        let initial_layer = {
-            assert!(initial_layers.len() == 1);
-            initial_layers.into_iter().next().unwrap()
-        };
+            remote_fs_dir,
+            client,
+        } = TestSetup::new("upload_scheduling").await.unwrap();

        let timeline_path = harness.timeline_path(&TIMELINE_ID);

        println!("workdir: {}", harness.conf.workdir.display());

-        let remote_timeline_dir = harness
-            .remote_fs_dir
-            .join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
+        let remote_timeline_dir =
+            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
        println!("remote_timeline_dir: {}", remote_timeline_dir.display());

-        let generation = harness.generation;
+        let metadata = dummy_metadata(Lsn(0x10));
+        client
+            .init_upload_queue_for_empty_remote(&metadata)
+            .unwrap();

        // Create a couple of dummy files,  schedule upload for them
-        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
-        let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
-        let layer_file_name_3: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap();
-        let content_1 = dummy_contents("foo");
-        let content_2 = dummy_contents("bar");
-        let content_3 = dummy_contents("baz");

-        for (filename, content) in [
-            (&layer_file_name_1, &content_1),
-            (&layer_file_name_2, &content_2),
-            (&layer_file_name_3, &content_3),
-        ] {
-            std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
-        }
+        let layers = [
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")),
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")),
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
+        ]
+        .into_iter()
+        .map(|(name, contents): (LayerFileName, Vec<u8>)| {
+            std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
+
+            Layer::for_resident(
+                harness.conf,
+                &timeline,
+                name,
+                LayerFileMetadata::new(contents.len() as u64),
+            )
+        }).collect::<Vec<_>>();

        client
-            .schedule_layer_file_upload(
-                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64, generation),
-            )
+            .schedule_layer_file_upload(layers[0].clone())
            .unwrap();
        client
-            .schedule_layer_file_upload(
-                &layer_file_name_2,
-                &LayerFileMetadata::new(content_2.len() as u64, generation),
-            )
+            .schedule_layer_file_upload(layers[1].clone())
            .unwrap();

        // Check that they are started immediately, not queued
@@ -1734,22 +1599,18 @@ mod tests {
                .map(|f| f.to_owned())
                .collect(),
            &[
-                &initial_layer.file_name(),
-                &layer_file_name_1.file_name(),
-                &layer_file_name_2.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
            ],
        );
        assert_eq!(index_part.metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
        client
-            .schedule_layer_file_upload(
-                &layer_file_name_3,
-                &LayerFileMetadata::new(content_3.len() as u64, generation),
-            )
+            .schedule_layer_file_upload(layers[2].clone())
            .unwrap();
        client
-            .schedule_layer_file_deletion(&[layer_file_name_1.clone()])
+            .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
@@ -1764,13 +1625,11 @@ mod tests {
        }
        assert_remote_files(
            &[
-                &initial_layer.file_name(),
-                &layer_file_name_1.file_name(),
-                &layer_file_name_2.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
-            generation,
        );

        // Finish them
@@ -1778,13 +1637,11 @@ mod tests {

        assert_remote_files(
            &[
-                &initial_layer.file_name(),
-                &layer_file_name_2.file_name(),
-                &layer_file_name_3.file_name(),
+                &layers[1].layer_desc().filename().file_name(),
+                &layers[2].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
-            generation,
        );
    }

@@ -1796,9 +1653,15 @@ mod tests {
            harness,
            tenant: _tenant,
            timeline,
+            client,
            ..
        } = TestSetup::new("metrics").await.unwrap();
-        let client = timeline.remote_client.as_ref().unwrap();
+
+        let metadata = dummy_metadata(Lsn(0x10));
+        client
+            .init_upload_queue_for_empty_remote(&metadata)
+            .unwrap();
+
        let timeline_path = harness.timeline_path(&TIMELINE_ID);

        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
@@ -1809,20 +1672,18 @@ mod tests {
        )
        .unwrap();

-        #[derive(Debug, PartialEq, Clone, Copy)]
+        let layer_file_1 = Layer::for_resident(
+            harness.conf,
+            &timeline,
+            layer_file_name_1.clone(),
+            LayerFileMetadata::new(content_1.len() as u64),
+        );
+
+        #[derive(Debug, PartialEq)]
        struct BytesStartedFinished {
            started: Option<usize>,
            finished: Option<usize>,
        }
-        impl std::ops::Add for BytesStartedFinished {
-            type Output = Self;
-            fn add(self, rhs: Self) -> Self::Output {
-                Self {
-                    started: self.started.map(|v| v + rhs.started.unwrap_or(0)),
-                    finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)),
-                }
-            }
-        }
        let get_bytes_started_stopped = || {
            let started = client
                .metrics
@@ -1839,140 +1700,42 @@ mod tests {
        };

        // Test
-        tracing::info!("now doing actual test");

-        let actual_a = get_bytes_started_stopped();
+        let init = get_bytes_started_stopped();

        client
-            .schedule_layer_file_upload(
-                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64, harness.generation),
-            )
+            .schedule_layer_file_upload(layer_file_1.clone())
            .unwrap();

-        let actual_b = get_bytes_started_stopped();
+        let pre = get_bytes_started_stopped();

        client.wait_completion().await.unwrap();

-        let actual_c = get_bytes_started_stopped();
+        let post = get_bytes_started_stopped();

        // Validate

-        let expected_b = actual_a
-            + BytesStartedFinished {
+        assert_eq!(
+            init,
+            BytesStartedFinished {
+                started: None,
+                finished: None
+            }
+        );
+        assert_eq!(
+            pre,
+            BytesStartedFinished {
                started: Some(content_1.len()),
                // assert that the _finished metric is created eagerly so that subtractions work on first sample
                finished: Some(0),
-            };
-        assert_eq!(actual_b, expected_b);
-
-        let expected_c = actual_a
-            + BytesStartedFinished {
-                started: Some(content_1.len()),
-                finished: Some(content_1.len()),
-            };
-        assert_eq!(actual_c, expected_c);
-    }
-
-    async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
-        // An empty IndexPart, just sufficient to ensure deserialization will succeed
-        let example_metadata = TimelineMetadata::example();
-        let example_index_part = IndexPart::new(
-            HashMap::new(),
-            example_metadata.disk_consistent_lsn(),
-            example_metadata,
-        );
-
-        let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();
-
-        let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID);
-        let remote_timeline_dir = test_state.harness.remote_fs_dir.join(
-            timeline_path
-                .strip_prefix(&test_state.harness.conf.workdir)
-                .unwrap(),
-        );
-
-        std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
-
-        let index_path = test_state.harness.remote_fs_dir.join(
-            remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(),
-        );
-        eprintln!("Writing {}", index_path.display());
-        std::fs::write(&index_path, index_part_bytes).unwrap();
-        example_index_part
-    }
-
-    /// Assert that when a RemoteTimelineclient in generation `get_generation` fetches its
-    /// index, the IndexPart returned is equal to `expected`
-    async fn assert_got_index_part(
-        test_state: &TestSetup,
-        get_generation: Generation,
-        expected: &IndexPart,
-    ) {
-        let client = test_state.build_client(get_generation);
-
-        let download_r = client
-            .download_index_file()
-            .await
-            .expect("download should always succeed");
-        assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
-        match download_r {
-            MaybeDeletedIndexPart::IndexPart(index_part) => {
-                assert_eq!(&index_part, expected);
            }
-            MaybeDeletedIndexPart::Deleted(_index_part) => panic!("Test doesn't set deleted_at"),
-        }
-    }
-
-    #[tokio::test]
-    async fn index_part_download_simple() -> anyhow::Result<()> {
-        let test_state = TestSetup::new("index_part_download_simple").await.unwrap();
-        let span = test_state.span();
-        let _guard = span.enter();
-
-        // Simple case: we are in generation N, load the index from generation N - 1
-        let generation_n = 5;
-        let injected = inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
-
-        assert_got_index_part(&test_state, Generation::new(generation_n), &injected).await;
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn index_part_download_ordering() -> anyhow::Result<()> {
-        let test_state = TestSetup::new("index_part_download_ordering")
-            .await
-            .unwrap();
-
-        let span = test_state.span();
-        let _guard = span.enter();
-
-        // A generation-less IndexPart exists in the bucket, we should find it
-        let generation_n = 5;
-        let injected_none = inject_index_part(&test_state, Generation::none()).await;
-        assert_got_index_part(&test_state, Generation::new(generation_n), &injected_none).await;
-
-        // If a more recent-than-none generation exists, we should prefer to load that
-        let injected_1 = inject_index_part(&test_state, Generation::new(1)).await;
-        assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
-
-        // If a more-recent-than-me generation exists, we should ignore it.
-        let _injected_10 = inject_index_part(&test_state, Generation::new(10)).await;
-        assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
-
-        // If a directly previous generation exists, _and_ an index exists in my own
-        // generation, I should prefer my own generation.
-        let _injected_prev =
-            inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
-        let injected_current = inject_index_part(&test_state, Generation::new(generation_n)).await;
-        assert_got_index_part(
-            &test_state,
-            Generation::new(generation_n),
-            &injected_current,
-        )
-        .await;
-
-        Ok(())
+        );
+        assert_eq!(
+            post,
+            BytesStartedFinished {
+                started: Some(content_1.len()),
+                finished: Some(content_1.len())
+            }
+        );
    }
 }
--- a/pageserver/src/tenant/remote_timeline_client/delete.rs
+++ b/pageserver/src/tenant/remote_timeline_client/delete.rs
@@ -5,30 +5,25 @@ use tracing::debug;

 use remote_storage::GenericRemoteStorage;

-use crate::{
-    config::PageServerConf,
-    tenant::{remote_timeline_client::remote_path, Generation},
-};
+use crate::config::PageServerConf;

 pub(super) async fn delete_layer<'a>(
    conf: &'static PageServerConf,
    storage: &'a GenericRemoteStorage,
    local_layer_path: &'a Path,
-    generation: Generation,
 ) -> anyhow::Result<()> {
    fail::fail_point!("before-delete-layer", |_| {
        anyhow::bail!("failpoint before-delete-layer")
    });
    debug!("Deleting layer from remote storage: {local_layer_path:?}",);

-    let path_to_delete = remote_path(conf, local_layer_path, generation)?;
+    let path_to_delete = conf.remote_path(local_layer_path)?;

    // We don't want to print an error if the delete failed if the file has
    // already been deleted. Thankfully, in this situation S3 already
    // does not yield an error. While OS-provided local file system APIs do yield
    // errors, we avoid them in the `LocalFs` wrapper.
-    storage
-        .delete(&path_to_delete)
-        .await
-        .with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
+    storage.delete(&path_to_delete).await.with_context(|| {
+        format!("Failed to delete remote layer from storage at {path_to_delete:?}")
+    })
 }
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -15,19 +15,14 @@ use tokio_util::sync::CancellationToken;
 use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
-use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::Generation;
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};

 use super::index::{IndexPart, LayerFileMetadata};
-use super::{
-    parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
-    FAILED_REMOTE_OP_RETRIES,
-};
+use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};

 static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);

@@ -46,11 +41,13 @@ pub async fn download_layer_file<'a>(
 ) -> Result<u64, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

-    let local_path = conf
-        .timeline_path(&tenant_id, &timeline_id)
-        .join(layer_file_name.file_name());
+    let timeline_path = conf.timeline_path(&tenant_id, &timeline_id);

-    let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);
+    let local_path = timeline_path.join(layer_file_name.file_name());
+
+    let remote_path = conf
+        .remote_path(&local_path)
+        .map_err(DownloadError::Other)?;

    // Perform a rename inspired by durable_rename from file_utils.c.
    // The sequence:
@@ -67,43 +64,33 @@ pub async fn download_layer_file<'a>(
    let (mut destination_file, bytes_amount) = download_retry(
        || async {
            // TODO: this doesn't use the cached fd for some reason?
-            let mut destination_file = fs::File::create(&temp_file_path)
-                .await
-                .with_context(|| {
-                    format!(
-                        "create a destination file for layer '{}'",
-                        temp_file_path.display()
-                    )
-                })
-                .map_err(DownloadError::Other)?;
-            let mut download = storage
-                .download(&remote_path)
-                .await
-                .with_context(|| {
-                    format!(
-                    "open a download stream for layer with remote storage path '{remote_path:?}'"
-                )
-                })
-                .map_err(DownloadError::Other)?;
-
-            let bytes_amount = tokio::time::timeout(
-                MAX_DOWNLOAD_DURATION,
-                tokio::io::copy(&mut download.download_stream, &mut destination_file),
-            )
-            .await
-            .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out  {:?}", e)))?
-            .with_context(|| {
+            let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
                format!(
-                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
+                    "create a destination file for layer '{}'",
+                    temp_file_path.display()
+                )
+            })
+            .map_err(DownloadError::Other)?;
+            let mut download = storage.download(&remote_path).await.with_context(|| {
+                format!(
+                    "open a download stream for layer with remote storage path '{remote_path:?}'"
                )
            })
            .map_err(DownloadError::Other)?;

+            let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file))
+                .await
+                .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out  {:?}", e)))?
+                .with_context(|| {
+                    format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
+                })
+                .map_err(DownloadError::Other)?;
+
            Ok((destination_file, bytes_amount))
+
        },
        &format!("download {remote_path:?}"),
-    )
-    .await?;
+    ).await?;

    // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
    // A file will not be closed immediately when it goes out of scope if there are any IO operations
@@ -116,7 +103,12 @@ pub async fn download_layer_file<'a>(
    destination_file
        .flush()
        .await
-        .with_context(|| format!("flush source file at {}", temp_file_path.display()))
+        .with_context(|| {
+            format!(
+                "failed to flush source file at {}",
+                temp_file_path.display()
+            )
+        })
        .map_err(DownloadError::Other)?;

    let expected = layer_metadata.file_size();
@@ -147,12 +139,17 @@ pub async fn download_layer_file<'a>(

    fs::rename(&temp_file_path, &local_path)
        .await
-        .with_context(|| format!("rename download layer file to {}", local_path.display(),))
+        .with_context(|| {
+            format!(
+                "Could not rename download layer file to {}",
+                local_path.display(),
+            )
+        })
        .map_err(DownloadError::Other)?;

    crashsafe::fsync_async(&local_path)
        .await
-        .with_context(|| format!("fsync layer file {}", local_path.display(),))
+        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
        .map_err(DownloadError::Other)?;

    tracing::debug!("download complete: {}", local_path.display());
@@ -176,19 +173,21 @@ pub fn is_temp_download_file(path: &Path) -> bool {
 }

 /// List timelines of given tenant in remote storage
-pub async fn list_remote_timelines(
-    storage: &GenericRemoteStorage,
+pub async fn list_remote_timelines<'a>(
+    storage: &'a GenericRemoteStorage,
+    conf: &'static PageServerConf,
    tenant_id: TenantId,
 ) -> anyhow::Result<HashSet<TimelineId>> {
-    let remote_path = remote_timelines_path(&tenant_id);
+    let tenant_path = conf.timelines_path(&tenant_id);
+    let tenant_storage_path = conf.remote_path(&tenant_path)?;

    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
        anyhow::bail!("storage-sync-list-remote-timelines");
    });

    let timelines = download_retry(
-        || storage.list_prefixes(Some(&remote_path)),
-        &format!("list prefixes for {tenant_id}"),
+        || storage.list_prefixes(Some(&tenant_storage_path)),
+        &format!("list prefixes for {tenant_path:?}"),
    )
    .await?;

@@ -203,9 +202,9 @@ pub async fn list_remote_timelines(
            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
        })?;

-        let timeline_id: TimelineId = object_name
-            .parse()
-            .with_context(|| format!("parse object name into timeline id '{object_name}'"))?;
+        let timeline_id: TimelineId = object_name.parse().with_context(|| {
+            format!("failed to parse object name into timeline id '{object_name}'")
+        })?;

        // list_prefixes is assumed to return unique names. Ensure this here.
        // NB: it's safer to bail out than warn-log this because the pageserver
@@ -222,17 +221,22 @@ pub async fn list_remote_timelines(
    Ok(timeline_ids)
 }

-async fn do_download_index_part(
+pub(super) async fn download_index_part(
+    conf: &'static PageServerConf,
    storage: &GenericRemoteStorage,
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
-    index_generation: Generation,
 ) -> Result<IndexPart, DownloadError> {
-    let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
+    let index_part_path = conf
+        .metadata_path(tenant_id, timeline_id)
+        .with_file_name(IndexPart::FILE_NAME);
+    let part_storage_path = conf
+        .remote_path(&index_part_path)
+        .map_err(DownloadError::BadInput)?;

    let index_part_bytes = download_retry(
        || async {
-            let mut index_part_download = storage.download(&remote_path).await?;
+            let mut index_part_download = storage.download(&part_storage_path).await?;

            let mut index_part_bytes = Vec::new();
            tokio::io::copy(
@@ -240,120 +244,25 @@ async fn do_download_index_part(
                &mut index_part_bytes,
            )
            .await
-            .with_context(|| format!("download index part at {remote_path:?}"))
+            .with_context(|| {
+                format!("Failed to download an index part into file {index_part_path:?}")
+            })
            .map_err(DownloadError::Other)?;
            Ok(index_part_bytes)
        },
-        &format!("download {remote_path:?}"),
+        &format!("download {part_storage_path:?}"),
    )
    .await?;

    let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
-        .with_context(|| format!("download index part file at {remote_path:?}"))
+        .with_context(|| {
+            format!("Failed to deserialize index part file into file {index_part_path:?}")
+        })
        .map_err(DownloadError::Other)?;

    Ok(index_part)
 }

-/// index_part.json objects are suffixed with a generation number, so we cannot
-/// directly GET the latest index part without doing some probing.
-///
-/// In this function we probe for the most recent index in a generation <= our current generation.
-/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
-#[tracing::instrument(skip_all, fields(generation=?my_generation))]
-pub(super) async fn download_index_part(
-    storage: &GenericRemoteStorage,
-    tenant_id: &TenantId,
-    timeline_id: &TimelineId,
-    my_generation: Generation,
-) -> Result<IndexPart, DownloadError> {
-    debug_assert_current_span_has_tenant_and_timeline_id();
-
-    if my_generation.is_none() {
-        // Operating without generations: just fetch the generation-less path
-        return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
-    }
-
-    // Stale case: If we were intentionally attached in a stale generation, there may already be a remote
-    // index in our generation.
-    //
-    // This is an optimization to avoid doing the listing for the general case below.
-    let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
-    match res {
-        Ok(index_part) => {
-            tracing::debug!(
-                "Found index_part from current generation (this is a stale attachment)"
-            );
-            return Ok(index_part);
-        }
-        Err(DownloadError::NotFound) => {}
-        Err(e) => return Err(e),
-    };
-
-    // Typical case: the previous generation of this tenant was running healthily, and had uploaded
-    // and index part.  We may safely start from this index without doing a listing, because:
-    //  - We checked for current generation case above
-    //  - generations > my_generation are to be ignored
-    //  - any other indices that exist would have an older generation than `previous_gen`, and
-    //    we want to find the most recent index from a previous generation.
-    //
-    // This is an optimization to avoid doing the listing for the general case below.
-    let res =
-        do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
-    match res {
-        Ok(index_part) => {
-            tracing::debug!("Found index_part from previous generation");
-            return Ok(index_part);
-        }
-        Err(DownloadError::NotFound) => {
-            tracing::debug!(
-                "No index_part found from previous generation, falling back to listing"
-            );
-        }
-        Err(e) => {
-            return Err(e);
-        }
-    }
-
-    // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
-    // objects, and select the highest one with a generation <= my_generation.
-    let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none());
-    let indices = backoff::retry(
-        || async { storage.list_files(Some(&index_prefix)).await },
-        |_| false,
-        FAILED_DOWNLOAD_WARN_THRESHOLD,
-        FAILED_REMOTE_OP_RETRIES,
-        "listing index_part files",
-        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-        backoff::Cancel::new(CancellationToken::new(), || -> anyhow::Error {
-            unreachable!()
-        }),
-    )
-    .await
-    .map_err(DownloadError::Other)?;
-
-    // General case logic for which index to use: the latest index whose generation
-    // is <= our own.  See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
-    let max_previous_generation = indices
-        .into_iter()
-        .filter_map(parse_remote_index_path)
-        .filter(|g| g <= &my_generation)
-        .max();
-
-    match max_previous_generation {
-        Some(g) => {
-            tracing::debug!("Found index_part in generation {g:?}");
-            do_download_index_part(storage, tenant_id, timeline_id, g).await
-        }
-        None => {
-            // Migration from legacy pre-generation state: we have a generation but no prior
-            // attached pageservers did.  Try to load from a no-generation path.
-            tracing::info!("No index_part.json* found");
-            do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
-        }
-    }
-}
-
 /// Helper function to handle retries for a download operation.
 ///
 /// Remote operations can fail due to rate limits (IAM, S3), spurious network
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -2,7 +2,7 @@
 //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
 //! remote timeline layers and its metadata.

-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};

 use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
@@ -12,7 +12,6 @@ use utils::bin_ser::SerializeError;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::upload_queue::UploadQueueInitialized;
-use crate::tenant::Generation;

 use utils::lsn::Lsn;

@@ -21,28 +20,22 @@ use utils::lsn::Lsn;
 /// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
 /// might have less or more metadata depending if upgrading or rolling back an upgrade.
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
-//#[cfg_attr(test, derive(Default))]
+#[cfg_attr(test, derive(Default))]
 pub struct LayerFileMetadata {
    file_size: u64,
-
-    pub(crate) generation: Generation,
 }

 impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
    fn from(other: &IndexLayerMetadata) -> Self {
        LayerFileMetadata {
            file_size: other.file_size,
-            generation: other.generation,
        }
    }
 }

 impl LayerFileMetadata {
-    pub fn new(file_size: u64, generation: Generation) -> Self {
-        LayerFileMetadata {
-            file_size,
-            generation,
-        }
+    pub fn new(file_size: u64) -> Self {
+        LayerFileMetadata { file_size }
    }

    pub fn file_size(&self) -> u64 {
@@ -69,6 +62,10 @@ pub struct IndexPart {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub deleted_at: Option<NaiveDateTime>,

+    /// Legacy field: equal to the keys of `layer_metadata`, only written out for forward compat
+    #[serde(default, skip_deserializing)]
+    timeline_layers: HashSet<LayerFileName>,
+
    /// Per layer file name metadata, which can be present for a present or missing layer file.
    ///
    /// Older versions of `IndexPart` will not have this property or have only a part of metadata
@@ -94,12 +91,7 @@ impl IndexPart {
    /// - 2: added `deleted_at`
    /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
    ///      is always generated from the keys of `layer_metadata`)
-    /// - 4: timeline_layers is fully removed.
-    const LATEST_VERSION: usize = 4;
-
-    // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &[usize] = &[1, 2, 3, 4];
-
+    const LATEST_VERSION: usize = 3;
    pub const FILE_NAME: &'static str = "index_part.json";

    pub fn new(
@@ -107,30 +99,24 @@ impl IndexPart {
        disk_consistent_lsn: Lsn,
        metadata: TimelineMetadata,
    ) -> Self {
-        // Transform LayerFileMetadata into IndexLayerMetadata
-        let layer_metadata = layers_and_metadata
-            .into_iter()
-            .map(|(k, v)| (k, IndexLayerMetadata::from(v)))
-            .collect();
+        let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
+        let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
+
+        for (remote_name, metadata) in &layers_and_metadata {
+            timeline_layers.insert(remote_name.to_owned());
+            let metadata = IndexLayerMetadata::from(metadata);
+            layer_metadata.insert(remote_name.to_owned(), metadata);
+        }

        Self {
            version: Self::LATEST_VERSION,
+            timeline_layers,
            layer_metadata,
            disk_consistent_lsn,
            metadata,
            deleted_at: None,
        }
    }
-
-    pub fn get_version(&self) -> usize {
-        self.version
-    }
-
-    /// If you want this under normal operations, read it from self.metadata:
-    /// this method is just for the scrubber to use when validating an index.
-    pub fn get_disk_consistent_lsn(&self) -> Lsn {
-        self.disk_consistent_lsn
-    }
 }

 impl TryFrom<&UploadQueueInitialized> for IndexPart {
@@ -149,20 +135,15 @@ impl TryFrom<&UploadQueueInitialized> for IndexPart {
 }

 /// Serialized form of [`LayerFileMetadata`].
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
 pub struct IndexLayerMetadata {
-    pub file_size: u64,
-
-    #[serde(default = "Generation::none")]
-    #[serde(skip_serializing_if = "Generation::is_none")]
-    pub(super) generation: Generation,
+    pub(super) file_size: u64,
 }

-impl From<LayerFileMetadata> for IndexLayerMetadata {
-    fn from(other: LayerFileMetadata) -> Self {
+impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
+    fn from(other: &'_ LayerFileMetadata) -> Self {
        IndexLayerMetadata {
            file_size: other.file_size,
-            generation: other.generation,
        }
    }
 }
@@ -187,16 +168,15 @@ mod tests {
        let expected = IndexPart {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
+            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
-                    generation: Generation::none()
                }),
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
-                    generation: Generation::none()
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
@@ -225,16 +205,15 @@ mod tests {
        let expected = IndexPart {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
+            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
-                    generation: Generation::none()
                }),
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
-                    generation: Generation::none()
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
@@ -264,16 +243,15 @@ mod tests {
        let expected = IndexPart {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 2,
+            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
-                    generation: Generation::none()
                }),
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
-                    generation: Generation::none()
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
@@ -298,6 +276,7 @@ mod tests {

        let expected = IndexPart {
            version: 1,
+            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::new(),
            disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[
@@ -330,41 +309,4 @@ mod tests {

        assert_eq!(empty_layers_parsed, expected);
    }
-
-    #[test]
-    fn v4_indexpart_is_parsed() {
-        let example = r#"{
-            "version":4,
-            "layer_metadata":{
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
-            },
-            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
-            "deleted_at": "2023-07-31T09:00:00.123"
-        }"#;
-
-        let expected = IndexPart {
-            version: 4,
-            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
-                    file_size: 25600000,
-                    generation: Generation::none()
-                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
-                    // serde_json should always parse this but this might be a double with jq for
-                    // example.
-                    file_size: 9007199254741001,
-                    generation: Generation::none()
-                })
-            ]),
-            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
-            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
-        };
-
-        let part = serde_json::from_str::<IndexPart>(example).unwrap();
-        assert_eq!(part, expected);
-    }
 }
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -5,11 +5,7 @@ use fail::fail_point;
 use std::{io::ErrorKind, path::Path};
 use tokio::fs;

-use super::Generation;
-use crate::{
-    config::PageServerConf,
-    tenant::remote_timeline_client::{index::IndexPart, remote_index_path, remote_path},
-};
+use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart};
 use remote_storage::GenericRemoteStorage;
 use utils::id::{TenantId, TimelineId};

@@ -19,10 +15,10 @@ use tracing::info;

 /// Serializes and uploads the given index part data to the remote storage.
 pub(super) async fn upload_index_part<'a>(
+    conf: &'static PageServerConf,
    storage: &'a GenericRemoteStorage,
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
-    generation: Generation,
    index_part: &'a IndexPart,
 ) -> anyhow::Result<()> {
    tracing::trace!("uploading new index part");
@@ -31,16 +27,20 @@ pub(super) async fn upload_index_part<'a>(
        bail!("failpoint before-upload-index")
    });

-    let index_part_bytes =
-        serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
+    let index_part_bytes = serde_json::to_vec(&index_part)
+        .context("Failed to serialize index part file into bytes")?;
    let index_part_size = index_part_bytes.len();
    let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));

-    let remote_path = remote_index_path(tenant_id, timeline_id, generation);
+    let index_part_path = conf
+        .metadata_path(tenant_id, timeline_id)
+        .with_file_name(IndexPart::FILE_NAME);
+    let storage_path = conf.remote_path(&index_part_path)?;
+
    storage
-        .upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
+        .upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path)
        .await
-        .with_context(|| format!("upload index part for '{tenant_id} / {timeline_id}'"))
+        .with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'"))
 }

 /// Attempts to upload given layer files.
@@ -52,13 +52,12 @@ pub(super) async fn upload_timeline_layer<'a>(
    storage: &'a GenericRemoteStorage,
    source_path: &'a Path,
    known_metadata: &'a LayerFileMetadata,
-    generation: Generation,
 ) -> anyhow::Result<()> {
    fail_point!("before-upload-layer", |_| {
        bail!("failpoint before-upload-layer")
    });
+    let storage_path = conf.remote_path(source_path)?;

-    let storage_path = remote_path(conf, source_path, generation)?;
    let source_file_res = fs::File::open(&source_path).await;
    let source_file = match source_file_res {
        Ok(source_file) => source_file,
@@ -68,18 +67,21 @@ pub(super) async fn upload_timeline_layer<'a>(
            // upload. However, a nonexistent file can also be indicative of
            // something worse, like when a file is scheduled for upload before
            // it has been written to disk yet.
+            //
+            // This case is exercised by the `test_compaction_delete_before_upload` test.
            info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
            return Ok(());
        }
-        Err(e) => {
-            Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))?
-        }
+        Err(e) => Err(e)
+            .with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?,
    };

    let fs_size = source_file
        .metadata()
        .await
-        .with_context(|| format!("get the source file metadata for layer {source_path:?}"))?
+        .with_context(|| {
+            format!("Failed to get the source file metadata for layer {source_path:?}")
+        })?
        .len();

    let metadata_size = known_metadata.file_size();
@@ -87,13 +89,19 @@ pub(super) async fn upload_timeline_layer<'a>(
        bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
    }

-    let fs_size = usize::try_from(fs_size)
-        .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
+    let fs_size = usize::try_from(fs_size).with_context(|| {
+        format!("File {source_path:?} size {fs_size} could not be converted to usize")
+    })?;

    storage
        .upload(source_file, fs_size, &storage_path, None)
        .await
-        .with_context(|| format!("upload layer from local path '{}'", source_path.display()))?;
+        .with_context(|| {
+            format!(
+                "Failed to upload a layer from local path '{}'",
+                source_path.display()
+            )
+        })?;

    Ok(())
 }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,26 +4,21 @@ pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
+mod layer;
 mod layer_desc;
-mod remote_layer;

-use crate::config::PageServerConf;
 use crate::context::{AccessStatsBehavior, RequestContext};
-use crate::repository::Key;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
-use anyhow::Result;
 use bytes::Bytes;
 use enum_map::EnumMap;
 use enumset::EnumSet;
 use once_cell::sync::Lazy;
-use pageserver_api::models::LayerAccessKind;
 use pageserver_api::models::{
-    HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
+    LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
 };
 use std::ops::Range;
-use std::path::PathBuf;
-use std::sync::{Arc, Mutex};
+use std::sync::Mutex;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -39,7 +34,8 @@ pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
-pub use remote_layer::RemoteLayer;
+
+pub(crate) use layer::{EvictionError, Layer, ResidentLayer};

 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
@@ -74,7 +70,7 @@ pub struct ValueReconstructState {
    pub img: Option<(Lsn, Bytes)>,
 }

-/// Return value from Layer::get_page_reconstruct_data
+/// Return value from [`Layer::get_value_reconstruct_data`]
 #[derive(Clone, Copy, Debug)]
 pub enum ValueReconstructResult {
    /// Got all the data needed to reconstruct the requested page
@@ -179,26 +175,6 @@ impl LayerAccessStats {
        new
    }

-    /// Creates a clone of `self` and records `new_status` in the clone.
-    ///
-    /// The `new_status` is not recorded in `self`.
-    ///
-    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
-    pub(crate) fn clone_for_residence_change(
-        &self,
-        new_status: LayerResidenceStatus,
-    ) -> LayerAccessStats {
-        let clone = {
-            let inner = self.0.lock().unwrap();
-            inner.clone()
-        };
-        let new = LayerAccessStats(Mutex::new(clone));
-        new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
-        new
-    }
-
    /// Record a change in layer residency.
    ///
    /// Recording the event must happen while holding the layer map lock to
@@ -321,95 +297,12 @@ impl LayerAccessStats {
    }
 }

-/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
-/// required by [`LayerMap`](super::layer_map::LayerMap).
-///
-/// All layers should implement a minimal `std::fmt::Debug` without tenant or
-/// timeline names, because those are known in the context of which the layers
-/// are used in (timeline).
-#[async_trait::async_trait]
-pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
-    ///
-    /// Return data needed to reconstruct given page at LSN.
-    ///
-    /// It is up to the caller to collect more data from previous layer and
-    /// perform WAL redo, if necessary.
-    ///
-    /// See PageReconstructResult for possible return values. The collected data
-    /// is appended to reconstruct_data; the caller should pass an empty struct
-    /// on first call, or a struct with a cached older image of the page if one
-    /// is available. If this returns ValueReconstructResult::Continue, look up
-    /// the predecessor layer and call again with the same 'reconstruct_data' to
-    /// collect more data.
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_data: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult>;
-}
-
 /// Get a layer descriptor from a layer.
 pub trait AsLayerDesc {
    /// Get the layer descriptor.
    fn layer_desc(&self) -> &PersistentLayerDesc;
 }

-/// A Layer contains all data in a "rectangle" consisting of a range of keys and
-/// range of LSNs.
-///
-/// There are two kinds of layers, in-memory and on-disk layers. In-memory
-/// layers are used to ingest incoming WAL, and provide fast access to the
-/// recent page versions. On-disk layers are stored as files on disk, and are
-/// immutable. This trait presents the common functionality of in-memory and
-/// on-disk layers.
-///
-/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
-/// A delta layer contains all modifications within a range of LSNs and keys.
-/// An image layer is a snapshot of all the data in a key-range, at a single
-/// LSN.
-pub trait PersistentLayer: Layer + AsLayerDesc {
-    /// File name used for this layer, both in the pageserver's local filesystem
-    /// state as well as in the remote storage.
-    fn filename(&self) -> LayerFileName {
-        self.layer_desc().filename()
-    }
-
-    // Path to the layer file in the local filesystem.
-    // `None` for `RemoteLayer`.
-    fn local_path(&self) -> Option<PathBuf>;
-
-    /// Permanently remove this layer from disk.
-    fn delete_resident_layer_file(&self) -> Result<()>;
-
-    fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
-        None
-    }
-
-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        None
-    }
-
-    fn is_remote_layer(&self) -> bool {
-        false
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
-
-    fn access_stats(&self) -> &LayerAccessStats;
-}
-
-pub fn downcast_remote_layer(
-    layer: &Arc<dyn PersistentLayer>,
-) -> Option<std::sync::Arc<RemoteLayer>> {
-    if layer.is_remote_layer() {
-        Arc::clone(layer).downcast_remote_layer()
-    } else {
-        None
-    }
-}
-
 pub mod tests {
    use super::*;

@@ -447,19 +340,6 @@ pub mod tests {
    }
 }

-/// Helper enum to hold a PageServerConf, or a path
-///
-/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
-/// global config, and paths to layer files are constructed using the tenant/timeline
-/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
-/// struct for a file on disk, without having a page server running, so that we have no
-/// config. In that case, we use the Path variant to hold the full path to the file on
-/// disk.
-enum PathOrConf {
-    Path(PathBuf),
-    Conf(&'static PageServerConf),
-}
-
 /// Range wrapping newtype, which uses display to render Debug.
 ///
 /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -31,21 +31,21 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value, KEY_SIZE};
-use crate::tenant::blob_io::BlobWriter;
+use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
-use crate::tenant::storage_layer::{
-    PersistentLayer, ValueReconstructResult, ValueReconstructState,
-};
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
+use crate::tenant::Timeline;
 use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
-use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
+use pageserver_api::models::LayerAccessKind;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::{self, File};
-use std::io::SeekFrom;
+use std::fs::File;
+use std::io::{BufWriter, Write};
+use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
@@ -59,10 +59,7 @@ use utils::{
    lsn::Lsn,
 };

-use super::{
-    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
-    PersistentLayerDesc,
-};
+use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};

 ///
 /// Header stored in the beginning of the file
@@ -182,20 +179,12 @@ impl DeltaKey {
    }
 }

-/// DeltaLayer is the in-memory data structure associated with an on-disk delta
-/// file.
-///
-/// We keep a DeltaLayer in memory for each file, in the LayerMap. If a layer
-/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
-/// Otherwise the struct is just a placeholder for a file that exists on disk,
-/// and it needs to be loaded before using it in queries.
+/// This is used only from `pagectl`. Within pageserver, all layers are
+/// [`crate::tenant::storage_layer::Layer`], which can hold a [`DeltaLayerInner`].
 pub struct DeltaLayer {
-    path_or_conf: PathOrConf,
-
+    path: PathBuf,
    pub desc: PersistentLayerDesc,
-
    access_stats: LayerAccessStats,
-
    inner: OnceCell<Arc<DeltaLayerInner>>,
 }

@@ -212,19 +201,15 @@ impl std::fmt::Debug for DeltaLayer {
    }
 }

+/// `DeltaLayerInner` is the in-memory data structure associated with an on-disk delta
+/// file.
 pub struct DeltaLayerInner {
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,

    /// Reader object for reading blocks from the file.
-    file: FileBlockReader,
-}
-
-impl AsRef<DeltaLayerInner> for DeltaLayerInner {
-    fn as_ref(&self) -> &DeltaLayerInner {
-        self
-    }
+    file: FileBlockReader<VirtualFile>,
 }

 impl std::fmt::Debug for DeltaLayerInner {
@@ -236,19 +221,6 @@ impl std::fmt::Debug for DeltaLayerInner {
    }
 }

-#[async_trait::async_trait]
-impl Layer for DeltaLayer {
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
-            .await
-    }
-}
 /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
 impl std::fmt::Display for DeltaLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -262,40 +234,9 @@ impl AsLayerDesc for DeltaLayer {
    }
 }

-impl PersistentLayer for DeltaLayer {
-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        Some(self)
-    }
-
-    fn local_path(&self) -> Option<PathBuf> {
-        self.local_path()
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        self.delete_resident_layer_file()
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        self.info(reset)
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        self.access_stats()
-    }
-}
-
 impl DeltaLayer {
    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
-            self.desc.tenant_id,
-            self.desc.timeline_id,
-            self.desc.key_range.start,
-            self.desc.key_range.end,
-            self.desc.lsn_range.start,
-            self.desc.lsn_range.end,
-            self.desc.file_size,
-        );
+        self.desc.dump();

        if !verbose {
            return Ok(());
@@ -303,119 +244,7 @@ impl DeltaLayer {

        let inner = self.load(LayerAccessKind::Dump, ctx).await?;

-        println!(
-            "index_start_blk: {}, root {}",
-            inner.index_start_blk, inner.index_root_blk
-        );
-
-        let file = &inner.file;
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            inner.index_start_blk,
-            inner.index_root_blk,
-            file,
-        );
-
-        tree_reader.dump().await?;
-
-        let keys = DeltaLayerInner::load_keys(&inner).await?;
-
-        // A subroutine to dump a single blob
-        async fn dump_blob(val: ValueRef<'_>) -> Result<String> {
-            let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
-            let val = Value::des(&buf)?;
-            let desc = match val {
-                Value::Image(img) => {
-                    format!(" img {} bytes", img.len())
-                }
-                Value::WalRecord(rec) => {
-                    let wal_desc = walrecord::describe_wal_record(&rec)?;
-                    format!(
-                        " rec {} bytes will_init: {} {}",
-                        buf.len(),
-                        rec.will_init(),
-                        wal_desc
-                    )
-                }
-            };
-            Ok(desc)
-        }
-
-        for entry in keys {
-            let DeltaEntry { key, lsn, val, .. } = entry;
-            let desc = match dump_blob(val).await {
-                Ok(desc) => desc,
-                Err(err) => {
-                    let err: anyhow::Error = err;
-                    format!("ERROR: {err}")
-                }
-            };
-            println!("  key {key} at {lsn}: {desc}");
-        }
-
-        Ok(())
-    }
-
-    pub(crate) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        ensure!(lsn_range.start >= self.desc.lsn_range.start);
-
-        ensure!(self.desc.key_range.contains(&key));
-
-        let inner = self
-            .load(LayerAccessKind::GetValueReconstructData, ctx)
-            .await?;
-        inner
-            .get_value_reconstruct_data(key, lsn_range, reconstruct_state)
-            .await
-    }
-
-    pub(crate) fn local_path(&self) -> Option<PathBuf> {
-        Some(self.path())
-    }
-
-    pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
-        // delete underlying file
-        fs::remove_file(self.path())?;
-        Ok(())
-    }
-
-    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.layer_desc().filename().file_name();
-        let lsn_range = self.layer_desc().lsn_range.clone();
-
-        let access_stats = self.access_stats.as_api_model(reset);
-
-        HistoricLayerInfo::Delta {
-            layer_file_name,
-            layer_file_size: self.desc.file_size,
-            lsn_start: lsn_range.start,
-            lsn_end: lsn_range.end,
-            remote: false,
-            access_stats,
-        }
-    }
-
-    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-
-    fn path_for(
-        path_or_conf: &PathOrConf,
-        tenant_id: &TenantId,
-        timeline_id: &TimelineId,
-        fname: &DeltaFileName,
-    ) -> PathBuf {
-        match path_or_conf {
-            PathOrConf::Path(path) => path.clone(),
-            PathOrConf::Conf(conf) => conf
-                .timeline_path(tenant_id, timeline_id)
-                .join(fname.to_string()),
-        }
+        inner.dump().await
    }

    fn temp_path_for(
@@ -461,52 +290,22 @@ impl DeltaLayer {
    async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let summary = match &self.path_or_conf {
-            PathOrConf::Conf(_) => Some(Summary::from(self)),
-            PathOrConf::Path(_) => None,
-        };
+        let loaded = DeltaLayerInner::load(&path, None)?;

-        let loaded = DeltaLayerInner::load(&path, summary).await?;
+        // not production code: `DeltaLayer` is only used from `pagectl`, so printing warnings to stdout is fine

-        if let PathOrConf::Path(ref path) = self.path_or_conf {
-            // not production code
+        let actual_filename = self.path.file_name().unwrap().to_str().unwrap().to_owned();
+        let expected_filename = self.layer_desc().filename().file_name();

-            let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
-            let expected_filename = self.filename().file_name();
-
-            if actual_filename != expected_filename {
-                println!("warning: filename does not match what is expected from in-file summary");
-                println!("actual: {:?}", actual_filename);
-                println!("expected: {:?}", expected_filename);
-            }
+        if actual_filename != expected_filename {
+            println!("warning: filename does not match what is expected from in-file summary");
+            println!("actual: {:?}", actual_filename);
+            println!("expected: {:?}", expected_filename);
        }

        Ok(Arc::new(loaded))
    }

-    /// Create a DeltaLayer struct representing an existing file on disk.
-    pub fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        filename: &DeltaFileName,
-        file_size: u64,
-        access_stats: LayerAccessStats,
-    ) -> DeltaLayer {
-        DeltaLayer {
-            path_or_conf: PathOrConf::Conf(conf),
-            desc: PersistentLayerDesc::new_delta(
-                tenant_id,
-                timeline_id,
-                filename.key_range.clone(),
-                filename.lsn_range.clone(),
-                file_size,
-            ),
-            access_stats,
-            inner: OnceCell::new(),
-        }
-    }
-
    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -521,7 +320,7 @@ impl DeltaLayer {
            .context("get file metadata to determine size")?;

        Ok(DeltaLayer {
-            path_or_conf: PathOrConf::Path(path.to_path_buf()),
+            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_delta(
                summary.tenant_id,
                summary.timeline_id,
@@ -534,29 +333,9 @@ impl DeltaLayer {
        })
    }

-    fn layer_name(&self) -> DeltaFileName {
-        self.desc.delta_file_name()
-    }
-    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> PathBuf {
-        Self::path_for(
-            &self.path_or_conf,
-            &self.desc.tenant_id,
-            &self.desc.timeline_id,
-            &self.layer_name(),
-        )
-    }
-    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
-    ///
-    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
-        let inner = self
-            .load(LayerAccessKind::KeyIter, ctx)
-            .await
-            .context("load delta layer keys")?;
-        DeltaLayerInner::load_keys(inner)
-            .await
-            .context("Layer index is corrupted")
+    /// Path to the layer file
+    fn path(&self) -> PathBuf {
+        self.path.clone()
    }
 }

@@ -582,14 +361,14 @@ struct DeltaLayerWriterInner {

    tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,

-    blob_writer: BlobWriter<true>,
+    blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
 }

 impl DeltaLayerWriterInner {
    ///
    /// Start building a new delta layer.
    ///
-    async fn new(
+    fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_id: TenantId,
@@ -604,10 +383,11 @@ impl DeltaLayerWriterInner {
        // FIXME: throw an error instead?
        let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range);

-        let mut file = VirtualFile::create(&path).await?;
+        let mut file = VirtualFile::create(&path)?;
        // make room for the header block
-        file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
-        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
+        file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
+        let buf_writer = BufWriter::new(file);
+        let blob_writer = WriteBlobWriter::new(buf_writer, PAGE_SZ as u64);

        // Initialize the b-tree index builder
        let block_buf = BlockBuf::new();
@@ -630,12 +410,11 @@ impl DeltaLayerWriterInner {
    ///
    /// The values must be appended in key, lsn order.
    ///
-    async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
+    fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
        self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
-            .await
    }

-    async fn put_value_bytes(
+    fn put_value_bytes(
        &mut self,
        key: Key,
        lsn: Lsn,
@@ -644,7 +423,7 @@ impl DeltaLayerWriterInner {
    ) -> anyhow::Result<()> {
        assert!(self.lsn_range.start <= lsn);

-        let off = self.blob_writer.write_blob(val).await?;
+        let off = self.blob_writer.write_blob(val)?;

        let blob_ref = BlobRef::new(off, will_init);

@@ -661,18 +440,18 @@ impl DeltaLayerWriterInner {
    ///
    /// Finish writing the delta layer.
    ///
-    async fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
+    fn finish(self, key_end: Key, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

-        let mut file = self.blob_writer.into_inner().await?;
+        let buf_writer = self.blob_writer.into_inner();
+        let mut file = buf_writer.into_inner()?;

        // Write out the index
        let (index_root_blk, block_buf) = self.tree.finish()?;
-        file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
-            .await?;
+        file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?;
        for buf in block_buf.blocks {
-            file.write_all(buf.as_ref()).await?;
+            file.write_all(buf.as_ref())?;
        }
        assert!(self.lsn_range.start < self.lsn_range.end);
        // Fill in the summary on blk 0
@@ -686,22 +465,11 @@ impl DeltaLayerWriterInner {
            index_start_blk,
            index_root_blk,
        };
-
-        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
-        Summary::ser_into(&summary, &mut buf)?;
-        if buf.spilled() {
-            // This is bad as we only have one free block for the summary
-            warn!(
-                "Used more than one page size for summary buffer: {}",
-                buf.len()
-            );
-        }
-        file.seek(SeekFrom::Start(0)).await?;
-        file.write_all(&buf).await?;
+        file.seek(SeekFrom::Start(0))?;
+        Summary::ser_into(&summary, &mut file)?;

        let metadata = file
            .metadata()
-            .await
            .context("get file metadata to determine size")?;

        // 5GB limit for objects without multipart upload (which we don't want to use)
@@ -718,37 +486,21 @@ impl DeltaLayerWriterInner {
        // Note: Because we opened the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
-        let layer = DeltaLayer {
-            path_or_conf: PathOrConf::Conf(self.conf),
-            desc: PersistentLayerDesc::new_delta(
-                self.tenant_id,
-                self.timeline_id,
-                self.key_start..key_end,
-                self.lsn_range.clone(),
-                metadata.len(),
-            ),
-            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
-        };
+
+        let desc = PersistentLayerDesc::new_delta(
+            self.tenant_id,
+            self.timeline_id,
+            self.key_start..key_end,
+            self.lsn_range.clone(),
+            metadata.len(),
+        );

        // fsync the file
-        file.sync_all().await?;
-        // Rename the file to its final name
-        //
-        // Note: This overwrites any existing file. There shouldn't be any.
-        // FIXME: throw an error instead?
-        let final_path = DeltaLayer::path_for(
-            &PathOrConf::Conf(self.conf),
-            &self.tenant_id,
-            &self.timeline_id,
-            &DeltaFileName {
-                key_range: self.key_start..key_end,
-                lsn_range: self.lsn_range,
-            },
-        );
-        std::fs::rename(self.path, &final_path)?;
+        file.sync_all()?;

-        trace!("created delta layer {}", final_path.display());
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
+
+        trace!("created delta layer {}", layer.local_path().display());

        Ok(layer)
    }
@@ -784,7 +536,7 @@ impl DeltaLayerWriter {
    ///
    /// Start building a new delta layer.
    ///
-    pub async fn new(
+    pub fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_id: TenantId,
@@ -792,10 +544,13 @@ impl DeltaLayerWriter {
        lsn_range: Range<Lsn>,
    ) -> anyhow::Result<Self> {
        Ok(Self {
-            inner: Some(
-                DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range)
-                    .await?,
-            ),
+            inner: Some(DeltaLayerWriterInner::new(
+                conf,
+                timeline_id,
+                tenant_id,
+                key_start,
+                lsn_range,
+            )?),
        })
    }

@@ -804,11 +559,11 @@ impl DeltaLayerWriter {
    ///
    /// The values must be appended in key, lsn order.
    ///
-    pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
-        self.inner.as_mut().unwrap().put_value(key, lsn, val).await
+    pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
+        self.inner.as_mut().unwrap().put_value(key, lsn, val)
    }

-    pub async fn put_value_bytes(
+    pub fn put_value_bytes(
        &mut self,
        key: Key,
        lsn: Lsn,
@@ -819,7 +574,6 @@ impl DeltaLayerWriter {
            .as_mut()
            .unwrap()
            .put_value_bytes(key, lsn, val, will_init)
-            .await
    }

    pub fn size(&self) -> u64 {
@@ -829,33 +583,36 @@ impl DeltaLayerWriter {
    ///
    /// Finish writing the delta layer.
    ///
-    pub async fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
-        self.inner.take().unwrap().finish(key_end).await
+    pub(crate) fn finish(
+        mut self,
+        key_end: Key,
+        timeline: &Arc<Timeline>,
+    ) -> anyhow::Result<ResidentLayer> {
+        self.inner.take().unwrap().finish(key_end, timeline)
    }
 }

 impl Drop for DeltaLayerWriter {
    fn drop(&mut self) {
        if let Some(inner) = self.inner.take() {
-            // We want to remove the virtual file here, so it's fine to not
-            // having completely flushed unwritten data.
-            let vfile = inner.blob_writer.into_inner_no_flush();
-            vfile.remove();
+            match inner.blob_writer.into_inner().into_inner() {
+                Ok(vfile) => vfile.remove(),
+                Err(err) => warn!(
+                    "error while flushing buffer of image layer temporary file: {}",
+                    err
+                ),
+            }
        }
    }
 }

 impl DeltaLayerInner {
-    pub(super) async fn load(
-        path: &std::path::Path,
-        summary: Option<Summary>,
-    ) -> anyhow::Result<Self> {
+    pub(super) fn load(path: &std::path::Path, summary: Option<Summary>) -> anyhow::Result<Self> {
        let file = VirtualFile::open(path)
-            .await
            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
        let file = FileBlockReader::new(file);

-        let summary_blk = file.read_blk(0).await?;
+        let summary_blk = file.read_blk(0)?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

        if let Some(mut expected_summary) = summary {
@@ -958,14 +715,14 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
-        this: &T,
-    ) -> Result<Vec<DeltaEntry<'_>>> {
-        let dl = this.as_ref();
-        let file = &dl.file;
+    pub(super) async fn load_keys(&self) -> Result<Vec<DeltaEntry<'_>>> {
+        let file = &self.file;

-        let tree_reader =
-            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            file,
+        );

        let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();

@@ -978,7 +735,7 @@ impl DeltaLayerInner {
                    let val_ref = ValueRef {
                        blob_ref: BlobRef(value),
                        reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
-                            Adapter(dl),
+                            Adapter(self),
                        )),
                    };
                    let pos = BlobRef(value).pos();
@@ -1002,10 +759,61 @@ impl DeltaLayerInner {
        if let Some(last) = all_keys.last_mut() {
            // Last key occupies all space till end of value storage,
            // which corresponds to beginning of the index
-            last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
+            last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
        }
        Ok(all_keys)
    }
+
+    pub(super) async fn dump(&self) -> anyhow::Result<()> {
+        println!(
+            "index_start_blk: {}, root {}",
+            self.index_start_blk, self.index_root_blk
+        );
+
+        let file = &self.file;
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            file,
+        );
+
+        tree_reader.dump().await?;
+
+        let keys = self.load_keys().await?;
+
+        async fn dump_blob(val: ValueRef<'_>) -> anyhow::Result<String> {
+            let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
+            let val = Value::des(&buf)?;
+            let desc = match val {
+                Value::Image(img) => {
+                    format!(" img {} bytes", img.len())
+                }
+                Value::WalRecord(rec) => {
+                    let wal_desc = walrecord::describe_wal_record(&rec)?;
+                    format!(
+                        " rec {} bytes will_init: {} {}",
+                        buf.len(),
+                        rec.will_init(),
+                        wal_desc
+                    )
+                }
+            };
+            Ok(desc)
+        }
+
+        for entry in keys {
+            let DeltaEntry { key, lsn, val, .. } = entry;
+            let desc = match dump_blob(val).await {
+                Ok(desc) => desc,
+                Err(err) => {
+                    format!("ERROR: {err}")
+                }
+            };
+            println!("  key {key} at {lsn}: {desc}");
+        }
+
+        Ok(())
+    }
 }

 /// A set of data associated with a delta layer key and its value
@@ -1037,7 +845,13 @@ impl<'a> ValueRef<'a> {
 pub(crate) struct Adapter<T>(T);

 impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
-    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        self.0.as_ref().file.read_blk(blknum).await
+    pub(crate) fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+        self.0.as_ref().file.read_blk(blknum)
+    }
+}
+
+impl AsRef<DeltaLayerInner> for DeltaLayerInner {
+    fn as_ref(&self) -> &DeltaLayerInner {
+        self
    }
 }
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -212,7 +212,7 @@ pub enum LayerFileName {
 }

 impl LayerFileName {
-    pub fn file_name(&self) -> String {
+    pub(crate) fn file_name(&self) -> String {
        self.to_string()
    }

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Joonas Koivunen	396e6a4b62	test: rename previous test, cleanup, still does not work	2023-08-30 14:19:54 +03:00
Joonas Koivunen	3295d6245a	fix: provide better context for the other test	2023-08-30 14:19:54 +03:00
Joonas Koivunen	a4bfdd8013	test: actually duplicate L1 layer in test	2023-08-30 14:19:54 +03:00
Joonas Koivunen	cd869e5737	handle duplicate l1s almost safely there is still the possibility of the old layer not really being readable or returning bogus values if the internal representation has shifted. cannot do anything for that... unless we rename them away, or something like that.	2023-08-30 14:19:54 +03:00
Joonas Koivunen	544136f13f	fix: require ResidentLayer to keep resident it only matters when calling this, it can get "evicted" after, even though, its file will probably not be readable.	2023-08-30 10:31:56 +03:00
Joonas Koivunen	1cbb7fccaa	allow keeping a duplicate resident except that the residentness is not enforced right now.	2023-08-30 10:02:08 +03:00
Joonas Koivunen	9ab35cc5d2	fix: upload compacted layers	2023-08-30 09:45:37 +03:00
Joonas Koivunen	bb03c74217	refactor: rename for_written_tempfile -> finish_creating	2023-08-29 15:57:34 +03:00
Joonas Koivunen	774f34d778	refactor: move all metrics updates to layer this fixes some missing increments for num_persistent_files_created, persistent_bytes_written and removes double entries for residence events.	2023-08-29 15:45:03 +03:00
Joonas Koivunen	f5a171076b	doc: comment about chance of both evictions selecting same layer	2023-08-29 15:20:25 +03:00
Joonas Koivunen	881fdbc04c	doc: fix broken link	2023-08-29 14:34:06 +03:00
Joonas Koivunen	8a27e58894	doc: fix outdated commit	2023-08-29 14:19:53 +03:00
Joonas Koivunen	b4c81f7dff	refactor: rename LayerInner::on_drop to on_downloaded_layer_drop	2023-08-29 14:13:09 +03:00
Joonas Koivunen	9151b71b19	doc: fix link	2023-08-29 13:25:36 +03:00
Joonas Koivunen	8348bc9b1a	test: allow witnessing stopping before broken	2023-08-29 12:24:39 +03:00
Joonas Koivunen	6e47438f43	fixup residentlayer comment	2023-08-29 12:16:45 +03:00
Joonas Koivunen	579e85e92d	doc: assert &Arc<LayerInner> and DownloadedLayer::owner	2023-08-29 12:16:27 +03:00
Joonas Koivunen	53d2b48ea2	doc: adjust more LayerInner::on_drop	2023-08-29 12:15:46 +03:00
Joonas Koivunen	ac6604b6ed	doc: adjust while in queue	2023-08-29 12:06:47 +03:00
Joonas Koivunen	91b64427ed	doc: remove comment about backoff	2023-08-29 12:04:34 +03:00
Joonas Koivunen	ffe0f90083	doc: when => while	2023-08-29 11:59:24 +03:00
Joonas Koivunen	59b5a55dbf	doc: create guard => new download has been started	2023-08-29 11:57:03 +03:00
Joonas Koivunen	b9290c7005	doc: simplify comment	2023-08-29 11:55:53 +03:00
Joonas Koivunen	302a58e8ea	image/deltalayer: shuffle comments around	2023-08-29 11:55:08 +03:00
Joonas Koivunen	def51361ae	doc: drop comment in favor of drop_eviction_guard	2023-08-29 11:14:12 +03:00
Joonas Koivunen	d67d4b3eee	doc: add validation	2023-08-29 11:11:27 +03:00
Joonas Koivunen	cd1b548a8f	doc: explain DownloadedLayer::get owner param	2023-08-29 11:07:14 +03:00
Joonas Koivunen	282372aa5a	reorder: 1. DownloadedLayer, 2. ResidentLayer	2023-08-29 11:06:51 +03:00
Joonas Koivunen	1f0cd3b50e	doc: note running without remote storage again	2023-08-29 11:06:29 +03:00
Joonas Koivunen	4973419a38	doc: cancellation safety with evict_and_wait	2023-08-29 11:06:12 +03:00
Joonas Koivunen	44ef584842	doc: residentlayer vs. downloadedlayer and eviction	2023-08-29 10:51:08 +03:00
Joonas Koivunen	55c42da91b	info: stop using stat we no longer need to use it because in the latter versions we initialize to correct on-filesystem state with Layer::for_{resident,evicted}.	2023-08-29 10:49:41 +03:00
Joonas Koivunen	5c343af807	doc: check_expected_download	2023-08-29 10:49:25 +03:00
Joonas Koivunen	87ecb2e6ca	reorder: get and get_or_apply_evictedness	2023-08-29 10:49:13 +03:00
Joonas Koivunen	c659d0f218	fix: subscribe before evicting	2023-08-29 10:48:55 +03:00
Joonas Koivunen	9f7688b1d2	doc: another pass on LayerInner	2023-08-29 10:48:36 +03:00
Joonas Koivunen	3edff352b5	doc: explain what the consecutive failures are for	2023-08-29 10:08:53 +03:00
Joonas Koivunen	08680f6591	doc: typo	2023-08-28 16:55:28 +03:00
Joonas Koivunen	55105ad1c3	refactor: Result<(), NeedsDownload>	2023-08-28 16:53:32 +03:00
Joonas Koivunen	df328758f0	refactor: simplify schedule upload and tests	2023-08-28 16:33:01 +03:00
Joonas Koivunen	d5ac61d566	doc: add cancellation safe comment	2023-08-28 14:41:43 +03:00
Joonas Koivunen	355ea43ac7	eviction_task: remove confusing drop(candidates)	2023-08-28 14:36:34 +03:00
Joonas Koivunen	6f0ab326b4	doc: inmemlayer: cleanup comments	2023-08-28 14:24:38 +03:00
Joonas Koivunen	c06a4fb511	doc: delete fixme about gentlemans agreements and strings	2023-08-28 14:19:11 +03:00
Joonas Koivunen	9a714ac6b8	doc: link to LayerMap::search	2023-08-28 14:10:49 +03:00
Joonas Koivunen	8c21edc9c5	doc: cleanup, add missing "the"	2023-08-28 14:10:49 +03:00
Joonas Koivunen	6ff324a12d	doc: link to inmemorylayer	2023-08-28 14:10:49 +03:00
Joonas Koivunen	090f9a5a80	doc: remove obsolete comment	2023-08-28 14:01:56 +03:00
Joonas Koivunen	74aefa0b07	heavier_once_cell: explain away the unsynchronized	2023-08-28 13:59:08 +03:00
Joonas Koivunen	ce1abef0bd	doc: fix typo	2023-08-28 13:53:57 +03:00
Joonas Koivunen	53eacacb6b	botched rebase: lost impl AsRef<DeltaLayerInner>	2023-08-28 13:47:03 +03:00
Joonas Koivunen	d40b9a515a	refactor: split guard_against_eviction into three - download - keep_resident - download_and_keep_resident No need to bool enum.	2023-08-28 13:35:53 +03:00
Joonas Koivunen	7c2f687bd6	rename: garbage_collect => &_on_drop	2023-08-28 13:35:53 +03:00
Joonas Koivunen	effc151244	fix: allow dropping from UploadQueue by spawn_blocking	2023-08-28 13:35:53 +03:00
Joonas Koivunen	bdfc895642	layer: remove dead comment and code	2023-08-28 13:35:53 +03:00
Joonas Koivunen	96161c8cfd	restore Layer::dump	2023-08-28 13:35:47 +03:00
Joonas Koivunen	da99399d16	doc: minor fixes	2023-08-28 13:34:34 +03:00
Joonas Koivunen	7eb74d3720	test: fix allowed error typo	2023-08-28 13:34:34 +03:00
Joonas Koivunen	7b39681caf	fix: delete and only then report evicted	2023-08-28 13:34:34 +03:00
Joonas Koivunen	5ff5c580ad	test: fix test_timeline_deletion_with_files_stuck_in_upload_queue string change	2023-08-28 13:34:34 +03:00
Joonas Koivunen	6ccc6cbc69	refactor: minor cleanup, doc	2023-08-28 13:34:34 +03:00
Joonas Koivunen	2ecf6727c5	refactor: split evicting	2023-08-28 13:34:34 +03:00
Joonas Koivunen	f957616f1c	refactor: split get_or_maybe_download	2023-08-28 13:34:34 +03:00
Joonas Koivunen	b154a5e908	doc: few touches	2023-08-28 13:34:34 +03:00
Joonas Koivunen	c66e859bcc	try to apply backoff after download might not work as we could get cancelled, but doing it right before seems wrong as well. We already retry the download.	2023-08-28 13:34:34 +03:00
Joonas Koivunen	1559ef36af	fix: rename the written out file in Layer ctor	2023-08-28 13:34:34 +03:00
Joonas Koivunen	ef1c3d3914	test: use guard_against_eviction from outside	2023-08-28 13:34:34 +03:00
Joonas Koivunen	83e28083b0	test: migrate to Layer::for_resident	2023-08-28 13:34:34 +03:00
Joonas Koivunen	0950f8c752	refactor: Layer initialization	2023-08-28 13:34:34 +03:00
Joonas Koivunen	41d36b65e2	move Layer and all to storage_layer::layer	2023-08-28 13:34:34 +03:00
Joonas Koivunen	d8cb81118a	reorder, get rid of TODO	2023-08-28 13:34:34 +03:00
Joonas Koivunen	ecf34bb3e4	blanket rename	2023-08-28 13:34:34 +03:00
Joonas Koivunen	fb4d404553	refactor: LayerManager, remove arc	2023-08-28 13:34:34 +03:00
Joonas Koivunen	450f79b3f5	refactor: fix residency and metrics to layermanager	2023-08-28 13:34:34 +03:00
Joonas Koivunen	a47b7d1d4c	LayerE::drop comments	2023-08-28 13:34:34 +03:00
Joonas Koivunen	b01022d8df	drop TODO about better load time api	2023-08-28 13:34:34 +03:00
Joonas Koivunen	0155ff95e7	doc: address review comment by jcsp	2023-08-28 13:34:34 +03:00
Joonas Koivunen	46b6a1a5e8	review comment: xref tested string	2023-08-28 13:34:34 +03:00
Joonas Koivunen	290f121b59	======= address reviews	2023-08-28 13:34:34 +03:00
Joonas Koivunen	786ddeff62	layere: comment cleanup	2023-08-28 13:34:34 +03:00
Joonas Koivunen	732e155b8e	fixup remove ability to have 'static DeltaEntry	2023-08-28 13:34:32 +03:00
Joonas Koivunen	e10c5b0a9b	needsdownload: remove unused	2023-08-28 12:49:56 +03:00
Joonas Koivunen	b608eaa410	layere: remove unused LayerE::new	2023-08-28 12:49:56 +03:00
Joonas Koivunen	7b2ae073f0	cleanup unused code, comments	2023-08-28 12:49:56 +03:00
Joonas Koivunen	d4a7bdad55	fix: move metric updates to finish_compact_l0	2023-08-28 12:49:56 +03:00
Joonas Koivunen	96c9fd330c	fix: duplicate residency events on flushing l0	2023-08-28 12:49:56 +03:00
Joonas Koivunen	39b85cc6fd	layere: record residency changes with download/evict	2023-08-28 12:49:56 +03:00
Joonas Koivunen	eccb868a50	doc: consider cancellation and redownload	2023-08-28 12:49:56 +03:00
Joonas Koivunen	de93c70f2f	provide and use LayerE::for_evicted	2023-08-28 12:49:56 +03:00
Joonas Koivunen	72430eb539	layere: remove impossible error case	2023-08-28 12:49:56 +03:00
Joonas Koivunen	e2443a0147	remove ability to have 'static DeltaEntry	2023-08-28 12:49:55 +03:00
Joonas Koivunen	e5d00b6c2a	fix: compaction can and should use borrowed DeltaEntrys otherwise we risk evicting the L0 while we are reading different blobs out of it.	2023-08-28 12:46:55 +03:00
Joonas Koivunen	dc97970215	chore(ref): make pub	2023-08-28 12:46:53 +03:00
Joonas Koivunen	bc5a643c19	fix: use new LayerE::for_resident	2023-08-28 12:43:11 +03:00
Joonas Koivunen	b1134f6857	layere: add new ctor	2023-08-28 12:43:11 +03:00
Joonas Koivunen	04ab9b78fe	test: add more allowed outcomes I cannot see a quick fix to make one them winner, nor a reason why it should be done; at worst case there could be double accounting for some evicted layer should the two do it at the same time.	2023-08-28 12:43:11 +03:00
Joonas Koivunen	fa0b881c4c	remove previous generation of Layer, PersistentLayer - remove get_value_reconstruct_data for Delta, Image - remove unnecessary default trait method - remove trait PersistentLayer - remove unused {Delta,Image}Layer::new() - continued dead code removal - unify ImageLayer to be like DeltaLayer - dead code and imports cleanup - remove PathOrConfig - correct few doc links re: Layer removal	2023-08-28 12:43:11 +03:00
Joonas Koivunen	106bda1ef9	layer_map: finally remove replacement related tests	2023-08-28 12:43:11 +03:00
Joonas Koivunen	c2de71e1fe	test: fix test_timeline_deletion_with_files_stuck_in_upload_queue, global allowed_error	2023-08-28 12:43:11 +03:00
Joonas Koivunen	3eba531c3d	test: fixup layere error type introduction, string change	2023-08-28 12:43:11 +03:00
Joonas Koivunen	088bea8680	test: add hint about mismatch cause	2023-08-28 12:43:11 +03:00
Joonas Koivunen	a9b0ac92bc	layere: decrement resident size if removed make the problem of not knowing more explicit.	2023-08-28 12:43:11 +03:00
Joonas Koivunen	3045956ddd	refactor(LayerE): use new internal api	2023-08-28 12:43:11 +03:00
Joonas Koivunen	599069b612	eviction: remove comment	2023-08-28 12:43:11 +03:00
Joonas Koivunen	54873844c2	layere: introduce internal error type	2023-08-28 12:43:11 +03:00
Joonas Koivunen	dfdd41a771	layere: move task_name closer	2023-08-28 12:43:11 +03:00
Joonas Koivunen	12763ca312	layere: reset wanted_evicted only if downloading	2023-08-28 12:43:11 +03:00
Joonas Koivunen	4c80c8c1ab	test: fix changed string (no more remote layer remote) this should be the only one: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/5922954346/index.html#suites/7745dadbd815ab87f5798aa881796f47/96ce406b6d6a7427	2023-08-28 12:43:11 +03:00
Joonas Koivunen	acd2e7f222	timeline: fix test after wait_and_evict	2023-08-28 12:43:11 +03:00
Joonas Koivunen	a6b6dd2f36	timeline: reflect evict_and_wait in tests	2023-08-28 12:43:11 +03:00
Joonas Koivunen	0fd14ad74b	timeline: remove GenericRemoteStorage exposure	2023-08-28 12:43:11 +03:00
Joonas Koivunen	52eaa52573	wip: get rid of LayerE::evict	2023-08-28 12:43:11 +03:00
Joonas Koivunen	1e33692c1c	comment out the delta dumping test	2023-08-28 12:43:11 +03:00
Joonas Koivunen	f82ba477a4	http: limit genericremotestorage exposure	2023-08-28 12:43:11 +03:00
Joonas Koivunen	761644af25	dube: adjust comments	2023-08-28 12:43:11 +03:00
Joonas Koivunen	cd12d97ba7	dube: limit GenericRemoteStorage need	2023-08-28 12:43:11 +03:00
Joonas Koivunen	1e4ded860c	refactor: move LayerE::get	2023-08-28 12:43:11 +03:00
Joonas Koivunen	a0f29853b3	layere: rewrite to heavier_once_cell	2023-08-28 12:43:11 +03:00
Joonas Koivunen	c4cdf747f8	add heavier_once_cell	2023-08-28 12:43:11 +03:00
Joonas Koivunen	e658f16810	test_download_remote_layers_api: fix expected string	2023-08-28 12:43:11 +03:00
Joonas Koivunen	a4b4305422	cleanup while hunting for test_gc_cutoff problem	2023-08-28 12:43:11 +03:00
Joonas Koivunen	d8807eb651	test: fix test_broken_timeline (string matching)	2023-08-28 12:43:11 +03:00
Joonas Koivunen	1500f711f3	test: this test no longer makes sense, we dont replace	2023-08-28 12:43:11 +03:00
Joonas Koivunen	dafa42eb71	test: fix test assuming race between compaction and upload	2023-08-28 12:43:11 +03:00
Joonas Koivunen	b5e5ead2ee	delta_layer: allow unused load_keys	2023-08-28 12:43:09 +03:00
Joonas Koivunen	24251b8d17	botched rebase: added block_on code	2023-08-28 12:42:28 +03:00
Joonas Koivunen	e6378197a7	timeline: drop 2 indentation from get_reconstruct_value	2023-08-28 12:42:28 +03:00
Joonas Koivunen	4bb0cc2fe4	gc: reflect LayerE now managing remote client	2023-08-28 12:42:28 +03:00
Joonas Koivunen	9304f42ea5	tenant: remove unused imports	2023-08-28 12:42:28 +03:00
Joonas Koivunen	18f4eb2622	timeline: remove compare_arced_layers	2023-08-28 12:42:28 +03:00
Joonas Koivunen	2caa8bcc23	layer_manager: more clippy	2023-08-28 12:42:28 +03:00
Joonas Koivunen	bb222abde1	layer_manager: remove metrics, dependend by gc/compaction	2023-08-28 12:42:28 +03:00
Joonas Koivunen	dd2b4ad26f	layer_manager: cleanup unused	2023-08-28 12:42:28 +03:00
Joonas Koivunen	b65cb8ea05	storage_layer: we no longer clone for residency change	2023-08-28 12:42:28 +03:00
Joonas Koivunen	3a7efc10a0	timeline: cleanup unused imports	2023-08-28 12:42:28 +03:00
Joonas Koivunen	a682de1dba	Timeline: get_value_reconstruct_data: avoid warnings	2023-08-28 12:42:28 +03:00
Joonas Koivunen	45a542c335	tenant: pub(crate)	2023-08-28 12:42:28 +03:00
Joonas Koivunen	bdb98b288e	==== reprocessed	2023-08-28 12:42:28 +03:00
Joonas Koivunen	a06c8e9add	inmemory_layer: drop impl Layer	2023-08-28 12:42:28 +03:00
Joonas Koivunen	18eefd61eb	inmemory_layer: less warnings	2023-08-28 12:42:28 +03:00
Joonas Koivunen	a2de0574b5	inmemorylayer: flush integration, partial?	2023-08-28 12:42:28 +03:00
Joonas Koivunen	aa8e954197	layer_manager, layer_access_stats: remove witness sidequest it was interfering with me moving responsibilities back and forth	2023-08-28 12:42:28 +03:00
Joonas Koivunen	30847e59b9	remove remote_layer instead of keep fixing	2023-08-28 12:42:28 +03:00
Joonas Koivunen	d930a581f8	layer_manager: compaction changes	2023-08-28 12:42:28 +03:00
Joonas Koivunen	931c22545b	compaction: clippies	2023-08-28 12:42:28 +03:00
Joonas Koivunen	366f3c8ff8	compaction: reflect LayerE now managing remote client	2023-08-28 12:42:28 +03:00
Joonas Koivunen	26c39d7b4c	timeline: pub(crate) compaction	2023-08-28 12:42:28 +03:00
Joonas Koivunen	6ffa5138ce	compaction: upload index on success otherwise test_gc_of_remote_layers fails	2023-08-28 12:42:28 +03:00
Joonas Koivunen	975e1558cc	compaction: integration	2023-08-28 12:42:28 +03:00
Joonas Koivunen	82a955ebfe	layer_manager: resident layer flush l0 changes	2023-08-28 12:42:28 +03:00
Joonas Koivunen	82596f8807	delta_layer: reflect non-async LayerE::for_written	2023-08-28 12:42:28 +03:00
Joonas Koivunen	e3e57579a1	integrate: download_all_layers this time around with graceful cancellation.	2023-08-28 12:42:28 +03:00
Joonas Koivunen	56d551e6a3	==== reprocessed	2023-08-28 12:42:28 +03:00
Joonas Koivunen	b4a0f8baf7	integrate: Timeline::get_value_reconstruct_data	2023-08-28 12:42:28 +03:00
Joonas Koivunen	2e686ed6ea	eviction: integration - evictiontask: remove unused imports - eviction_task and dube: cleanup - timeline: pub(crate) eviction - timeline: adjust to "layere: adjust eviction" - test: remove layer_eviction_aba_fails because it can no longer happen - test: fix up evicts later test with ability to await for eviction - eviction_task: more unused imports - eviction: clippy - eviction_task: more clippy - fixup eviction: docs - eviction: hold only Arc<Layer> after checking downloadedness - refactor earlier eviction: use drop_eviction_guard instead - dube: evict in spawned tasks - timeline, eviction: evict in spawned - eviction: add more errors - evict_layers: remove witness - eviction: post-witness forgotten panic - eviction: remove blog references	2023-08-28 12:42:28 +03:00
Joonas Koivunen	c46f72d411	create_image_layers integration (multifile)	2023-08-28 12:42:28 +03:00
Joonas Koivunen	ed46713e5c	remote_timeline_client: pub(crate) on upload	2023-08-28 12:42:28 +03:00
Joonas Koivunen	c228cb7b3f	remote_timeline_client: continued integration work post ResidentLayer	2023-08-28 12:42:28 +03:00
Joonas Koivunen	c88e4a0974	layer_manager: integration	2023-08-28 12:42:28 +03:00
Joonas Koivunen	b71a2f4cb2	load_layer_map: integration	2023-08-28 12:42:23 +03:00
Joonas Koivunen	98a7b090de	========= layere	2023-08-28 12:41:31 +03:00
Joonas Koivunen	235a8cbd28	wip: LayerE	2023-08-28 12:41:31 +03:00
Joonas Koivunen	6afeb3c6f7	========= unrelated	2023-08-28 12:41:31 +03:00
Joonas Koivunen	8c42aeac9f	test: move log to assert message	2023-08-28 12:41:31 +03:00
Joonas Koivunen	f36658ac10	layerdesc: add from_filename	2023-08-28 12:41:31 +03:00
Joonas Koivunen	f8227da9da	unrelated: layer_manager: avoid arc cloning	2023-08-28 12:41:31 +03:00