make proxy protocol non-optional

proxy: metrics2 (#5179 )
## Problem We need to count metrics always when a connection is open. Not only when the transfer is 0. We also need to count bytes usage for HTTP. ## Summary of changes New structure for usage metrics. A `DashMap<Ids, Arc<Counters>>`. If the arc has 1 owner (the map) then I can conclude that no connections are open. If the counters has "open_connections" non zero, then I can conclude a new connection was opened in the last interval and should be reported on. Also, keep count of how many bytes processed for HTTP and report it here.
2026-03-16 14:50:37 +00:00 · 2023-09-28 14:20:19 +01:00 · 2023-09-28 11:38:26 +01:00 · 2023-09-28 12:25:20 +03:00 · 2023-09-28 10:07:11 +01:00 · 2023-09-28 09:52:39 +02:00
270 changed files with 25414 additions and 6587 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -14,10 +14,12 @@
 !pgxn/
 !proxy/
 !safekeeper/
+!s3_scrubber/
 !storage_broker/
 !trace/
 !vendor/postgres-v14/
 !vendor/postgres-v15/
+!vendor/postgres-v16/
 !workspace_hack/
 !neon_local/
 !scripts/ninstall.sh
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -0,0 +1,8 @@
+self-hosted-runner:
+  labels:
+    - gen3
+    - large
+    - small
+    - us-east-2
+config-variables:
+  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -70,6 +70,9 @@ runs:
        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
        path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
        prefix: latest
+        # The lack of compatibility snapshot (for example, for the new Postgres version)
+        # shouldn't fail the whole job. Only relevant test should fail.
+        skip-if-does-not-exist: true

    - name: Checkout
      if: inputs.needs_postgres_source == 'true'
@@ -145,7 +148,11 @@ runs:

        if [ "${RERUN_FLAKY}" == "true" ]; then
          mkdir -p $TEST_OUTPUT
-          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"
+          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" \
+                                              --days 7 \
+                                              --output "$TEST_OUTPUT/flaky.json" \
+                                              --pg-version "${DEFAULT_PG_VERSION}" \
+                                              --build-type "${BUILD_TYPE}"

          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
        fi
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -0,0 +1,31 @@
+name: Lint GitHub Workflows
+
+on:
+  push:
+    branches:
+      - main
+      - release
+    paths:
+      - '.github/workflows/*.ya?ml'
+  pull_request:
+    paths:
+      - '.github/workflows/*.ya?ml'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  actionlint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: reviewdog/action-actionlint@v1
+        env:
+          # SC2046 - Quote this to prevent word splitting. - https://www.shellcheck.net/wiki/SC2046
+          # SC2086 - Double quote to prevent globbing and word splitting. - https://www.shellcheck.net/wiki/SC2086
+          SHELLCHECK_OPTS: --exclude=SC2046,SC2086
+        with:
+          fail_on_error: true
+          filter_mode: nofilter
+          level: error
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -2,7 +2,9 @@ name: Handle `approved-for-ci-run` label
 # This workflow helps to run CI pipeline for PRs made by external contributors (from forks).

 on:
-  pull_request:
+  pull_request_target:
+    branches:
+      - main
    types:
      # Default types that triggers a workflow ([1]):
      # - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
@@ -14,42 +16,103 @@ on:
      # Actual magic happens here:
      - labeled

+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+
 env:
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  PR_NUMBER: ${{ github.event.pull_request.number }}
+  BRANCH: "ci-run/pr-${{ github.event.pull_request.number }}"
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}

 jobs:
  remove-label:
    # Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
    # The PR should be reviewed and labelled manually again.

-    runs-on: [ ubuntu-latest ]
+    permissions:
+      pull-requests: write # For `gh pr edit`

    if: |
      contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')

+    runs-on: ubuntu-latest
+
    steps:
      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"

-  create-branch:
-    # Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
+  create-or-update-pr-for-ci-run:
+    # Create local PR for an `approved-for-ci-run` labelled PR to run CI pipeline in it.

-    runs-on: [ ubuntu-latest ]
+    permissions:
+      pull-requests: write # for `gh pr edit`
+      # For `git push` and `gh pr create` we use CI_ACCESS_TOKEN

    if: |
      github.event.action == 'labeled' &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')

+    runs-on: ubuntu-latest
+
    steps:
      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"

      - uses: actions/checkout@v3
        with:
          ref: main
+          token: ${{ secrets.CI_ACCESS_TOKEN }}

      - run: gh pr checkout "${PR_NUMBER}"

-      - run: git checkout -b "ci-run/pr-${PR_NUMBER}"
+      - run: git checkout -b "${BRANCH}"

-      - run: git push --force origin "ci-run/pr-${PR_NUMBER}"
+      - run: git push --force origin "${BRANCH}"
+
+      - name: Create a Pull Request for CI run (if required)
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          cat << EOF > body.md
+            This Pull Request is created automatically to run the CI pipeline for #${PR_NUMBER}
+
+            Please do not alter or merge/close it.
+
+            Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
+          EOF
+
+          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
+          if [ -z "${ALREADY_CREATED}" ]; then
+            gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
+                                                       --body-file "body.md" \
+                                                       --head "${BRANCH}" \
+                                                       --base "main" \
+                                                       --draft
+          fi
+
+  cleanup:
+    # Close PRs and delete branchs if the original PR is closed.
+
+    permissions:
+      contents: write # for `--delete-branch` flag in `gh pr close`
+      pull-requests: write # for `gh pr close`
+
+    if: |
+      github.event.action == 'closed' &&
+      github.event.pull_request.head.repo.full_name != github.repository
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch
+        run: |
+          CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --json 'closed' --jq '.[].closed')"
+          if [ "${CLOSED}" == "false" ]; then
+            gh pr --repo "${GITHUB_REPOSITORY}" close "${BRANCH}" --delete-branch
+          fi
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -117,6 +117,7 @@ jobs:
    outputs:
      pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
      olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
+      tpch-compare-matrix: ${{ steps.tpch-compare-matrix.outputs.matrix }}

    steps:
    - name: Generate matrix for pgbench benchmark
@@ -136,11 +137,11 @@ jobs:
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
                                                   { "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi

-        echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

    - name: Generate matrix for OLAP benchmarks
      id: olap-compare-matrix
@@ -152,11 +153,30 @@ jobs:
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres" },
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
                                                   { "platform": "rds-aurora"   }]')
        fi

-        echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
+
+    - name: Generate matrix for TPC-H benchmarks
+      id: tpch-compare-matrix
+      run: |
+        matrix='{
+          "platform": [
+            "neon-captest-reuse"
+          ],
+          "scale": [
+            "10"
+          ]
+        }'
+
+        if [ "$(date +%A)" = "Saturday" ]; then
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
+                                                   { "platform": "rds-aurora",   "scale": "10" }]')
+        fi
+
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

  pgbench-compare:
    needs: [ generate-matrices ]
@@ -233,7 +253,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        psql ${CONNSTR} -c "SELECT version();"
+        QUERY="SELECT version();"
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+        fi
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Benchmark init
      uses: ./.github/actions/run-python-test-set
@@ -358,7 +382,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        psql ${CONNSTR} -c "SELECT version();"
+        QUERY="SELECT version();"
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+        fi
+        psql ${CONNSTR} -c "${QUERY}"

    - name: ClickBench benchmark
      uses: ./.github/actions/run-python-test-set
@@ -372,6 +400,7 @@ jobs:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        TEST_OLAP_SCALE: 10

    - name: Create Allure report
      if: ${{ !cancelled() }}
@@ -398,7 +427,7 @@ jobs:

    strategy:
      fail-fast: false
-      matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
+      matrix: ${{ fromJson(needs.generate-matrices.outputs.tpch-compare-matrix) }}

    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
@@ -407,6 +436,7 @@ jobs:
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.platform }}
+      TEST_OLAP_SCALE: ${{ matrix.scale }}

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
@@ -428,18 +458,17 @@ jobs:
        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH

-    - name: Set up Connection String
-      id: set-up-connstr
+    - name: Get Connstring Secret Name
      run: |
        case "${PLATFORM}" in
          neon-captest-reuse)
-            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }}
+            ENV_PLATFORM=CAPTEST_TPCH
            ;;
          rds-aurora)
-            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR }}
+            ENV_PLATFORM=RDS_AURORA_TPCH
            ;;
          rds-postgres)
-            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }}
+            ENV_PLATFORM=RDS_AURORA_TPCH
            ;;
          *)
            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
@@ -447,9 +476,21 @@ jobs:
            ;;
        esac

+        CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR"
+        echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV
+
+    - name: Set up Connection String
+      id: set-up-connstr
+      run: |
+        CONNSTR=${{ secrets[env.CONNSTR_SECRET_NAME] }}
+
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        psql ${CONNSTR} -c "SELECT version();"
+        QUERY="SELECT version();"
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+        fi
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Run TPC-H benchmark
      uses: ./.github/actions/run-python-test-set
@@ -463,6 +504,7 @@ jobs:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        TEST_OLAP_SCALE: ${{ matrix.scale }}

    - name: Create Allure report
      if: ${{ !cancelled() }}
@@ -534,7 +576,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        psql ${CONNSTR} -c "SELECT version();"
+        QUERY="SELECT version();"
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+        fi
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Run user examples
      uses: ./.github/actions/run-python-test-set
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -5,7 +5,6 @@ on:
    branches:
      - main
      - release
-      - ci-run/pr-*
  pull_request:

 defaults:
@@ -24,7 +23,30 @@ env:
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

 jobs:
+  check-permissions:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Disallow PRs from forks
+      if: |
+        github.event_name == 'pull_request' &&
+        github.event.pull_request.head.repo.full_name != github.repository
+
+      run: |
+        if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
+          MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
+        else
+          MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
+        fi
+
+        echo >&2 "We don't run CI for PRs from forks"
+        echo >&2 "${MESSAGE}"
+
+        exit 1
+
+
  tag:
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    outputs:
@@ -53,6 +75,7 @@ jobs:
        id: build-tag

  check-codestyle-python:
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -85,6 +108,7 @@ jobs:
        run: poetry run mypy .

  check-codestyle-rust:
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -151,6 +175,7 @@ jobs:
        run: cargo deny check

  build-neon:
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -187,7 +212,7 @@ jobs:
          # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603

          FAILED=false
-          for postgres in postgres-v14 postgres-v15; do
+          for postgres in postgres-v14 postgres-v15 postgres-v16; do
            expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
            actual=$(git rev-parse "HEAD:vendor/${postgres}")
            if [ "${expected}" != "${actual}" ]; then
@@ -209,6 +234,10 @@ jobs:
        id: pg_v15_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT

+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
      # Set some environment variables used by all the steps.
      #
      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
@@ -229,10 +258,12 @@ jobs:
            cov_prefix=""
            CARGO_FLAGS="--locked --release"
          fi
-          echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
-          echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
-          echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
-          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
+          {
+            echo "cov_prefix=${cov_prefix}"
+            echo "CARGO_FEATURES=${CARGO_FEATURES}"
+            echo "CARGO_FLAGS=${CARGO_FLAGS}"
+            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
+          } >> $GITHUB_ENV

      # Disabled for now
      # Don't include the ~/.cargo/registry/src directory. It contains just
@@ -267,6 +298,13 @@ jobs:
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
        run: mold -run make postgres-v14 -j$(nproc)
@@ -275,6 +313,10 @@ jobs:
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
        run: mold -run make postgres-v15 -j$(nproc)

+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
      - name: Build neon extensions
        run: mold -run make neon-pg-ext -j$(nproc)

@@ -348,17 +390,17 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
+    needs: [ check-permissions, build-neon ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
-    needs: [ build-neon ]
    strategy:
      fail-fast: false
      matrix:
        build_type: [ debug, release ]
-        pg_version: [ v14, v15 ]
+        pg_version: [ v14, v15, v16 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -386,12 +428,12 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  benchmarks:
+    needs: [ check-permissions, build-neon ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
-    needs: [ build-neon ]
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    strategy:
      fail-fast: false
@@ -418,12 +460,13 @@ jobs:
      # while coverage is currently collected for the debug ones

  create-test-report:
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
+    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
+
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
-    needs: [ regress-tests, benchmarks ]
-    if: ${{ !cancelled() }}

    steps:
      - uses: actions/checkout@v3
@@ -449,42 +492,40 @@ jobs:
              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
            }

+            const coverage = {
+              coverageUrl: "${{ needs.coverage-report.outputs.coverage-html }}",
+              summaryJsonUrl: "${{ needs.coverage-report.outputs.coverage-json }}",
+            }
+
            const script = require("./scripts/comment-test-report.js")
            await script({
              github,
              context,
              fetch,
              report,
+              coverage,
            })

  coverage-report:
+    needs: [ check-permissions, regress-tests ]
+
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
-    needs: [ regress-tests ]
    strategy:
      fail-fast: false
      matrix:
        build_type: [ debug ]
+    outputs:
+        coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
+        coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
-          fetch-depth: 1
-
-#      Disabled for now
-#      - name: Restore cargo deps cache
-#        id: cache_cargo
-#        uses: actions/cache@v3
-#        with:
-#          path: |
-#            ~/.cargo/registry/
-#            !~/.cargo/registry/src
-#            ~/.cargo/git/
-#            target/
-#          key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
+          fetch-depth: 0

      - name: Get Neon artifact
        uses: ./.github/actions/download
@@ -527,13 +568,45 @@ jobs:
          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT

+      - name: Build coverage report NEW
+        id: upload-coverage-report-new
+        env:
+          BUCKET: neon-github-public-dev
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        run: |
+          BASELINE="$(git merge-base HEAD origin/main)"
+          CURRENT="${COMMIT_SHA}"
+
+          cp /tmp/coverage/report/lcov.info ./${CURRENT}.info
+
+          GENHTML_ARGS="--ignore-errors path,unmapped,empty --synthesize-missing --demangle-cpp rustfilt --output-directory lcov-html ${CURRENT}.info"
+
+          # Use differential coverage if the baseline coverage exists.
+          # It can be missing if the coverage repoer wasn't uploaded yet or tests has failed on BASELINE commit.
+          if aws s3 cp --only-show-errors s3://${BUCKET}/code-coverage/${BASELINE}/lcov.info ./${BASELINE}.info; then
+            git diff ${BASELINE} ${CURRENT} -- '*.rs' > baseline-current.diff
+
+            GENHTML_ARGS="--baseline-file ${BASELINE}.info --diff-file baseline-current.diff ${GENHTML_ARGS}"
+          fi
+
+          genhtml ${GENHTML_ARGS}
+
+          aws s3 cp --only-show-errors --recursive ./lcov-html/ s3://${BUCKET}/code-coverage/${COMMIT_SHA}/lcov
+
+          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/index.html
+          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
+
+          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
+          echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
+
      - uses: actions/github-script@v6
        env:
          REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
+          REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        with:
          script: |
-            const { REPORT_URL, COMMIT_SHA } = process.env
+            const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env

            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner,
@@ -544,12 +617,21 @@ jobs:
              context: 'Code coverage report',
            })

+            await github.rest.repos.createCommitStatus({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              sha: `${COMMIT_SHA}`,
+              state: 'success',
+              target_url: `${REPORT_URL_NEW}`,
+              context: 'Code coverage report NEW',
+            })
+
  trigger-e2e-tests:
+    needs: [ check-permissions, promote-images, tag ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ promote-images, tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
@@ -590,8 +672,8 @@ jobs:
            }"

  neon-image:
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
-    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
@@ -638,7 +720,7 @@ jobs:

  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
-    needs: [ tag ]
+    needs: [ check-permissions, tag ]
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
@@ -683,17 +765,17 @@ jobs:
        run: rm -rf ~/.ecr

  compute-node-image:
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: gcr.io/kaniko-project/executor:v1.9.2-debug
      # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
      # Should be prevented by https://github.com/neondatabase/neon/issues/4281
      options: --add-host=download.osgeo.org:140.211.15.30
-    needs: [ tag ]
    strategy:
      fail-fast: false
      matrix:
-        version: [ v14, v15 ]
+        version: [ v14, v15, v16 ]
    defaults:
      run:
        shell: sh -eu {0}
@@ -737,50 +819,22 @@ jobs:
                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
                           --cleanup

-      # Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
-      # During the transition period we need to have extensions in both places (in S3 and in compute-node image),
-      # so we won't build extension twice, but extract them from compute-node.
-      #
-      # For now we use extensions image only for new custom extensitons
-      - name: Kaniko build extensions only
-        run: |
-          # Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't.
-          # Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
-          # it still fails with error:
-          #   error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
-          #
-          # Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
-          find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
-
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-                           --context . \
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
-                           --build-arg PG_VERSION=${{ matrix.version }} \
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
-                           --dockerfile Dockerfile.compute-node \
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --cleanup \
-                           --target postgres-extensions
-
      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

  vm-compute-node-image:
+    needs: [ check-permissions, tag, compute-node-image ]
    runs-on: [ self-hosted, gen3, large ]
-    needs: [ tag, compute-node-image ]
    strategy:
      fail-fast: false
      matrix:
-        version: [ v14, v15 ]
+        version: [ v14, v15, v16 ]
    defaults:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.16.3
+      VM_BUILDER_VERSION: v0.17.12

    steps:
      - name: Checkout
@@ -803,7 +857,7 @@ jobs:
        run: |
          ./vm-builder \
            -enable-file-cache \
-            -enable-monitor \
+            -cgroup-uid=postgres \
            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

@@ -812,7 +866,7 @@ jobs:
          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

  test-images:
-    needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
+    needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
    runs-on: [ self-hosted, gen3, small ]

    steps:
@@ -855,8 +909,8 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml down

  promote-images:
+    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: [ self-hosted, gen3, small ]
-    needs: [ tag, test-images, vm-compute-node-image ]
    container: golang:1.19-bullseye
    # Don't add if-condition here.
    # The job should always be run because we have dependant other jobs that shouldn't be skipped
@@ -876,6 +930,7 @@ jobs:
        run: |
          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16

      - name: Add latest tag to images
        if: |
@@ -886,10 +941,10 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest

      - name: Push images to production ECR
        if: |
@@ -900,10 +955,10 @@ jobs:
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest

      - name: Configure Docker Hub login
        run: |
@@ -915,6 +970,7 @@ jobs:
        run: |
          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
+          crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}

      - name: Push latest tags to Docker Hub
        if: |
@@ -925,66 +981,94 @@ jobs:
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest

      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

-  upload-postgres-extensions-to-s3:
-    if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
-       github.event_name != 'workflow_dispatch'
-    runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
-    needs: [ tag, promote-images ]
-    strategy:
-      fail-fast: false
-      matrix:
-        version: [ v14, v15 ]
-
-    env:
-      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
-      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
-      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
-      S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
-
+  trigger-custom-extensions-build-and-wait:
+    needs: [ check-permissions, tag ]
+    runs-on: ubuntu-latest
    steps:
-      - name: Pull postgres-extensions image
+      - name: Set PR's status to pending and request a remote CI test
        run: |
-          docker pull ${EXTENSIONS_IMAGE}
+          COMMIT_SHA=${{ github.event.pull_request.head.sha || github.sha }}
+          REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"

-      - name: Create postgres-extensions container
-        id: create-container
+          curl -f -X POST \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"state\": \"pending\",
+              \"context\": \"build-and-upload-extensions\",
+              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
+            }"
+
+          curl -f -X POST \
+          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/build_and_upload_extensions.yml/dispatches \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"ref\": \"main\",
+              \"inputs\": {
+                \"ci_job_name\": \"build-and-upload-extensions\",
+                \"commit_hash\": \"$COMMIT_SHA\",
+                \"remote_repo\": \"${{ github.repository }}\",
+                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
+                \"remote_branch_name\": \"${{ github.ref_name }}\"
+              }
+            }"
+
+      - name: Wait for extension build to finish
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
-          EID=$(docker create ${EXTENSIONS_IMAGE} true)
-          echo "EID=${EID}" >> $GITHUB_OUTPUT
+          TIMEOUT=1800 # 30 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer
+          INTERVAL=15 # try each N seconds

-      - name: Extract postgres-extensions from container
-        run: |
-          rm -rf ./extensions-to-upload # Just in case
-          mkdir -p extensions-to-upload
+          last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context

-          docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
-          docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
+          for ((i=0; i <= TIMEOUT; i+=INTERVAL)); do
+            sleep $INTERVAL

-      - name: Upload postgres-extensions to S3
-        run: |
-          for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
-            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
+            # Get statuses for the latest commit in the PR / branch
+            gh api \
+              -H "Accept: application/vnd.github+json" \
+              -H "X-GitHub-Api-Version: 2022-11-28" \
+              "/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha || github.sha }}" > statuses.json
+
+            # Get the latest status for the "build-and-upload-extensions" context
+            last_status=$(jq --raw-output '[.[] | select(.context == "build-and-upload-extensions")] | sort_by(.created_at)[-1].state' statuses.json)
+            if [ "${last_status}" = "pending" ]; then
+              # Extension build is still in progress.
+              continue
+            elif [ "${last_status}" = "success" ]; then
+              # Extension build is successful.
+              exit 0
+            else
+              # Status is neither "pending" nor "success", exit the loop and fail the job.
+              break
+            fi
          done

-      - name: Cleanup
-        if: ${{ always() && steps.create-container.outputs.EID }}
-        run: |
-          docker rm ${{ steps.create-container.outputs.EID }} || true
+          # Extension build failed, print `statuses.json` for debugging and fail the job.
+          jq '.' statuses.json
+
+          echo >&2 "Status of extension build is '${last_status}' != 'success'"
+          exit 1

  deploy:
+    needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
+    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
+
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
-    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
        run: |
@@ -1007,8 +1091,9 @@ jobs:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
@@ -1022,20 +1107,35 @@ jobs:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
          script: |
-            github.rest.git.createRef({
+            await github.rest.git.createRef({
              owner: context.repo.owner,
              repo: context.repo.repo,
              ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
              sha: context.sha,
            })

+      - name: Create GitHub release
+        if: github.ref_name == 'release'
+        uses: actions/github-script@v6
+        with:
+          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          retries: 5
+          script: |
+            await github.rest.repos.createRelease({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              tag_name: "${{ needs.tag.outputs.build-tag }}",
+              generate_release_notes: true,
+            })
+
  promote-compatibility-data:
+    needs: [ check-permissions, promote-images, tag, regress-tests ]
+    if: github.ref_name == 'release'
+
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ promote-images, tag, regress-tests ]
-    if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
    steps:
      - name: Promote compatibility snapshot for the release
        env:
@@ -1043,7 +1143,7 @@ jobs:
          PREFIX: artifacts/latest
        run: |
          # Update compatibility snapshot for the release
-          for pg_version in v14 v15; do
+          for pg_version in v14 v15 v16; do
            for build_type in debug release; do
              OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
              NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -4,7 +4,6 @@ on:
  push:
    branches:
      - main
-      - ci-run/pr-*
  pull_request:

 defaults:
@@ -39,7 +38,7 @@ jobs:
          fetch-depth: 1

      - name: Install macOS postgres dependencies
-        run: brew install flex bison openssl protobuf
+        run: brew install flex bison openssl protobuf icu4c pkg-config

      - name: Set pg 14 revision for caching
        id: pg_v14_rev
@@ -49,6 +48,10 @@ jobs:
        id: pg_v15_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT

+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
      - name: Cache postgres v14 build
        id: cache_pg_14
        uses: actions/cache@v3
@@ -63,6 +66,13 @@ jobs:
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
      - name: Set extra env for macOS
        run: |
          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
@@ -86,6 +96,10 @@ jobs:
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
        run: make postgres-v15 -j$(nproc)

+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: make postgres-v16 -j$(nproc)
+
      - name: Build neon extensions
        run: make neon-pg-ext -j$(nproc)

--- a/.github/workflows/release-notify.yml
+++ b/.github/workflows/release-notify.yml
@@ -0,0 +1,29 @@
+name: Notify Slack channel about upcoming release
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.number }}
+  cancel-in-progress: true
+
+on:
+  pull_request:
+    branches:
+      - release
+    types:
+      # Default types that triggers a workflow:
+      # - https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
+      - opened
+      - synchronize
+      - reopened
+      # Additional types that we want to handle:
+      - closed
+
+jobs:
+  notify:
+    runs-on: [ ubuntu-latest ]
+
+    steps:
+      - uses: neondatabase/dev-actions/release-pr-notify@main
+        with:
+          slack-token: ${{ secrets.SLACK_BOT_TOKEN }}
+          slack-channel-id: ${{ vars.SLACK_UPCOMING_RELEASE_CHANNEL_ID || 'C05QQ9J1BRC' }} # if not set, then `#test-release-notifications`
+          github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,16 +2,19 @@ name: Create Release Branch

 on:
  schedule:
-    - cron: '0 10 * * 2'
+    - cron: '0 7 * * 2'
  workflow_dispatch:

 jobs:
  create_release_branch:
-    runs-on: [ubuntu-latest]
+    runs-on: [ ubuntu-latest ]
+
+    permissions:
+      contents: write # for `git push`

    steps:
    - name: Check out code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
      with:
        ref: main

@@ -26,9 +29,16 @@ jobs:
      run: git push origin releases/${{ steps.date.outputs.date }}

    - name: Create pull request into release
-      uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0
-      with:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        head: releases/${{ steps.date.outputs.date }}
-        base: release
-        title: Release ${{ steps.date.outputs.date }}
+      env:
+        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+      run: |
+        cat << EOF > body.md
+          ## Release ${{ steps.date.outputs.date }}
+
+          **Please merge this PR using 'Create a merge commit'!**
+        EOF
+
+        gh pr create --title "Release ${{ steps.date.outputs.date }}" \
+                     --body-file "body.md" \
+                     --head "releases/${{ steps.date.outputs.date }}" \
+                     --base "release"
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,3 +6,7 @@
 	path = vendor/postgres-v15
 	url = https://github.com/neondatabase/postgres.git
 	branch = REL_15_STABLE_neon
+[submodule "vendor/postgres-v16"]
+	path = vendor/postgres-v16
+	url = https://github.com/neondatabase/postgres.git
+	branch = REL_16_STABLE_neon
--- a/13
+++ b/13
@@ -1,11 +1,12 @@
-/compute_tools/ @neondatabase/control-plane
+/compute_tools/ @neondatabase/control-plane @neondatabase/compute
 /control_plane/ @neondatabase/compute @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute 
-/libs/remote_storage/ @neondatabase/storage 
-/libs/safekeeper_api/ @neondatabase/safekeepers  
-/pageserver/ @neondatabase/compute @neondatabase/storage 
+/libs/postgres_ffi/ @neondatabase/compute
+/libs/remote_storage/ @neondatabase/storage
+/libs/safekeeper_api/ @neondatabase/safekeepers
+/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
+/pageserver/ @neondatabase/compute @neondatabase/storage
 /pgxn/ @neondatabase/compute
-/proxy/ @neondatabase/control-plane 
+/proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -27,3 +27,28 @@ your patch's fault. Help to fix the root cause if something else has
 broken the CI, before pushing.

 *Happy Hacking!*
+
+# How to run a CI pipeline on Pull Requests from external contributors
+_An instruction for maintainers_
+
+## TL;DR:
+- Review the PR
+- If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then:
+    - Press the "Approve and run" button in GitHub UI
+    - Add the `approved-for-ci-run` label to the PR
+
+Repeat all steps after any change to the PR.
+- When the changes are ready to get merged — merge the original PR (not the internal one)
+
+## Longer version:
+
+GitHub Actions triggered by the `pull_request` event don't share repository secrets with the forks (for security reasons).
+So, passing the CI pipeline on Pull Requests from external contributors is impossible.
+
+We're using the following approach to make it work:
+- After the review, assign the `approved-for-ci-run` label to the PR if changes look safe
+- A GitHub Action will create an internal branch and a new PR with the same changes (for example, for a PR `#1234`, it'll be a branch `ci-run/pr-1234`)
+- Because the PR is created from the internal branch, it is able to access repository secrets (that's why it's crucial to make sure that the PR doesn't contain any malicious code that could expose our secrets or intentionally harm the CI)
+- The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
+
+For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,4 +1,5 @@
 [workspace]
+resolver = "2"
 members = [
    "compute_tools",
    "control_plane",
@@ -7,6 +8,7 @@ members = [
    "proxy",
    "safekeeper",
    "storage_broker",
+    "s3_scrubber",
    "workspace_hack",
    "trace",
    "libs/compute_api",
@@ -23,6 +25,7 @@ members = [
    "libs/remote_storage",
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
+    "libs/vm_monitor",
 ]

 [workspace.package]
@@ -36,17 +39,19 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "0.55", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.27"
-aws-smithy-http = "0.55"
-aws-credential-types = "0.55"
-aws-types = "0.55"
+aws-config = { version = "0.56", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "0.29"
+aws-smithy-http = "0.56"
+aws-credential-types = "0.56"
+aws-types = "0.56"
+axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
 bindgen = "0.65"
 bstr = "1.0"
 byteorder = "1.4"
 bytes = "1.0"
+cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
 close_fds = "0.3.2"
@@ -73,7 +78,8 @@ hostname = "0.3.1"
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
-hyper-tungstenite = "0.9"
+hyper-tungstenite = "0.11"
+inotify = "0.10.2"
 itertools = "0.10"
 jsonwebtoken = "8"
 libc = "0.2"
@@ -101,16 +107,19 @@ reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
 rpds = "0.13"
-rustls = "0.20"
+rustc-hash = "1.1.0"
+rustls = "0.21"
 rustls-pemfile = "1"
 rustls-split = "0.3"
 scopeguard = "1.1"
-sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
+sysinfo = "0.29.2"
+sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
+smallvec = "1.11"
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
@@ -119,11 +128,11 @@ sync_wrapper = "0.1.2"
 tar = "0.4"
 test-context = "0.1"
 thiserror = "1.0"
-tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
+tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
 tokio = { version = "1.17", features = ["macros"] }
 tokio-io-timeout = "1.2.0"
-tokio-postgres-rustls = "0.9.0"
-tokio-rustls = "0.23"
+tokio-postgres-rustls = "0.10.0"
+tokio-rustls = "0.24"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
 tokio-util = { version = "0.7", features = ["io"] }
@@ -133,11 +142,11 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.19.0"
-tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
-webpki-roots = "0.23"
+webpki-roots = "0.25"
 x509-parser = "0.15"

 ## TODO replace this with tracing
@@ -169,14 +178,15 @@ storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main br
 tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
+vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }

 ## Build dependencies
 criterion = "0.5.1"
-rcgen = "0.10"
-rstest = "0.17"
+rcgen = "0.11"
+rstest = "0.18"
 tempfile = "3.4"
 tonic-build = "0.9"

--- a/4
+++ b/4
@@ -12,6 +12,7 @@ WORKDIR /home/nonroot

 COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
+COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
 COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
@@ -39,6 +40,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev

 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
+COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
 COPY --chown=nonroot . .

 # Show build caching stats to check if it was used in the end.
@@ -65,6 +67,7 @@ RUN set -e \
    && apt install -y \
        libreadline-dev \
        libseccomp-dev \
+        libicu67 \
        openssl \
        ca-certificates \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
@@ -81,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
+COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
 COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/

 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -74,8 +74,8 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar

 ENV PATH "/usr/local/pgsql/bin:$PATH"

-RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
-    echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
+RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
+    echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
@@ -124,8 +124,21 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
    apt install -y ninja-build python3-dev libncurses5 binutils clang

-RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \
-    echo "1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 plv8.tar.gz" | sha256sum --check && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export PLV8_VERSION=3.1.5 \
+        export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
+        ;; \
+      "v16") \
+        export PLV8_VERSION=3.1.8 \
+        export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
+        ;; \
+      *) \
+        echo "Export the valid PG_VERSION variable" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
+    echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -172,8 +185,8 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz
    cp -R /h3/usr / && \
    rm -rf build

-RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \
-    echo "c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 h3-pg.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
+    echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
    mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -211,8 +224,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
-    echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
+    echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -243,8 +256,8 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214
 FROM build-deps AS hypopg-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \
-    echo "e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e hypopg.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
+    echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
    mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -307,8 +320,8 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta
 FROM build-deps AS ip4r-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O ip4r.tar.gz && \
-    echo "78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 ip4r.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
+    echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
    mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -323,8 +336,8 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O i
 FROM build-deps AS prefix-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \
-    echo "38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e prefix.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
+    echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
    mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -339,8 +352,8 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O pr
 FROM build-deps AS hll-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \
-    echo "9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 hll.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
+    echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
    mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -355,8 +368,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar
 FROM build-deps AS plpgsql-check-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \
-    echo "9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b plpgsql_check.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \
+    echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \
    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -371,12 +384,21 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz
 FROM build-deps AS timescaledb-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

+ARG PG_VERSION
 ENV PATH "/usr/local/pgsql/bin:$PATH"

-RUN apt-get update && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export TIMESCALEDB_VERSION=2.10.1 \
+        export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
+        ;; \
+      *) \
+        echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
+    esac && \
+    apt-get update && \
    apt-get install -y cmake && \
-    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
-    echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
+    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
+    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
@@ -405,6 +427,10 @@ RUN case "${PG_VERSION}" in \
        export PG_HINT_PLAN_VERSION=15_1_5_0 \
        export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \
        ;; \
+      "v16") \
+        export PG_HINT_PLAN_VERSION=16_1_6_0 \
+        export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
+        ;; \
      *) \
        echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
        ;; \
@@ -452,8 +478,8 @@ FROM build-deps AS pg-cron-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O pg_cron.tar.gz && \
-    echo "6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 pg_cron.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
+    echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
    mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -479,8 +505,8 @@ RUN apt-get update && \
        libfreetype6-dev

 ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
-RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
-    echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
+    echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
    mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
    cmake \
        -D RDK_BUILD_CAIRO_SUPPORT=OFF \
@@ -551,12 +577,19 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
-    echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export PG_EMBEDDING_VERSION=0.3.5 \
+        export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
+        ;; \
+      *) \
+        echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \
+    esac && \
+    wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
+    echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control
+    make -j $(getconf _NPROCESSORS_ONLN) install

 #########################################################################################
 #
@@ -584,6 +617,10 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
 # Layer "rust extensions"
 # This layer is used to build `pgx` deps
 #
+# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
+# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
+# dependency on all the rust extension that depend on it, too.
+#
 #########################################################################################
 FROM build-deps AS rust-extensions-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -598,7 +635,17 @@ USER nonroot
 WORKDIR /home/nonroot
 ARG PG_VERSION

-RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \
+        ;; \
+    esac && \
+    curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
    rm rustup-init && \
@@ -615,10 +662,21 @@ USER root
 #########################################################################################

 FROM rust-extensions-build AS pg-jsonschema-pg-build
+ARG PG_VERSION

 # caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
 # there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
-RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
    echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -633,12 +691,23 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421e
 #########################################################################################

 FROM rust-extensions-build AS pg-graphql-pg-build
+ARG PG_VERSION

 # b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
 # Currently pgx version bump to >= 0.7.2  causes "call to unsafe function" compliation errors in
 # pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
 # same 1.1 version we've used before.
-RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
    echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
    mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -656,9 +725,20 @@ RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367
 #########################################################################################

 FROM rust-extensions-build AS pg-tiktoken-pg-build
+ARG PG_VERSION

 # 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
-RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
    echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
    cargo pgx install --release && \
@@ -672,8 +752,19 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405
 #########################################################################################

 FROM rust-extensions-build AS pg-pgx-ulid-build
+ARG PG_VERSION

-RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -726,6 +817,20 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/neon_utils \
        -s install && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
+        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+        -C pgxn/neon_rmgr \
+        -s install && \
+    case "${PG_VERSION}" in \
+        "v14" | "v15") \
+        ;; \
+        "v16") \
+            echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
+        ;; \
+        *) \
+            echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+        esac && \
    make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/hnsw \
@@ -764,29 +869,6 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

-#########################################################################################
-#
-# Extenstion only
-#
-#########################################################################################
-FROM python:3.9-slim-bullseye AS generate-ext-index
-ARG PG_VERSION
-ARG BUILD_TAG
-RUN apt update && apt install -y zstd
-
-# copy the control files here
-COPY --from=kq-imcx-pg-build /extensions/ /extensions/
-COPY --from=pg-anon-pg-build /extensions/ /extensions/
-COPY --from=postgis-build /extensions/ /extensions/
-COPY scripts/combine_control_files.py ./combine_control_files.py
-RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
-
-FROM scratch AS postgres-extensions
-# After the transition this layer will include all extensitons.
-# As for now, it's only a couple for testing purposses
-COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
-COPY --from=generate-ext-index /ext_index.json /ext_index.json
-
 #########################################################################################
 #
 # Final layer
--- a/41
+++ b/41
@@ -29,6 +29,7 @@ else ifeq ($(UNAME_S),Darwin)
 	# It can be configured with OPENSSL_PREFIX variable
 	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
 	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+	PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
 	# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
 	# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
 	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
@@ -83,6 +84,8 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 # I'm not sure why it wouldn't work, but this is the only place (apart from
 # the "build-all-versions" entry points) where direct mention of PostgreSQL
 # versions is used.
+.PHONY: postgres-configure-v16
+postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
 .PHONY: postgres-configure-v15
 postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
 .PHONY: postgres-configure-v14
@@ -118,6 +121,10 @@ postgres-clean-%:
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean

+.PHONY: postgres-check-%
+postgres-check-%: postgres-%
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check
+
 .PHONY: neon-pg-ext-%
 neon-pg-ext-%: postgres-%
 	+@echo "Compiling neon $*"
@@ -130,6 +137,11 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
+	+@echo "Compiling neon_rmgr $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install
 	+@echo "Compiling neon_test_utils $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
@@ -140,11 +152,6 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
-	+@echo "Compiling hnsw $*"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-		-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install

 .PHONY: neon-pg-ext-clean-%
 neon-pg-ext-clean-%:
@@ -160,35 +167,43 @@ neon-pg-ext-clean-%:
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-	-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-	-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean

 .PHONY: neon-pg-ext
 neon-pg-ext: \
 	neon-pg-ext-v14 \
-	neon-pg-ext-v15
+	neon-pg-ext-v15 \
+	neon-pg-ext-v16

 .PHONY: neon-pg-ext-clean
 neon-pg-ext-clean: \
 	neon-pg-ext-clean-v14 \
-	neon-pg-ext-clean-v15
+	neon-pg-ext-clean-v15 \
+	neon-pg-ext-clean-v16

 # shorthand to build all Postgres versions
 .PHONY: postgres
 postgres: \
 	postgres-v14 \
-	postgres-v15
+	postgres-v15 \
+	postgres-v16

 .PHONY: postgres-headers
 postgres-headers: \
 	postgres-headers-v14 \
-	postgres-headers-v15
+	postgres-headers-v15 \
+	postgres-headers-v16

 .PHONY: postgres-clean
 postgres-clean: \
 	postgres-clean-v14 \
-	postgres-clean-v15
+	postgres-clean-v15 \
+	postgres-clean-v16
+
+.PHONY: postgres-check
+postgres-check: \
+	postgres-check-v14 \
+	postgres-check-v15 \
+	postgres-check-v16

 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
--- a/README.md
+++ b/README.md
@@ -29,18 +29,18 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
 libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
-libcurl4-openssl-dev openssl python-poetry
+libcurl4-openssl-dev openssl python-poetry lsof libicu-dev
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel libcurl-devel openssl poetry
+  protobuf-devel libcurl-devel openssl poetry lsof libicu-devel
 ```
 * On Arch based systems, these packages are needed:
 ```bash
 pacman -S base-devel readline zlib libseccomp openssl clang \
-postgresql-libs cmake postgresql protobuf curl
+postgresql-libs cmake postgresql protobuf curl lsof
 ```

 Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
@@ -55,7 +55,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 1. Install XCode and dependencies
 ```
 xcode-select --install
-brew install protobuf openssl flex bison
+brew install protobuf openssl flex bison icu4c pkg-config

 # add openssl to PATH, required for ed25519 keys generation in neon_local
 echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
--- a/clippy.toml
+++ b/clippy.toml
@@ -0,0 +1,5 @@
+disallowed-methods = [
+    "tokio::task::block_in_place",
+    # Allow this for now, to deny it later once we stop using Handle::block_on completely
+    # "tokio::runtime::Handle::block_on",
+]
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
 anyhow.workspace = true
 async-compression.workspace = true
 chrono.workspace = true
+cfg-if.workspace = true
 clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
@@ -23,6 +24,7 @@ tar.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
+tokio-util.workspace = true
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
@@ -34,4 +36,5 @@ utils.workspace = true
 workspace_hack.workspace = true
 toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
+vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.12.4"
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -19,9 +19,10 @@ Also `compute_ctl` spawns two separate service threads:
 - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
  last activity requests.

-If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
-compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
-downscaling and (eventually) will request immediate upscaling under resource pressure.
+If `AUTOSCALING` environment variable is set, `compute_ctl` will start the
+`vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
+`vm-monitor` communicates with the VM autoscaling system. It coordinates
+downscaling and requests immediate upscaling under resource pressure.

 Usage example:
 ```sh
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -20,9 +20,10 @@
 //! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
 //!   last activity requests.
 //!
-//! If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
-//! compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
-//! downscaling and (eventually) will request immediate upscaling under resource pressure.
+//! If `AUTOSCALING` environment variable is set, `compute_ctl` will start the
+//! `vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
+//! `vm-monitor` communicates with the VM autoscaling system. It coordinates
+//! downscaling and requests immediate upscaling under resource pressure.
 //!
 //! Usage example:
 //! ```sh
@@ -35,7 +36,6 @@
 //!
 use std::collections::HashMap;
 use std::fs::File;
-use std::panic;
 use std::path::Path;
 use std::process::exit;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
@@ -271,6 +271,57 @@ fn main() -> Result<()> {
        }
    };

+    // Start the vm-monitor if directed to. The vm-monitor only runs on linux
+    // because it requires cgroups.
+    cfg_if::cfg_if! {
+        if #[cfg(target_os = "linux")] {
+            use std::env;
+            use tokio_util::sync::CancellationToken;
+            use tracing::warn;
+            let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
+            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
+            let cgroup = matches.get_one::<String>("cgroup");
+            let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
+
+            // Only make a runtime if we need to.
+            // Note: it seems like you can make a runtime in an inner scope and
+            // if you start a task in it it won't be dropped. However, make it
+            // in the outermost scope just to be safe.
+            let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
+                (None, None) => None,
+                (None, Some(_)) => {
+                    warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
+                    None
+                }
+                (Some(_), None) => {
+                    panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
+                }
+                (Some(_), Some(_)) => Some(
+                    tokio::runtime::Builder::new_multi_thread()
+                        .worker_threads(4)
+                        .enable_all()
+                        .build()
+                        .expect("failed to create tokio runtime for monitor"),
+                ),
+            };
+
+            // This token is used internally by the monitor to clean up all threads
+            let token = CancellationToken::new();
+
+            let vm_monitor = &rt.as_ref().map(|rt| {
+                rt.spawn(vm_monitor::start(
+                    Box::leak(Box::new(vm_monitor::Args {
+                        cgroup: cgroup.cloned(),
+                        pgconnstr: file_cache_connstr.cloned(),
+                        addr: vm_monitor_addr.cloned().unwrap(),
+                        file_cache_on_disk,
+                    })),
+                    token.clone(),
+                ))
+            });
+        }
+    }
+
    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
    if let Some(mut pg) = pg {
@@ -284,6 +335,24 @@ fn main() -> Result<()> {
        exit_code = ecode.code()
    }

+    // Terminate the vm_monitor so it releases the file watcher on
+    // /sys/fs/cgroup/neon-postgres.
+    // Note: the vm-monitor only runs on linux because it requires cgroups.
+    cfg_if::cfg_if! {
+        if #[cfg(target_os = "linux")] {
+            if let Some(handle) = vm_monitor {
+                // Kills all threads spawned by the monitor
+                token.cancel();
+                // Kills the actual task running the monitor
+                handle.abort();
+
+                // If handle is some, rt must have been used to produce it, and
+                // hence is also some
+                rt.unwrap().shutdown_timeout(Duration::from_secs(2));
+            }
+        }
+    }
+
    // Maybe sync safekeepers again, to speed up next startup
    let compute_state = compute.state.lock().unwrap().clone();
    let pspec = compute_state.pspec.as_ref().expect("spec must be set");
@@ -393,6 +462,34 @@ fn cli() -> clap::Command {
                .long("remote-ext-config")
                .value_name("REMOTE_EXT_CONFIG"),
        )
+        // TODO(fprasx): we currently have default arguments because the cloud PR
+        // to pass them in hasn't been merged yet. We should get rid of them once
+        // the PR is merged.
+        .arg(
+            Arg::new("vm-monitor-addr")
+                .long("vm-monitor-addr")
+                .default_value("0.0.0.0:10301")
+                .value_name("VM_MONITOR_ADDR"),
+        )
+        .arg(
+            Arg::new("cgroup")
+                .long("cgroup")
+                .default_value("neon-postgres")
+                .value_name("CGROUP"),
+        )
+        .arg(
+            Arg::new("filecache-connstr")
+                .long("filecache-connstr")
+                .default_value(
+                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
+                )
+                .value_name("FILECACHE_CONNSTR"),
+        )
+        .arg(
+            Arg::new("file-cache-on-disk")
+                .long("file-cache-on-disk")
+                .action(clap::ArgAction::SetTrue),
+        )
 }

 #[test]
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -1,12 +1,39 @@
-use anyhow::{anyhow, Result};
+use anyhow::{anyhow, Ok, Result};
+use postgres::Client;
 use tokio_postgres::NoTls;
-use tracing::{error, instrument};
+use tracing::{error, instrument, warn};

 use crate::compute::ComputeNode;

+/// Create a special service table for availability checks
+/// only if it does not exist already.
+pub fn create_availability_check_data(client: &mut Client) -> Result<()> {
+    let query = "
+        DO $$
+        BEGIN
+            IF NOT EXISTS(
+                SELECT 1
+                FROM pg_catalog.pg_tables
+                WHERE tablename = 'health_check'
+            )
+            THEN
+            CREATE TABLE health_check (
+                id serial primary key,
+                updated_at timestamptz default now()
+            );
+            INSERT INTO health_check VALUES (1, now())
+                ON CONFLICT (id) DO UPDATE
+                 SET updated_at = now();
+            END IF;
+        END
+        $$;";
+    client.execute(query, &[])?;
+
+    Ok(())
+}
+
 /// Update timestamp in a row in a special service table to check
 /// that we can actually write some data in this particular timeline.
-/// Create table if it's missing.
 #[instrument(skip_all)]
 pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
    // Connect to the database.
@@ -24,21 +51,28 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
    });

    let query = "
-    CREATE TABLE IF NOT EXISTS health_check (
-        id serial primary key,
-        updated_at timestamptz default now()
-    );
    INSERT INTO health_check VALUES (1, now())
        ON CONFLICT (id) DO UPDATE
         SET updated_at = now();";

-    let result = client.simple_query(query).await?;
-
-    if result.len() != 2 {
-        return Err(anyhow::format_err!(
-            "expected 2 query results, but got {}",
-            result.len()
-        ));
+    match client.simple_query(query).await {
+        Result::Ok(result) => {
+            if result.len() != 1 {
+                return Err(anyhow::anyhow!(
+                    "expected 1 query results, but got {}",
+                    result.len()
+                ));
+            }
+        }
+        Err(err) => {
+            if let Some(state) = err.code() {
+                if state == &tokio_postgres::error::SqlState::DISK_FULL {
+                    warn!("Tenant disk is full");
+                    return Ok(());
+                }
+            }
+            return Err(err.into());
+        }
    }

    Ok(())
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::env;
 use std::fs;
 use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
@@ -26,6 +27,7 @@ use utils::measured_stream::MeasuredReader;

 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};

+use crate::checker::create_availability_check_data;
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -175,6 +177,27 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
    }
 }

+/// If we are a VM, returns a [`Command`] that will run in the `neon-postgres`
+/// cgroup. Otherwise returns the default `Command::new(cmd)`
+///
+/// This function should be used to start postgres, as it will start it in the
+/// neon-postgres cgroup if we are a VM. This allows autoscaling to control
+/// postgres' resource usage. The cgroup will exist in VMs because vm-builder
+/// creates it during the sysinit phase of its inittab.
+fn maybe_cgexec(cmd: &str) -> Command {
+    // The cplane sets this env var for autoscaling computes.
+    // use `var_os` so we don't have to worry about the variable being valid
+    // unicode. Should never be an concern . . . but just in case
+    if env::var_os("AUTOSCALING").is_some() {
+        let mut command = Command::new("cgexec");
+        command.args(["-g", "memory:neon-postgres"]);
+        command.arg(cmd);
+        command
+    } else {
+        Command::new(cmd)
+    }
+}
+
 /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
 /// that we give to customers
 fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
@@ -451,7 +474,7 @@ impl ComputeNode {
    pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
        let start_time = Utc::now();

-        let sync_handle = Command::new(&self.pgbin)
+        let sync_handle = maybe_cgexec(&self.pgbin)
            .args(["--sync-safekeepers"])
            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
@@ -586,7 +609,7 @@ impl ComputeNode {

        // Start postgres
        info!("starting postgres");
-        let mut pg = Command::new(&self.pgbin)
+        let mut pg = maybe_cgexec(&self.pgbin)
            .args(["-D", pgdata])
            .spawn()
            .expect("cannot start postgres process");
@@ -614,7 +637,7 @@ impl ComputeNode {
        let pgdata_path = Path::new(&self.pgdata);

        // Run postgres as a child process.
-        let mut pg = Command::new(&self.pgbin)
+        let mut pg = maybe_cgexec(&self.pgbin)
            .args(["-D", &self.pgdata])
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
                vec![("NEON_AUTH_TOKEN", storage_auth_token)]
@@ -674,6 +697,7 @@ impl ComputeNode {
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
        handle_grants(spec, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
+        create_availability_check_data(&mut client)?;

        // 'Close' connection
        drop(client);
@@ -1056,7 +1080,8 @@ LIMIT 100",

        let mut download_tasks = Vec::new();
        for library in &libs_vec {
-            let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
+            let (ext_name, ext_path) =
+                remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?;
            download_tasks.push(self.download_extension(ext_name, ext_path));
        }
        let results = join_all(download_tasks).await;
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -46,8 +46,6 @@ pub fn write_postgres_conf(
        writeln!(file, "{}", conf)?;
    }

-    write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;
-
    // Add options for connecting to storage
    writeln!(file, "# Neon storage settings")?;
    if let Some(s) = &spec.pageserver_connstring {
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -74,6 +74,7 @@ More specifically, here is an example ext_index.json
 use anyhow::Context;
 use anyhow::{self, Result};
 use compute_api::spec::RemoteExtSpec;
+use regex::Regex;
 use remote_storage::*;
 use serde_json;
 use std::io::Read;
@@ -106,16 +107,71 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String {

 pub fn get_pg_version(pgbin: &str) -> String {
    // pg_config --version returns a (platform specific) human readable string
-    // such as "PostgreSQL 15.4". We parse this to v14/v15
+    // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
    let human_version = get_pg_config("--version", pgbin);
-    if human_version.contains("15") {
-        return "v15".to_string();
-    } else if human_version.contains("14") {
-        return "v14".to_string();
+    return parse_pg_version(&human_version).to_string();
+}
+
+fn parse_pg_version(human_version: &str) -> &str {
+    // Normal releases have version strings like "PostgreSQL 15.4". But there
+    // are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL
+    // 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version
+    // configure option, you can tack any string to the version number,
+    // e.g. "PostgreSQL 15.4foobar".
+    match Regex::new(r"^PostgreSQL (?<major>\d+).+")
+        .unwrap()
+        .captures(human_version)
+    {
+        Some(captures) if captures.len() == 2 => match &captures["major"] {
+            "14" => return "v14",
+            "15" => return "v15",
+            "16" => return "v16",
+            _ => {}
+        },
+        _ => {}
    }
    panic!("Unsuported postgres version {human_version}");
 }

+#[cfg(test)]
+mod tests {
+    use super::parse_pg_version;
+
+    #[test]
+    fn test_parse_pg_version() {
+        assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
+        assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
+            "v15"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
+        assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
+            "v14"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_unsupported_version() {
+        parse_pg_version("PostgreSQL 13.14");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_incorrect_version_format() {
+        parse_pg_version("PostgreSQL 14");
+    }
+}
+
 // download the archive for a given extension,
 // unzip it, and place files in the appropriate locations (share/lib)
 pub async fn download_extension(
@@ -180,7 +236,19 @@ pub async fn download_extension(
 // Create extension control files from spec
 pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
-    for ext_data in remote_extensions.extension_data.values() {
+    for (ext_name, ext_data) in remote_extensions.extension_data.iter() {
+        // Check if extension is present in public or custom.
+        // If not, then it is not allowed to be used by this compute.
+        if let Some(public_extensions) = &remote_extensions.public_extensions {
+            if !public_extensions.contains(ext_name) {
+                if let Some(custom_extensions) = &remote_extensions.custom_extensions {
+                    if !custom_extensions.contains(ext_name) {
+                        continue; // skip this extension, it is not allowed
+                    }
+                }
+            }
+        }
+
        for (control_name, control_content) in &ext_data.control_data {
            let control_path = local_sharedir.join(control_name);
            if !control_path.exists() {
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -1,4 +1,6 @@
 use std::convert::Infallible;
+use std::net::IpAddr;
+use std::net::Ipv6Addr;
 use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;
@@ -169,7 +171,12 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
                    }
                };

-                remote_extensions.get_ext(&filename, is_library)
+                remote_extensions.get_ext(
+                    &filename,
+                    is_library,
+                    &compute.build_tag,
+                    &compute.pgversion,
+                )
            };

            match ext {
@@ -293,7 +300,9 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
 async fn serve(port: u16, state: Arc<ComputeNode>) {
-    let addr = SocketAddr::from(([0, 0, 0, 0], port));
+    // this usually binds to both IPv4 and IPv6 on linux
+    // see e.g. https://github.com/rust-lang/rust/pull/34440
+    let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);

    let make_service = make_service_fn(move |_conn| {
        let state = state.clone();
--- a/compute_tools/src/params.rs
+++ b/compute_tools/src/params.rs
@@ -6,4 +6,4 @@ pub const DEFAULT_LOG_LEVEL: &str = "info";
 //   https://www.postgresql.org/docs/15/auth-password.html
 //
 // So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
-pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";
+pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\tall\t\tmd5";
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -12,6 +12,8 @@ git-version.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
+hex.workspace = true
+hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
 serde.workspace = true
@@ -20,6 +22,7 @@ serde_with.workspace = true
 tar.workspace = true
 thiserror.workspace = true
 toml.workspace = true
+tokio.workspace = true
 url.workspace = true
 # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
 # instead, so that recompile times are better.
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -1,6 +1,7 @@
 # Minimal neon environment with one safekeeper. This is equivalent to the built-in
 # defaults that you get with no --config
-[pageserver]
+[[pageservers]]
+id=1
 listen_pg_addr = '127.0.0.1:64000'
 listen_http_addr = '127.0.0.1:9898'
 pg_auth_type = 'Trust'
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -0,0 +1,105 @@
+use crate::{background_process, local_env::LocalEnv};
+use anyhow::anyhow;
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+use std::{path::PathBuf, process::Child};
+use utils::id::{NodeId, TenantId};
+
+pub struct AttachmentService {
+    env: LocalEnv,
+    listen: String,
+    path: PathBuf,
+}
+
+const COMMAND: &str = "attachment_service";
+
+#[serde_as]
+#[derive(Serialize, Deserialize)]
+pub struct AttachHookRequest {
+    #[serde_as(as = "DisplayFromStr")]
+    pub tenant_id: TenantId,
+    pub pageserver_id: Option<NodeId>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct AttachHookResponse {
+    pub gen: Option<u32>,
+}
+
+impl AttachmentService {
+    pub fn from_env(env: &LocalEnv) -> Self {
+        let path = env.base_data_dir.join("attachments.json");
+
+        // Makes no sense to construct this if pageservers aren't going to use it: assume
+        // pageservers have control plane API set
+        let listen_url = env.control_plane_api.clone().unwrap();
+
+        let listen = format!(
+            "{}:{}",
+            listen_url.host_str().unwrap(),
+            listen_url.port().unwrap()
+        );
+
+        Self {
+            env: env.clone(),
+            path,
+            listen,
+        }
+    }
+
+    fn pid_file(&self) -> PathBuf {
+        self.env.base_data_dir.join("attachment_service.pid")
+    }
+
+    pub fn start(&self) -> anyhow::Result<Child> {
+        let path_str = self.path.to_string_lossy();
+
+        background_process::start_process(
+            COMMAND,
+            &self.env.base_data_dir,
+            &self.env.attachment_service_bin(),
+            ["-l", &self.listen, "-p", &path_str],
+            [],
+            background_process::InitialPidFile::Create(&self.pid_file()),
+            // TODO: a real status check
+            || Ok(true),
+        )
+    }
+
+    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        background_process::stop_process(immediate, COMMAND, &self.pid_file())
+    }
+
+    /// Call into the attach_hook API, for use before handing out attachments to pageservers
+    pub fn attach_hook(
+        &self,
+        tenant_id: TenantId,
+        pageserver_id: NodeId,
+    ) -> anyhow::Result<Option<u32>> {
+        use hyper::StatusCode;
+
+        let url = self
+            .env
+            .control_plane_api
+            .clone()
+            .unwrap()
+            .join("attach_hook")
+            .unwrap();
+        let client = reqwest::blocking::ClientBuilder::new()
+            .build()
+            .expect("Failed to construct http client");
+
+        let request = AttachHookRequest {
+            tenant_id,
+            pageserver_id: Some(pageserver_id),
+        };
+
+        let response = client.post(url).json(&request).send()?;
+        if response.status() != StatusCode::OK {
+            return Err(anyhow!("Unexpected status {}", response.status()));
+        }
+
+        let response = response.json::<AttachHookResponse>()?;
+        Ok(response.gen)
+    }
+}
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -0,0 +1,274 @@
+/// The attachment service mimics the aspects of the control plane API
+/// that are required for a pageserver to operate.
+///
+/// This enables running & testing pageservers without a full-blown
+/// deployment of the Neon cloud platform.
+///
+use anyhow::anyhow;
+use clap::Parser;
+use hex::FromHex;
+use hyper::StatusCode;
+use hyper::{Body, Request, Response};
+use serde::{Deserialize, Serialize};
+use std::path::{Path, PathBuf};
+use std::{collections::HashMap, sync::Arc};
+use utils::logging::{self, LogFormat};
+
+use utils::{
+    http::{
+        endpoint::{self},
+        error::ApiError,
+        json::{json_request, json_response},
+        RequestExt, RouterBuilder,
+    },
+    id::{NodeId, TenantId},
+    tcp_listener,
+};
+
+use pageserver_api::control_api::{
+    ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
+    ValidateResponseTenant,
+};
+
+use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse};
+
+#[derive(Parser)]
+#[command(author, version, about, long_about = None)]
+#[command(arg_required_else_help(true))]
+struct Cli {
+    /// Host and port to listen on, like `127.0.0.1:1234`
+    #[arg(short, long)]
+    listen: std::net::SocketAddr,
+
+    /// Path to the .json file to store state (will be created if it doesn't exist)
+    #[arg(short, long)]
+    path: PathBuf,
+}
+
+// The persistent state of each Tenant
+#[derive(Serialize, Deserialize, Clone)]
+struct TenantState {
+    // Currently attached pageserver
+    pageserver: Option<NodeId>,
+
+    // Latest generation number: next time we attach, increment this
+    // and use the incremented number when attaching
+    generation: u32,
+}
+
+fn to_hex_map<S, V>(input: &HashMap<TenantId, V>, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: serde::Serializer,
+    V: Clone + Serialize,
+{
+    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
+
+    transformed
+        .collect::<HashMap<String, V>>()
+        .serialize(serializer)
+}
+
+fn from_hex_map<'de, D, V>(deserializer: D) -> Result<HashMap<TenantId, V>, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+    V: Deserialize<'de>,
+{
+    let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
+    hex_map
+        .into_iter()
+        .map(|(k, v)| {
+            TenantId::from_hex(k)
+                .map(|k| (k, v))
+                .map_err(serde::de::Error::custom)
+        })
+        .collect()
+}
+
+// Top level state available to all HTTP handlers
+#[derive(Serialize, Deserialize)]
+struct PersistentState {
+    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
+    tenants: HashMap<TenantId, TenantState>,
+
+    #[serde(skip)]
+    path: PathBuf,
+}
+
+impl PersistentState {
+    async fn save(&self) -> anyhow::Result<()> {
+        let bytes = serde_json::to_vec(self)?;
+        tokio::fs::write(&self.path, &bytes).await?;
+
+        Ok(())
+    }
+
+    async fn load(path: &Path) -> anyhow::Result<Self> {
+        let bytes = tokio::fs::read(path).await?;
+        let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
+        decoded.path = path.to_owned();
+        Ok(decoded)
+    }
+
+    async fn load_or_new(path: &Path) -> Self {
+        match Self::load(path).await {
+            Ok(s) => {
+                tracing::info!("Loaded state file at {}", path.display());
+                s
+            }
+            Err(e)
+                if e.downcast_ref::<std::io::Error>()
+                    .map(|e| e.kind() == std::io::ErrorKind::NotFound)
+                    .unwrap_or(false) =>
+            {
+                tracing::info!("Will create state file at {}", path.display());
+                Self {
+                    tenants: HashMap::new(),
+                    path: path.to_owned(),
+                }
+            }
+            Err(e) => {
+                panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display())
+            }
+        }
+    }
+}
+
+/// State available to HTTP request handlers
+#[derive(Clone)]
+struct State {
+    inner: Arc<tokio::sync::RwLock<PersistentState>>,
+}
+
+impl State {
+    fn new(persistent_state: PersistentState) -> State {
+        Self {
+            inner: Arc::new(tokio::sync::RwLock::new(persistent_state)),
+        }
+    }
+}
+
+#[inline(always)]
+fn get_state(request: &Request<Body>) -> &State {
+    request
+        .data::<Arc<State>>()
+        .expect("unknown state type")
+        .as_ref()
+}
+
+/// Pageserver calls into this on startup, to learn which tenants it should attach
+async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
+
+    let state = get_state(&req).inner.clone();
+    let mut locked = state.write().await;
+
+    let mut response = ReAttachResponse {
+        tenants: Vec::new(),
+    };
+    for (t, state) in &mut locked.tenants {
+        if state.pageserver == Some(reattach_req.node_id) {
+            state.generation += 1;
+            response.tenants.push(ReAttachResponseTenant {
+                id: *t,
+                generation: state.generation,
+            });
+        }
+    }
+
+    locked.save().await.map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, response)
+}
+
+/// Pageserver calls into this before doing deletions, to confirm that it still
+/// holds the latest generation for the tenants with deletions enqueued
+async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let validate_req = json_request::<ValidateRequest>(&mut req).await?;
+
+    let locked = get_state(&req).inner.read().await;
+
+    let mut response = ValidateResponse {
+        tenants: Vec::new(),
+    };
+
+    for req_tenant in validate_req.tenants {
+        if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
+            let valid = tenant_state.generation == req_tenant.gen;
+            response.tenants.push(ValidateResponseTenant {
+                id: req_tenant.id,
+                valid,
+            });
+        }
+    }
+
+    json_response(StatusCode::OK, response)
+}
+/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
+/// (in the real control plane this is unnecessary, because the same program is managing
+///  generation numbers and doing attachments).
+async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
+
+    let state = get_state(&req).inner.clone();
+    let mut locked = state.write().await;
+
+    let tenant_state = locked
+        .tenants
+        .entry(attach_req.tenant_id)
+        .or_insert_with(|| TenantState {
+            pageserver: attach_req.pageserver_id,
+            generation: 0,
+        });
+
+    if attach_req.pageserver_id.is_some() {
+        tenant_state.generation += 1;
+    }
+    tenant_state.pageserver = attach_req.pageserver_id;
+    let generation = tenant_state.generation;
+
+    locked.save().await.map_err(ApiError::InternalServerError)?;
+
+    json_response(
+        StatusCode::OK,
+        AttachHookResponse {
+            gen: attach_req.pageserver_id.map(|_| generation),
+        },
+    )
+}
+
+fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
+    endpoint::make_router()
+        .data(Arc::new(State::new(persistent_state)))
+        .post("/re-attach", handle_re_attach)
+        .post("/validate", handle_validate)
+        .post("/attach_hook", handle_attach_hook)
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    logging::init(
+        LogFormat::Plain,
+        logging::TracingErrorLayerEnablement::Disabled,
+    )?;
+
+    let args = Cli::parse();
+    tracing::info!(
+        "Starting, state at {}, listening on {}",
+        args.path.to_string_lossy(),
+        args.listen
+    );
+
+    let persistent_state = PersistentState::load_or_new(&args.path).await;
+
+    let http_listener = tcp_listener::bind(args.listen)?;
+    let router = make_router(persistent_state)
+        .build()
+        .map_err(|err| anyhow!(err))?;
+    let service = utils::http::RouterService::new(router).unwrap();
+    let server = hyper::Server::from_tcp(http_listener)?.serve(service);
+
+    tracing::info!("Serving on {0}", args.listen);
+    server.await?;
+
+    Ok(())
+}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,6 +8,7 @@
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use compute_api::spec::ComputeMode;
+use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::LocalEnv;
 use control_plane::pageserver::PageServerNode;
@@ -43,14 +44,18 @@ project_git_version!(GIT_VERSION);

 const DEFAULT_PG_VERSION: &str = "15";

+const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
+
 fn default_conf() -> String {
    format!(
        r#"
 # Default built-in configuration, defined in main.rs
+control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
+
 [broker]
 listen_addr = '{DEFAULT_BROKER_ADDR}'

-[pageserver]
+[[pageservers]]
 id = {DEFAULT_PAGESERVER_ID}
 listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
 listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
@@ -61,6 +66,7 @@ http_auth_type = '{trust_auth}'
 id = {DEFAULT_SAFEKEEPER_ID}
 pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
 http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
+
 "#,
        trust_auth = AuthType::Trust,
    )
@@ -107,6 +113,7 @@ fn main() -> Result<()> {
            "start" => handle_start_all(sub_args, &env),
            "stop" => handle_stop_all(sub_args, &env),
            "pageserver" => handle_pageserver(sub_args, &env),
+            "attachment_service" => handle_attachment_service(sub_args, &env),
            "safekeeper" => handle_safekeeper(sub_args, &env),
            "endpoint" => handle_endpoint(sub_args, &env),
            "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
@@ -252,7 +259,7 @@ fn get_timeline_infos(
    env: &local_env::LocalEnv,
    tenant_id: &TenantId,
 ) -> Result<HashMap<TimelineId, TimelineInfo>> {
-    Ok(PageServerNode::from_env(env)
+    Ok(get_default_pageserver(env)
        .timeline_list(tenant_id)?
        .into_iter()
        .map(|timeline_info| (timeline_info.timeline_id, timeline_info))
@@ -313,17 +320,30 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
        .context("Failed to initialize neon repository")?;

    // Initialize pageserver, create initial tenant and timeline.
-    let pageserver = PageServerNode::from_env(&env);
-    pageserver
-        .initialize(&pageserver_config_overrides(init_match))
-        .unwrap_or_else(|e| {
-            eprintln!("pageserver init failed: {e:?}");
-            exit(1);
-        });
+    for ps_conf in &env.pageservers {
+        PageServerNode::from_env(&env, ps_conf)
+            .initialize(&pageserver_config_overrides(init_match))
+            .unwrap_or_else(|e| {
+                eprintln!("pageserver init failed: {e:?}");
+                exit(1);
+            });
+    }

    Ok(env)
 }

+/// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
+/// For typical interactive use, one would just run with a single pageserver.  Scenarios with
+/// tenant/timeline placement across multiple pageservers are managed by python test code rather
+/// than this CLI.
+fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
+    let ps_conf = env
+        .pageservers
+        .first()
+        .expect("Config is validated to contain at least one pageserver");
+    PageServerNode::from_env(env, ps_conf)
+}
+
 fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
    init_match
        .get_many::<String>("pageserver-config-override")
@@ -334,7 +354,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
 }

 fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
-    let pageserver = PageServerNode::from_env(env);
+    let pageserver = get_default_pageserver(env);
    match tenant_match.subcommand() {
        Some(("list", _)) => {
            for t in pageserver.tenant_list()? {
@@ -342,13 +362,25 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
            }
        }
        Some(("create", create_match)) => {
-            let initial_tenant_id = parse_tenant_id(create_match)?;
            let tenant_conf: HashMap<_, _> = create_match
                .get_many::<String>("config")
                .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
                .unwrap_or_default();
-            let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?;
-            println!("tenant {new_tenant_id} successfully created on the pageserver");
+
+            // If tenant ID was not specified, generate one
+            let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
+
+            let generation = if env.control_plane_api.is_some() {
+                // We must register the tenant with the attachment service, so
+                // that when the pageserver restarts, it will be re-attached.
+                let attachment_service = AttachmentService::from_env(env);
+                attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
+            } else {
+                None
+            };
+
+            pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
+            println!("tenant {tenant_id} successfully created on the pageserver");

            // Create an initial timeline for the new tenant
            let new_timeline_id = parse_timeline_id(create_match)?;
@@ -358,7 +390,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .context("Failed to parse postgres version from the argument string")?;

            let timeline_info = pageserver.timeline_create(
-                new_tenant_id,
+                tenant_id,
                new_timeline_id,
                None,
                None,
@@ -369,17 +401,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an

            env.register_branch_mapping(
                DEFAULT_BRANCH_NAME.to_string(),
-                new_tenant_id,
+                tenant_id,
                new_timeline_id,
            )?;

            println!(
-                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
+                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
            );

            if create_match.get_flag("set-default") {
-                println!("Setting tenant {new_tenant_id} as a default one");
-                env.default_tenant_id = Some(new_tenant_id);
+                println!("Setting tenant {tenant_id} as a default one");
+                env.default_tenant_id = Some(tenant_id);
            }
        }
        Some(("set-default", set_default_match)) => {
@@ -407,7 +439,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
 }

 fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
-    let pageserver = PageServerNode::from_env(env);
+    let pageserver = get_default_pageserver(env);

    match timeline_match.subcommand() {
        Some(("list", list_match)) => {
@@ -484,6 +516,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                None,
                pg_version,
                ComputeMode::Primary,
+                DEFAULT_PAGESERVER_ID,
            )?;
            println!("Done");
        }
@@ -537,7 +570,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
        Some(ep_subcommand_data) => ep_subcommand_data,
        None => bail!("no endpoint subcommand provided"),
    };
-
    let mut cplane = ComputeControlPlane::load(env.clone())?;

    // All subcommands take an optional --tenant-id option
@@ -634,6 +666,13 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .copied()
                .unwrap_or(false);

+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    NodeId(id_str.parse().context("while parsing pageserver id")?)
+                } else {
+                    DEFAULT_PAGESERVER_ID
+                };
+
            let mode = match (lsn, hot_standby) {
                (Some(lsn), false) => ComputeMode::Static(lsn),
                (None, true) => ComputeMode::Replica,
@@ -649,6 +688,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                http_port,
                pg_version,
                mode,
+                pageserver_id,
            )?;
        }
        "start" => {
@@ -658,6 +698,13 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    NodeId(id_str.parse().context("while parsing pageserver id")?)
+                } else {
+                    DEFAULT_PAGESERVER_ID
+                };
+
            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");

            // If --safekeepers argument is given, use only the listed safekeeper nodes.
@@ -677,7 +724,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(

            let endpoint = cplane.endpoints.get(endpoint_id.as_str());

-            let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
+            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
+            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
                let claims = Claims::new(Some(tenant_id), Scope::Tenant);

                Some(env.generate_auth_token(&claims)?)
@@ -744,6 +792,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    http_port,
                    pg_version,
                    mode,
+                    pageserver_id,
                )?;
                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
@@ -768,51 +817,94 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
 }

 fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    let pageserver = PageServerNode::from_env(env);
+    fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
+        let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
+            NodeId(id_str.parse().context("while parsing pageserver id")?)
+        } else {
+            DEFAULT_PAGESERVER_ID
+        };
+
+        Ok(PageServerNode::from_env(
+            env,
+            env.get_pageserver_conf(node_id)?,
+        ))
+    }

    match sub_match.subcommand() {
-        Some(("start", start_match)) => {
-            if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) {
+        Some(("start", subcommand_args)) => {
+            if let Err(e) = get_pageserver(env, subcommand_args)?
+                .start(&pageserver_config_overrides(subcommand_args))
+            {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
        }

+        Some(("stop", subcommand_args)) => {
+            let immediate = subcommand_args
+                .get_one::<String>("stop-mode")
+                .map(|s| s.as_str())
+                == Some("immediate");
+
+            if let Err(e) = get_pageserver(env, subcommand_args)?.stop(immediate) {
+                eprintln!("pageserver stop failed: {}", e);
+                exit(1);
+            }
+        }
+
+        Some(("restart", subcommand_args)) => {
+            let pageserver = get_pageserver(env, subcommand_args)?;
+            //TODO what shutdown strategy should we use here?
+            if let Err(e) = pageserver.stop(false) {
+                eprintln!("pageserver stop failed: {}", e);
+                exit(1);
+            }
+
+            if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
+                eprintln!("pageserver start failed: {e}");
+                exit(1);
+            }
+        }
+
+        Some(("status", subcommand_args)) => {
+            match get_pageserver(env, subcommand_args)?.check_status() {
+                Ok(_) => println!("Page server is up and running"),
+                Err(err) => {
+                    eprintln!("Page server is not available: {}", err);
+                    exit(1);
+                }
+            }
+        }
+
+        Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
+        None => bail!("no pageserver subcommand provided"),
+    }
+    Ok(())
+}
+
+fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+    let svc = AttachmentService::from_env(env);
+    match sub_match.subcommand() {
+        Some(("start", _start_match)) => {
+            if let Err(e) = svc.start() {
+                eprintln!("start failed: {e}");
+                exit(1);
+            }
+        }
+
        Some(("stop", stop_match)) => {
            let immediate = stop_match
                .get_one::<String>("stop-mode")
                .map(|s| s.as_str())
                == Some("immediate");

-            if let Err(e) = pageserver.stop(immediate) {
-                eprintln!("pageserver stop failed: {}", e);
+            if let Err(e) = svc.stop(immediate) {
+                eprintln!("stop failed: {}", e);
                exit(1);
            }
        }
-
-        Some(("restart", restart_match)) => {
-            //TODO what shutdown strategy should we use here?
-            if let Err(e) = pageserver.stop(false) {
-                eprintln!("pageserver stop failed: {}", e);
-                exit(1);
-            }
-
-            if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) {
-                eprintln!("pageserver start failed: {e}");
-                exit(1);
-            }
-        }
-
-        Some(("status", _)) => match PageServerNode::from_env(env).check_status() {
-            Ok(_) => println!("Page server is up and running"),
-            Err(err) => {
-                eprintln!("Page server is not available: {}", err);
-                exit(1);
-            }
-        },
-
-        Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
-        None => bail!("no pageserver subcommand provided"),
+        Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name),
+        None => bail!("no attachment_service subcommand provided"),
    }
    Ok(())
 }
@@ -897,11 +989,23 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow

    broker::start_broker_process(env)?;

-    let pageserver = PageServerNode::from_env(env);
-    if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
-        eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e);
-        try_stop_all(env, true);
-        exit(1);
+    // Only start the attachment service if the pageserver is configured to need it
+    if env.control_plane_api.is_some() {
+        let attachment_service = AttachmentService::from_env(env);
+        if let Err(e) = attachment_service.start() {
+            eprintln!("attachment_service start failed: {:#}", e);
+            try_stop_all(env, true);
+            exit(1);
+        }
+    }
+
+    for ps_conf in &env.pageservers {
+        let pageserver = PageServerNode::from_env(env, ps_conf);
+        if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
+            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
+            try_stop_all(env, true);
+            exit(1);
+        }
    }

    for node in env.safekeepers.iter() {
@@ -925,8 +1029,6 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
 }

 fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
-    let pageserver = PageServerNode::from_env(env);
-
    // Stop all endpoints
    match ComputeControlPlane::load(env.clone()) {
        Ok(cplane) => {
@@ -941,8 +1043,11 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
        }
    }

-    if let Err(e) = pageserver.stop(immediate) {
-        eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e);
+    for ps_conf in &env.pageservers {
+        let pageserver = PageServerNode::from_env(env, ps_conf);
+        if let Err(e) = pageserver.stop(immediate) {
+            eprintln!("pageserver {} stop failed: {:#}", ps_conf.id, e);
+        }
    }

    for node in env.safekeepers.iter() {
@@ -955,6 +1060,13 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
    if let Err(e) = broker::stop_broker_process(env) {
        eprintln!("neon broker stop failed: {e:#}");
    }
+
+    if env.control_plane_api.is_some() {
+        let attachment_service = AttachmentService::from_env(env);
+        if let Err(e) = attachment_service.stop(immediate) {
+            eprintln!("attachment service stop failed: {e:#}");
+        }
+    }
 }

 fn cli() -> Command {
@@ -969,6 +1081,16 @@ fn cli() -> Command {

    let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);

+    // --id, when using a pageserver command
+    let pageserver_id_arg = Arg::new("pageserver-id")
+        .long("id")
+        .help("pageserver id")
+        .required(false);
+    // --pageserver-id when using a non-pageserver command
+    let endpoint_pageserver_id_arg = Arg::new("endpoint-pageserver-id")
+        .long("pageserver-id")
+        .required(false);
+
    let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
        .short('e')
        .long("safekeeper-extra-opt")
@@ -1133,10 +1255,24 @@ fn cli() -> Command {
                .arg_required_else_help(true)
                .about("Manage pageserver")
                .subcommand(Command::new("status"))
+                .arg(pageserver_id_arg.clone())
+                .subcommand(Command::new("start").about("Start local pageserver")
+                .arg(pageserver_id_arg.clone())
+                .arg(pageserver_config_args.clone()))
+                .subcommand(Command::new("stop").about("Stop local pageserver")
+                .arg(pageserver_id_arg.clone())
+                            .arg(stop_mode_arg.clone()))
+                .subcommand(Command::new("restart").about("Restart local pageserver")
+                .arg(pageserver_id_arg.clone())
+                .arg(pageserver_config_args.clone()))
+        )
+        .subcommand(
+            Command::new("attachment_service")
+                .arg_required_else_help(true)
+                .about("Manage attachment_service")
                .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
                .subcommand(Command::new("stop").about("Stop local pageserver")
                            .arg(stop_mode_arg.clone()))
-                .subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
        )
        .subcommand(
            Command::new("safekeeper")
@@ -1172,6 +1308,7 @@ fn cli() -> Command {
                    .arg(lsn_arg.clone())
                    .arg(pg_port_arg.clone())
                    .arg(http_port_arg.clone())
+                    .arg(endpoint_pageserver_id_arg.clone())
                    .arg(
                        Arg::new("config-only")
                            .help("Don't do basebackup, create endpoint directory with only config files")
@@ -1189,6 +1326,7 @@ fn cli() -> Command {
                    .arg(lsn_arg)
                    .arg(pg_port_arg)
                    .arg(http_port_arg)
+                    .arg(endpoint_pageserver_id_arg.clone())
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -70,6 +70,7 @@ pub struct EndpointConf {
    http_port: u16,
    pg_version: u32,
    skip_pg_catalog_updates: bool,
+    pageserver_id: NodeId,
 }

 //
@@ -82,19 +83,16 @@ pub struct ComputeControlPlane {
    pub endpoints: BTreeMap<String, Arc<Endpoint>>,

    env: LocalEnv,
-    pageserver: Arc<PageServerNode>,
 }

 impl ComputeControlPlane {
    // Load current endpoints from the endpoints/ subdirectories
    pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
-        let pageserver = Arc::new(PageServerNode::from_env(&env));
-
        let mut endpoints = BTreeMap::default();
        for endpoint_dir in std::fs::read_dir(env.endpoints_path())
            .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
        {
-            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
+            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?;
            endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
        }

@@ -102,7 +100,6 @@ impl ComputeControlPlane {
            base_port: 55431,
            endpoints,
            env,
-            pageserver,
        })
    }

@@ -125,20 +122,29 @@ impl ComputeControlPlane {
        http_port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
+        pageserver_id: NodeId,
    ) -> Result<Arc<Endpoint>> {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
+        let pageserver =
+            PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
            env: self.env.clone(),
-            pageserver: Arc::clone(&self.pageserver),
+            pageserver,
            timeline_id,
            mode,
            tenant_id,
            pg_version,
-            skip_pg_catalog_updates: false,
+            // We don't setup roles and databases in the spec locally, so we don't need to
+            // do catalog updates. Catalog updates also include check availability
+            // data creation. Yet, we have tests that check that size and db dump
+            // before and after start are the same. So, skip catalog updates,
+            // with this we basically test a case of waking up an idle compute, where
+            // we also skip catalog updates in the cloud.
+            skip_pg_catalog_updates: true,
        });

        ep.create_endpoint_dir()?;
@@ -152,7 +158,8 @@ impl ComputeControlPlane {
                http_port,
                pg_port,
                pg_version,
-                skip_pg_catalog_updates: false,
+                skip_pg_catalog_updates: true,
+                pageserver_id,
            })?,
        )?;
        std::fs::write(
@@ -187,18 +194,14 @@ pub struct Endpoint {
    // These are not part of the endpoint as such, but the environment
    // the endpoint runs in.
    pub env: LocalEnv,
-    pageserver: Arc<PageServerNode>,
+    pageserver: PageServerNode,

    // Optimizations
    skip_pg_catalog_updates: bool,
 }

 impl Endpoint {
-    fn from_dir_entry(
-        entry: std::fs::DirEntry,
-        env: &LocalEnv,
-        pageserver: &Arc<PageServerNode>,
-    ) -> Result<Endpoint> {
+    fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
        if !entry.file_type()?.is_dir() {
            anyhow::bail!(
                "Endpoint::from_dir_entry failed: '{}' is not a directory",
@@ -214,12 +217,15 @@ impl Endpoint {
        let conf: EndpointConf =
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

+        let pageserver =
+            PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);
+
        Ok(Endpoint {
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
            endpoint_id,
            env: env.clone(),
-            pageserver: Arc::clone(pageserver),
+            pageserver,
            timeline_id: conf.timeline_id,
            mode: conf.mode,
            tenant_id: conf.tenant_id,
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -7,6 +7,7 @@
 // local installations.
 //

+pub mod attachment_service;
 mod background_process;
 pub mod broker;
 pub mod endpoint;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -68,11 +68,17 @@ pub struct LocalEnv {

    pub broker: NeonBroker,

-    pub pageserver: PageServerConf,
+    /// This Vec must always contain at least one pageserver
+    pub pageservers: Vec<PageServerConf>,

    #[serde(default)]
    pub safekeepers: Vec<SafekeeperConf>,

+    // Control plane location: if None, we will not run attachment_service.  If set, this will
+    // be propagated into each pageserver's configuration.
+    #[serde(default)]
+    pub control_plane_api: Option<Url>,
+
    /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
    #[serde(default)]
    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
@@ -176,32 +182,28 @@ impl LocalEnv {
    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

+        #[allow(clippy::manual_range_patterns)]
        match pg_version {
-            14 => Ok(path.join(format!("v{pg_version}"))),
-            15 => Ok(path.join(format!("v{pg_version}"))),
+            14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }

    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        match pg_version {
-            14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
-            15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
-            _ => bail!("Unsupported postgres version: {}", pg_version),
-        }
+        Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
    }
    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        match pg_version {
-            14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
-            15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
-            _ => bail!("Unsupported postgres version: {}", pg_version),
-        }
+        Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
    }

    pub fn pageserver_bin(&self) -> PathBuf {
        self.neon_distrib_dir.join("pageserver")
    }

+    pub fn attachment_service_bin(&self) -> PathBuf {
+        self.neon_distrib_dir.join("attachment_service")
+    }
+
    pub fn safekeeper_bin(&self) -> PathBuf {
        self.neon_distrib_dir.join("safekeeper")
    }
@@ -214,15 +216,23 @@ impl LocalEnv {
        self.base_data_dir.join("endpoints")
    }

-    // TODO: move pageserver files into ./pageserver
-    pub fn pageserver_data_dir(&self) -> PathBuf {
-        self.base_data_dir.clone()
+    pub fn pageserver_data_dir(&self, pageserver_id: NodeId) -> PathBuf {
+        self.base_data_dir
+            .join(format!("pageserver_{pageserver_id}"))
    }

    pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf {
        self.base_data_dir.join("safekeepers").join(data_dir_name)
    }

+    pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
+        if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
+            Ok(conf)
+        } else {
+            bail!("could not find pageserver {id}")
+        }
+    }
+
    pub fn register_branch_mapping(
        &mut self,
        branch_name: String,
@@ -299,6 +309,10 @@ impl LocalEnv {
            env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
        }

+        if env.pageservers.is_empty() {
+            anyhow::bail!("Configuration must contain at least one pageserver");
+        }
+
        env.base_data_dir = base_path();

        Ok(env)
@@ -331,7 +345,7 @@ impl LocalEnv {
        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
        // to .neon/config. TODO: We lose any formatting and comments along the way, which is
        // a bit sad.
-        let mut conf_content = r#"# This file describes a locale deployment of the page server
+        let mut conf_content = r#"# This file describes a local deployment of the page server
 # and safekeeeper node. It is read by the 'neon_local' command-line
 # utility.
 "#
@@ -461,9 +475,9 @@ impl LocalEnv {
    }

    fn auth_keys_needed(&self) -> bool {
-        self.pageserver.pg_auth_type == AuthType::NeonJWT
-            || self.pageserver.http_auth_type == AuthType::NeonJWT
-            || self.safekeepers.iter().any(|sk| sk.auth_enabled)
+        self.pageservers.iter().any(|ps| {
+            ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
+        }) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
    }
 }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -27,6 +27,7 @@ use utils::{
    lsn::Lsn,
 };

+use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

 #[derive(Error, Debug)]
@@ -76,43 +77,40 @@ impl ResponseErrorMessageExt for Response {
 #[derive(Debug)]
 pub struct PageServerNode {
    pub pg_connection_config: PgConnectionConfig,
+    pub conf: PageServerConf,
    pub env: LocalEnv,
    pub http_client: Client,
    pub http_base_url: String,
 }

 impl PageServerNode {
-    pub fn from_env(env: &LocalEnv) -> PageServerNode {
-        let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr)
-            .expect("Unable to parse listen_pg_addr");
+    pub fn from_env(env: &LocalEnv, conf: &PageServerConf) -> PageServerNode {
+        let (host, port) =
+            parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
        let port = port.unwrap_or(5432);
        Self {
            pg_connection_config: PgConnectionConfig::new_host_port(host, port),
+            conf: conf.clone(),
            env: env.clone(),
            http_client: Client::new(),
-            http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr),
+            http_base_url: format!("http://{}/v1", conf.listen_http_addr),
        }
    }

    // pageserver conf overrides defined by neon_local configuration.
    fn neon_local_overrides(&self) -> Vec<String> {
-        let id = format!("id={}", self.env.pageserver.id);
+        let id = format!("id={}", self.conf.id);
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
        let pg_distrib_dir_param = format!(
            "pg_distrib_dir='{}'",
            self.env.pg_distrib_dir_raw().display()
        );

-        let http_auth_type_param =
-            format!("http_auth_type='{}'", self.env.pageserver.http_auth_type);
-        let listen_http_addr_param = format!(
-            "listen_http_addr='{}'",
-            self.env.pageserver.listen_http_addr
-        );
+        let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
+        let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);

-        let pg_auth_type_param = format!("pg_auth_type='{}'", self.env.pageserver.pg_auth_type);
-        let listen_pg_addr_param =
-            format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
+        let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
+        let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);

        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

@@ -126,10 +124,18 @@ impl PageServerNode {
            broker_endpoint_param,
        ];

-        if self.env.pageserver.http_auth_type != AuthType::Trust
-            || self.env.pageserver.pg_auth_type != AuthType::Trust
+        if let Some(control_plane_api) = &self.env.control_plane_api {
+            overrides.push(format!(
+                "control_plane_api='{}'",
+                control_plane_api.as_str()
+            ));
+        }
+
+        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
        {
-            overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned());
+            // Keys are generated in the toplevel repo dir, pageservers' workdirs
+            // are one level below that, so refer to keys with ../
+            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
        }
        overrides
    }
@@ -137,16 +143,12 @@ impl PageServerNode {
    /// Initializes a pageserver node by creating its config with the overrides provided.
    pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        // First, run `pageserver --init` and wait for it to write a config into FS and exit.
-        self.pageserver_init(config_overrides).with_context(|| {
-            format!(
-                "Failed to run init for pageserver node {}",
-                self.env.pageserver.id,
-            )
-        })
+        self.pageserver_init(config_overrides)
+            .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id,))
    }

    pub fn repo_path(&self) -> PathBuf {
-        self.env.pageserver_data_dir()
+        self.env.pageserver_data_dir(self.conf.id)
    }

    /// The pid file is created by the pageserver process, with its pid stored inside.
@@ -162,7 +164,7 @@ impl PageServerNode {

    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        let datadir = self.repo_path();
-        let node_id = self.env.pageserver.id;
+        let node_id = self.conf.id;
        println!(
            "Initializing pageserver node {} at '{}' in {:?}",
            node_id,
@@ -171,6 +173,10 @@ impl PageServerNode {
        );
        io::stdout().flush()?;

+        if !datadir.exists() {
+            std::fs::create_dir(&datadir)?;
+        }
+
        let datadir_path_str = datadir.to_str().with_context(|| {
            format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
        })?;
@@ -201,7 +207,7 @@ impl PageServerNode {
        let datadir = self.repo_path();
        print!(
            "Starting pageserver node {} at '{}' in {:?}",
-            self.env.pageserver.id,
+            self.conf.id,
            self.pg_connection_config.raw_address(),
            datadir
        );
@@ -210,7 +216,7 @@ impl PageServerNode {
        let datadir_path_str = datadir.to_str().with_context(|| {
            format!(
                "Cannot start pageserver node {} in path that has no string representation: {:?}",
-                self.env.pageserver.id, datadir,
+                self.conf.id, datadir,
            )
        })?;
        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
@@ -254,7 +260,7 @@ impl PageServerNode {
        // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
        // needs a token, and how to generate that token, seems independent to whether
        // the pageserver requires a token in incoming requests.
-        Ok(if self.env.pageserver.http_auth_type != AuthType::Trust {
+        Ok(if self.conf.http_auth_type != AuthType::Trust {
            // Generate a token to connect from the pageserver to a safekeeper
            let token = self
                .env
@@ -279,7 +285,7 @@ impl PageServerNode {

    pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
        let mut config = self.pg_connection_config.clone();
-        if self.env.pageserver.pg_auth_type == AuthType::NeonJWT {
+        if self.conf.pg_auth_type == AuthType::NeonJWT {
            let token = self
                .env
                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
@@ -290,7 +296,7 @@ impl PageServerNode {

    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
        let mut builder = self.http_client.request(method, url);
-        if self.env.pageserver.http_auth_type == AuthType::NeonJWT {
+        if self.conf.http_auth_type == AuthType::NeonJWT {
            let token = self
                .env
                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
@@ -316,7 +322,8 @@ impl PageServerNode {

    pub fn tenant_create(
        &self,
-        new_tenant_id: Option<TenantId>,
+        new_tenant_id: TenantId,
+        generation: Option<u32>,
        settings: HashMap<&str, &str>,
    ) -> anyhow::Result<TenantId> {
        let mut settings = settings.clone();
@@ -382,11 +389,9 @@ impl PageServerNode {
                .context("Failed to parse 'gc_feedback' as bool")?,
        };

-        // If tenant ID was not specified, generate one
-        let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate());
-
        let request = models::TenantCreateRequest {
            new_tenant_id,
+            generation,
            config,
        };
        if !settings.is_empty() {
--- a/deny.toml
+++ b/deny.toml
@@ -4,7 +4,12 @@
 # to your expectations and requirements.

 # Root options
-targets = []
+targets = [
+    { triple = "x86_64-unknown-linux-gnu" },
+    { triple = "aarch64-unknown-linux-gnu" },
+    { triple = "aarch64-apple-darwin" },
+    { triple = "x86_64-apple-darwin" },
+]
 all-features = false
 no-default-features = false
 feature-depth = 1
@@ -18,7 +23,7 @@ vulnerability = "deny"
 unmaintained = "warn"
 yanked = "warn"
 notice = "warn"
-ignore = []
+ignore = ["RUSTSEC-2023-0052"]

 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -30,7 +30,7 @@ cleanup() {
 echo "clean up containers if exists"
 cleanup

-for pg_version in 14 15; do
+for pg_version in 14 15 16; do
    echo "start containers (pg_version=$pg_version)."
    PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d

--- a/docs/rfcs/025-generation-numbers.md
+++ b/docs/rfcs/025-generation-numbers.md
@@ -0,0 +1,957 @@
+# Pageserver: split-brain safety for remote storage through generation numbers
+
+## Summary
+
+A scheme of logical "generation numbers" for tenant attachment to pageservers is proposed, along with
+changes to the remote storage format to include these generation numbers in S3 keys.
+
+Using the control plane as the issuer of these generation numbers enables strong anti-split-brain
+properties in the pageserver cluster without implementing a consensus mechanism directly
+in the pageservers.
+
+## Motivation
+
+Currently, the pageserver's remote storage format does not provide a mechanism for addressing
+split brain conditions that may happen when replacing a node or when migrating
+a tenant from one pageserver to another.
+
+From a remote storage perspective, a split brain condition occurs whenever two nodes both think
+they have the same tenant attached, and both can write to S3. This can happen in the case of a
+network partition, pathologically long delays (e.g. suspended VM), or software bugs.
+
+In the current deployment model, control plane guarantees that a tenant is attached to one
+pageserver at a time, thereby ruling out split-brain conditions resulting from dual
+attachment (however, there is always the risk of a control plane bug). This control
+plane guarantee prevents robust response to failures, as if a pageserver is unresponsive
+we may not detach from it. The mechanism in this RFC fixes this, by making it safe to
+attach to a new, different pageserver even if an unresponsive pageserver may be running.
+
+Futher, lack of safety during split-brain conditions blocks two important features where occasional
+split-brain conditions are part of the design assumptions:
+
+- seamless tenant migration ([RFC PR](https://github.com/neondatabase/neon/pull/5029))
+- automatic pageserver instance failure handling (aka "failover") (RFC TBD)
+
+### Prior art
+
+- 020-pageserver-s3-coordination.md
+- 023-the-state-of-pageserver-tenant-relocation.md
+- 026-pageserver-s3-mvcc.md
+
+This RFC has broad similarities to the proposal to implement a MVCC scheme in
+S3 object names, but this RFC avoids a general purpose transaction scheme in
+favour of more specialized "generations" that work like a transaction ID that
+always has the same lifetime as a pageserver process or tenant attachment, whichever
+is shorter.
+
+## Requirements
+
+- Accommodate storage backends with no atomic or fencing capability (i.e. work within
+  S3's limitation that there are no atomics and clients can't be fenced)
+- Don't depend on any STONITH or node fencing in the compute layer (i.e. we will not
+  assume that we can reliably kill and EC2 instance and have it die)
+- Scoped per-tenant, not per-pageserver; for _seamless tenant migration_, we need
+  per-tenant granularity, and for _failover_, we likely want to spread the workload
+  of the failed pageserver instance to a number of peers, rather than monolithically
+  moving the entire workload to another machine.
+  We do not rule out the latter case, but should not constrain ourselves to it.
+
+## Design Tenets
+
+These are not requirements, but are ideas that guide the following design:
+
+- Avoid implementing another consensus system: we already have a strongly consistent
+  database in the control plane that can do atomic operations where needed, and we also
+  have a Paxos implementation in the safekeeper.
+- Avoiding locking in to specific models of how failover will work (e.g. do not assume that
+  all the tenants on a pageserver will fail over as a unit).
+- Be strictly correct when it comes to data integrity. Occasional failures of availability
+  are tolerable, occasional data loss is not.
+
+## Non Goals
+
+The changes in this RFC intentionally isolate the design decision of how to define
+logical generations numbers and object storage format in a way that is somewhat flexible with
+respect to how actual orchestration of failover works.
+
+This RFC intentionally does not cover:
+
+- Failure detection
+- Orchestration of failover
+- Standby modes to keep data ready for fast migration
+- Intentional multi-writer operation on tenants (multi-writer scenarios are assumed to be transient split-brain situations).
+- Sharding.
+
+The interaction between this RFC and those features is discussed in [Appendix B](#appendix-b-interoperability-with-other-features)
+
+## Impacted Components
+
+pageserver, control plane, safekeeper (optional)
+
+## Implementation Part 1: Correctness
+
+### Summary
+
+- A per-tenant **generation number** is introduced to uniquely identifying tenant attachments to pageserver processes.
+
+  - This generation number increments each time the control plane modifies a tenant (`Project`)'s assigned pageserver, or when the assigned pageserver restarts.
+  - the control plane is the authority for generation numbers: only it may
+    increment a generation number.
+
+- **Object keys are suffixed** with the generation number
+- **Safety for multiply-attached tenants** is provided by the
+  generation number in the object key: the competing pageservers will not
+  try to write to the same keys.
+- **Safety in split brain for multiple nodes running with
+  the same node ID** is provided by the pageserver calling out to the control plane
+  on startup, to re-attach and thereby increment the generations of any attached tenants
+- **Safety for deletions** is achieved by deferring the DELETE from S3 to a point in time where the deleting node has validated with control plane that no attachment with a higher generation has a reference to the to-be-DELETEd key.
+- **The control plane is used to issue generation numbers** to avoid the need for
+  a built-in consensus system in the pageserver, although this could in principle
+  be changed without changing the storage format.
+
+### Generation numbers
+
+A generation number is associated with each tenant in the control plane,
+and each time the attachment status of the tenant changes, this is incremented.
+Changes in attachment status include:
+
+- Attaching the tenant to a different pageserver
+- A pageserver restarting, and "re-attaching" its tenants on startup
+
+These increments of attachment generation provide invariants we need to avoid
+split-brain issues in storage:
+
+- If two pageservers have the same tenant attached, the attachments are guaranteed to have different generation numbers, because the generation would increment
+  while attaching the second one.
+- If there are multiple pageservers running with the same node ID, all the attachments on all pageservers are guaranteed to have different generation numbers, because the generation would increment
+  when the second node started and re-attached its tenants.
+
+As long as the infrastructure does not transparently replace an underlying
+physical machine, we are totally safe. See the later [unsafe case](#unsafe-case-on-badly-behaved-infrastructure) section for details.
+
+### Object Key Changes
+
+#### Generation suffix
+
+All object keys (layer objects and index objects) will contain the attachment
+generation as a [suffix](#why-a-generation-suffix-rather-than-prefix).
+This suffix is the primary mechanism for protecting against split-brain situations, and
+enabling safe multi-attachment of tenants:
+
+- Two pageservers running with the same node ID (e.g. after a failure, where there is
+  some rogue pageserver still running) will not try to write to the same objects, because at startup they will have re-attached tenants and thereby incremented
+  generation numbers.
+- Multiple attachments (to different pageservers) of the same tenant will not try to write to the same objects, as each attachment would have a distinct generation.
+
+The generation is appended in hex format (8 byte string representing
+u32), to all our existing key names. A u32's range limit would permit
+27 restarts _per second_ over a 5 year system lifetime: orders of magnitude more than
+is realistic.
+
+The exact meaning of the generation suffix can evolve over time if necessary, for
+example if we chose to implement a failover mechanism internally to the pageservers
+rather than going via the control plane. The storage format just sees it as a number,
+with the only semantic property being that the highest numbered index is the latest.
+
+#### Index changes
+
+Since object keys now include a generation suffix, the index of these keys must also be updated. IndexPart currently stores keys and LSNs sufficient to reconstruct key names: this would be extended to store the generation as well.
+
+This will increase the size of the file, but only modestly: layers are already encoded as
+their string-ized form, so the overhead is about 10 bytes per layer. This will be less if/when
+the index storage format is migrated to a binary format from JSON.
+
+#### Visibility
+
+_This section doesn't describe code changes, but extends on the consequences of the
+object key changes given above_
+
+##### Visibility of objects to pageservers
+
+Pageservers can of course list objects in S3 at any time, but in practice their
+visible set is based on the contents of their LayerMap, which is initialized
+from the `index_part.json.???` that they load.
+
+Starting with the `index_part` from the most recent previous generation
+(see [loading index_part](#finding-the-remote-indices-for-timelines)), a pageserver
+initially has visibility of all the objects that were referenced in the loaded index.
+These objects are guaranteed to remain visible until the current generation is
+superseded, via pageservers in older generations avoiding deletions (see [deletion](#deletion)).
+
+The "most recent previous generation" is _not_ necessarily the most recent
+in terms of walltime, it is the one that is readable at the time a new generation
+starts. Consider the following sequence of a tenant being re-attached to different
+pageserver nodes:
+
+- Create + attach on PS1 in generation 1
+- PS1 Do some work, write out index_part.json-0001
+- Attach to PS2 in generation 2
+- Read index_part.json-0001
+- PS2 starts doing some work...
+- Attach to PS3 in generation 3
+- Read index_part.json-0001
+- **...PS2 finishes its work: now it writes index_part.json-0002**
+- PS3 writes out index_part.json-0003
+
+In the above sequence, the ancestry of indices is:
+
+```
+0001 -> 0002
+     |
+     -> 0003
+```
+
+This is not an issue for safety: if the 0002 references some object that is
+not in 0001, then 0003 simply does not see it, and will re-do whatever
+work was required (e.g. ingesting WAL or doing compaction). Objects referenced
+by only the 0002 index will never be read by future attachment generations, and
+will eventually be cleaned up by a scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)).
+
+##### Visibility of LSNs to clients
+
+Because index_part.json is now written with a generation suffix, which data
+is visible depends on which generation the reader is operating in:
+
+- If one was passively reading from S3 from outside of a pageserver, the
+  visibility of data would depend on which index_part.json-<generation> file
+  one had chosen to read from.
+- If two pageservers have the same tenant attached, they may have different
+  data visible as they're independently replaying the WAL, and maintaining
+  independent LayerMaps that are written to independent index_part.json files.
+  Data does not have to be remotely committed to be visible.
+- For a pageserver writing with a stale generation, historic LSNs
+  remain readable until another pageserver (with a higher generation suffix)
+  decides to execute GC deletions. At this point, we may think of the stale
+  attachment's generation as having logically ended: during its existence
+  the generation had a consistent view of the world.
+- For a newly attached pageserver, its highest visible LSN may appears to
+  go backwards with respect to an earlier attachment, if that earlier
+  attachment had not uploaded all data to S3 before the new attachment.
+
+### Deletion
+
+#### Generation number validation
+
+While writes are de-conflicted by writers always using their own generation number in the key,
+deletions are slightly more challenging: if a pageserver A is isolated, and the true active node is
+pageserver B, then it is dangerous for A to do any object deletions, even of objects that it wrote
+itself, because pageserver's B metadata might reference those objects.
+
+We solve this by inserting a "generation validation" step between the write of a remote index
+that un-links a particular object from the index, and the actual deletion of the object, such
+that deletions strictly obey the following ordering:
+
+1. Write out index_part.json: this guarantees that any subsequent reader of the metadata will
+   not try and read the object we unlinked.
+2. Call out to control plane to validate that the generation which we use for our attachment is still the latest.
+3. If step 2 passes, it is safe to delete the object. Why? The check-in with control plane
+   together with our visibility rules guarantees that any later generation
+   will use either the exact `index_part.json` that we uploaded in step 1, or a successor
+   of it; not an earlier one. In both cases, the `index_part.json` doesn't reference the
+   key we are deleting anymore, so, the key is invisible to any later attachment generation.
+   Hence it's safe to delete it.
+
+Note that at step 2 we are only confirming that deletions of objects _no longer referenced
+by the specific `index_part.json` written in step 1_ are safe. If we were attempting other deletions concurrently,
+these would need their own generation validation step.
+
+If step 2 fails, we may leak the object. This is safe, but has a cost: see [scrubbing](#cleaning-up-orphan-objects-scrubbing). We may avoid this entirely outside of node
+failures, if we do proper flushing of deletions on clean shutdown and clean migration.
+
+To avoid doing a huge number of control plane requests to perform generation validation,
+validation of many tenants will be done in a single request, and deletions will be queued up
+prior to validation: see [Persistent deletion queue](#persistent-deletion-queue) for more.
+
+#### `remote_consistent_lsn` updates
+
+Remote objects are not the only kind of deletion the pageserver does: it also indirectly deletes
+WAL data, by feeding back remote_consistent_lsn to safekeepers, as a signal to the safekeepers that
+they may drop data below this LSN.
+
+For the same reasons that deletion of objects must be guarded by an attachment generation number
+validation step, updates to `remote_consistent_lsn` are subject to the same rules, using
+an ordering as follows:
+
+1. upload the index_part that covers data up to LSN `L0` to S3
+2. Call out to control plane to validate that the generation which we use for our attachment is still the latest.
+3. advance the `remote_consistent_lsn` that we advertise to the safekeepers to `L0`
+
+If step 2 fails, then the `remote_consistent_lsn` advertised
+to safekeepers will not advance again until a pageserver
+with the latest generation is ready to do so.
+
+**Note:** at step 3 we are not advertising the _latest_ remote_consistent_lsn, we are
+advertising the value in the index_part that we uploaded in step 1. This provides
+a strong ordering guarantee.
+
+Internally to the pageserver, each timeline will have two remote_consistent_lsn values: the one that
+reflects its latest write to remote storage, and the one that reflects the most
+recent validation of generation number. It is only the latter value that may
+be advertised to the outside world (i.e. to the safekeeper).
+
+The control plane remains unaware of `remote_consistent_lsn`: it only has to validate
+the freshness of generation numbers, thereby granting the pageserver permission to
+share the information with the safekeeper.
+
+For convenience, in subsequent sections and RFCs we will use "deletion" to mean both deletion
+of objects in S3, and updates to the `remote_consistent_lsn`, as updates to the remote consistent
+LSN are de-facto deletions done via the safekeeper, and both kinds of deletion are subject to
+the same generation validation requirement.
+
+### Pageserver attach/startup changes
+
+#### Attachment
+
+Calls to `/v1/tenant/{tenant_id}/attach` are augmented with an additional
+`generation` field in the body.
+
+The pageserver does not persist this: a generation is only good for the lifetime
+of a process.
+
+#### Finding the remote indices for timelines
+
+Because index files are now suffixed with generation numbers, the pageserver
+cannot always GET the remote index in one request, because it can't always
+know a-priori what the latest remote index is.
+
+Typically, the most recent generation to write an index would be our own
+generation minus 1. However, this might not be the case: the previous
+node might have started and acquired a generation number, and then crashed
+before writing out a remote index.
+
+In the general case and as a fallback, the pageserver may list all the `index_part.json`
+files for a timeline, sort them by generation, and pick the highest that is `<=`
+its current generation for this attachment. The tenant should never load an index
+with an attachment generation _newer_ than its own.
+These two rules combined ensure that objects written by later generations are never visible to earlier generations.
+
+Note that if a given attachment picks an index part from an earlier generation (say n-2), but crashes & restarts before it writes its own generation's index part, next time it tries to pick an index part there may be an index part from generation n-1.
+It would pick the n-1 index part in that case, because it's sorted higher than the previous one from generation n-2.
+So, above rules guarantee no determinism in selecting the index part.
+are allowed to be attached with stale attachment generations during a multiply-attached
+phase in a migration, and in this instance if the old location's pageserver restarts,
+it should not try and load the newer generation's index.
+
+To summarize, on starting a timeline, the pageserver will:
+
+1. Issue a GET for index_part.json-<my generation - 1>
+2. If 1 failed, issue a ListObjectsv2 request for index_part.json\* and
+   pick the newest.
+
+One could optimize this further by using the control plane to record specifically
+which generation most recently wrote an index_part.json, if necessary, to increase
+the probability of finding the index_part.json in one GET. One could also improve
+the chances by having pageservers proactively write out index_part.json after they
+get a new generation ID.
+
+#### Re-attachment on startup
+
+On startup, the pageserver will call out to an new control plane `/re-attach`
+API (see [Generation API](#generation-api)). This returns a list of
+tenants that should be attached to the pageserver, and their generation numbers, which
+the control plane will increment before returning.
+
+The pageserver should still scan its local disk on startup, but should _delete_
+any local content for tenants not indicated in the `/re-attach` response: their
+absence is an implicit detach operation.
+
+**Note** if a tenant is omitted from the re-attach response, its local disk content
+will be deleted. This will change in subsequent work, when the control plane gains
+the concept of a secondary/standby location: a node with local content may revert
+to this status and retain some local content.
+
+#### Cleaning up previous generations' remote indices
+
+Deletion of old indices is not necessary for correctness, although it is necessary
+to avoid the ListObjects fallback in the previous section becoming ever more expensive.
+
+Once the new attachment has written out its index_part.json, it may asynchronously clean up historic index_part.json
+objects that were found.
+
+We may choose to implement this deletion either as an explicit step after we
+write out index_part for the first time in a pageserver's lifetime, or for
+simplicity just do it periodically as part of the background scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing));
+
+### Control Plane Changes
+
+#### Store generations for attaching tenants
+
+- The `Project` table must store the generation number for use when
+  attaching the tenant to a new pageserver.
+- The `/v1/tenant/:tenant_id/attach` pageserver API will require the generation number,
+  which the control plane can supply by simply incrementing the `Project`'s
+  generation number each time the tenant is attached to a different server: the same database
+  transaction that changes the assigned pageserver should also change the generation number.
+
+#### Generation API
+
+This section describes an API that could be provided directly by the control plane,
+or built as a separate microservice. In earlier parts of the RFC, when we
+discuss the control plane providing generation numbers, we are referring to this API.
+
+The API endpoints used by the pageserver to acquire and validate generation
+numbers are quite simple, and only require access to some persistent and
+linerizable storage (such as a database).
+
+Building this into the control plane is proposed as a least-effort option to exploit existing infrastructure and implement generation number issuance in the same transaction that mandates it (i.e., the transaction that updates the `Project` assignment to another pageserver).
+However, this is not mandatory: this "Generation Number Issuer" could
+be built as a microservice. In practice, we will write such a miniature service
+anyway, to enable E2E pageserver/compute testing without control plane.
+
+The endpoints required by pageservers are:
+
+##### `/re-attach`
+
+- Request: `{node_id: <u32>}`
+- Response:
+  - 200 `{tenants: [{id: <TenantId>, gen: <u32>}]}`
+  - 404: unknown node_id
+  - (Future: 429: flapping detected, perhaps nodes are fighting for the same node ID,
+    or perhaps this node was in a retry loop)
+  - (On unknown tenants, omit tenant from `tenants` array)
+- Server behavior: query database for which tenants should be attached to this pageserver.
+  - for each tenant that should be attached, increment the attachment generation and
+    include the new generation in the response
+- Client behavior:
+  - for all tenants in the response, activate with the new generation number
+  - for any local disk content _not_ referenced in the response, act as if we
+    had been asked to detach it (i.e. delete local files)
+
+**Note** the `node_id` in this request will change in future if we move to ephemeral
+node IDs, to be replaced with some correlation ID that helps the control plane realize
+if a process is running with the same storage as a previous pageserver process (e.g.
+we might use EC instance ID, or we might just write some UUID to the disk the first
+time we use it)
+
+##### `/validate`
+
+- Request: `{'tenants': [{tenant: <tenant id>, attach_gen: <gen>}, ...]}'`
+- Response:
+  - 200 `{'tenants': [{tenant: <tenant id>, status: <bool>}...]}`
+  - (On unknown tenants, omit tenant from `tenants` array)
+- Purpose: enable the pageserver to discover for the given attachments whether they are still the latest.
+- Server behavior: this is a read-only operation: simply compare the generations in the request with
+  the generations known to the server, and set status to `true` if they match.
+- Client behavior: clients must not do deletions within a tenant's remote data until they have
+  received a response indicating the generation they hold for the attachment is current.
+
+#### Use of `/load` and `/ignore` APIs
+
+Because the pageserver will be changed to only attach tenants on startup
+based on the control plane's response to a `/re-attach` request, the load/ignore
+APIs no longer make sense in their current form.
+
+The `/load` API becomes functionally equivalent to attach, and will be removed:
+any location that used `/load` before should just attach instead.
+
+The `/ignore` API is equivalent to detaching, but without deleting local files.
+
+### Timeline/Branch creation & deletion
+
+All of the previous arguments for safety have described operations within
+a timeline, where we may describe a sequence that includes updates to
+index_part.json, and where reads and writes are coming from a postgres
+endpoint (writes via the safekeeper).
+
+Creating or destroying timeline is a bit different, because writes
+are coming from the control plane.
+
+We must be safe against scenarios such as:
+
+- A tenant is attached to pageserver B while pageserver A is
+  in the middle of servicing an RPC from the control plane to
+  create or delete a tenant.
+- A pageserver A has been sent a timeline creation request
+  but becomes unresponsive. The tenant is attached to a
+  different pageserver B, and the timeline creation request
+  is sent there too.
+
+#### Timeline Creation
+
+If some very slow node tries to do a timeline creation _after_
+a more recent generation node has already created the timeline
+and written some data into it, that must not cause harm. This
+is provided in timeline creations by the way all the objects
+within the timeline's remote path include a generation suffix:
+a slow node in an old generation that attempts to "create" a timeline
+that already exists will just emit an index_part.json with
+an old generation suffix.
+
+Timeline IDs are never reused, so we don't have
+to worry about the case of create/delete/create cycles. If they
+were re-used during a disaster recovery "un-delete" of a timeline,
+that special case can be handled by calling out to all available pageservers
+to check that they return 404 for the timeline, and to flush their
+deletion queues in case they had any deletions pending from the
+timeline.
+
+The above makes it safe for control plane to change the assignment of
+tenant to pageserver in control plane while a timeline creation is ongoing.
+The reason is that the creation request against the new assigned pageserver
+uses a new generation number. However, care must be taken by control plane
+to ensure that a "timeline creation successul" response from some pageserver
+is checked for the pageserver's generation for that timeline's tenant still being the latest.
+If it is not the latest, the response does not constitute a successful timeline creation.
+It is acceptable to discard such responses, the scrubber will clean up the S3 state.
+It is better to issue a timelien deletion request to the stale attachment.
+
+#### Timeline Deletion
+
+Tenant/timeline deletion operations are exempt from generation validation
+on deletes, and therefore don't have to go through the same deletion
+queue as GC/compaction layer deletions. This is because once a
+delete is issued by the control plane, it is a promise that the
+control plane will keep trying until the deletion is done, so even stale
+pageservers are permitted to go ahead and delete the objects.
+
+The implications of this for control plane are:
+
+- During timeline/tenant deletion, the control plane must wait for the deletion to
+  be truly complete (status 404) and also handle the case where the pageserver
+  becomes unavailable, either by waiting for a replacement with the same node_id,
+  or by *re-attaching the tenant elsewhere.
+
+- The control plane must persist its intent to delete
+  a timeline/tenant before issuing any RPCs, and then once it starts, it must
+  keep retrying until the tenant/timeline is gone. This is already handled
+  by using a persistent `Operation` record that is retried indefinitely.
+
+Timeline deletion may result in a special kind of object leak, where
+the latest generation attachment completes a deletion (including erasing
+all objects in the timeline path), but some slow/partitioned node is
+writing into the timeline path with a stale generation number. This would
+not be caught by any per-timeline scrubbing (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)), since scrubbing happens on the
+attached pageserver, and once the timeline is deleted it isn't attached anywhere.
+This scenario should be pretty rare, and the control plane can make it even
+rarer by ensuring that if a tenant is in a multi-attached state (e.g. during
+migration), we wait for that to complete before processing the deletion. Beyond
+that, we may implement some other top-level scrub of timelines in
+an external tool, to identify any tenant/timeline paths that are not found
+in the control plane database.
+
+#### Examples
+
+- Deletion, node restarts partway through:
+  - By the time we returned 202, we have written a remote delete marker
+  - Any subsequent incarnation of the same node_id will see the remote
+    delete marker and continue to process the deletion
+  - If the original pageserver is lost permanently and no replacement
+    with the same node_id is available, then the control plane must recover
+    by re-attaching the tenant to a different node.
+- Creation, node becomes unresponsive partway through.
+  - Control plane will see HTTP request timeout, keep re-issuing
+    request to whoever is the latest attachment point for the tenant
+    until it succeeds.
+  - Stale nodes may be trying to execute timeline creation: they will
+    write out index_part.json files with
+    stale attachment generation: these will be eventually cleaned up
+    by the same mechanism as other old indices.
+
+### Unsafe case on badly behaved infrastructure
+
+This section is only relevant if running on a different environment
+than EC2 machines with ephemeral disks.
+
+If we ever run pageservers on infrastructure that might transparently restart
+a pageserver while leaving an old process running (e.g. a VM gets rescheduled
+without the old one being fenced), then there is a risk of corruption, when
+the control plane attaches the tenant, as follows:
+
+- If the control plane sends an `/attach` request to node A, then node A dies
+  and is replaced, and the control plane's retries the request without
+  incrementing that attachment ID, then it could end up with two physical nodes
+  both using the same generation number.
+- This is not an issue when using EC2 instances with ephemeral storage, as long
+  as the control plane never re-uses a node ID, but it would need re-examining
+  if running on different infrastructure.
+- To robustly protect against this class of issue, we would either:
+  - add a "node generation" to distinguish between different processes holding the
+    same node_id.
+  - or, dispense with static node_id entirely and issue an ephemeral ID to each
+    pageserver process when it starts.
+
+## Implementation Part 2: Optimizations
+
+### Persistent deletion queue
+
+Between writing our a new index_part.json that doesn't reference an object,
+and executing the deletion, an object passes through a window where it is
+only referenced in memory, and could be leaked if the pageserver is stopped
+uncleanly. That introduces conflicting incentives: on the one hand, we would
+like to delay and batch deletions to
+1. minimize the cost of the mandatory validations calls to control plane, and
+2. minimize cost for DeleteObjects requests.
+On the other hand we would also like to minimize leakage by executing
+deletions promptly.
+
+To resolve this, we may make the deletion queue persistent
+and then executing these in the background at a later time.
+
+_Note: The deletion queue's reason for existence is optimization rather than correctness,
+so there is a lot of flexibility in exactly how the it should work,
+as long as it obeys the rule to validate generations before executing deletions,
+so the following details are not essential to the overall RFC._
+
+#### Scope
+
+The deletion queue will be global per pageserver, not per-tenant. There
+are several reasons for this choice:
+
+- Use the queue as a central point to coalesce validation requests to the
+  control plane: this avoids individual `Timeline` objects ever touching
+  the control plane API, and avoids them having to know the rules about
+  validating deletions. This separation of concerns will avoid burdening
+  the already many-LoC `Timeline` type with even more responsibility.
+- Decouple the deletion queue from Tenant attachment lifetime: we may
+  "hibernate" an inactive tenant by tearing down its `Tenant`/`Timeline`
+  objects in the pageserver, without having to wait for deletions to be done.
+- Amortize the cost of I/O for the persistent queue, instead of having many
+  tiny queues.
+- Coalesce deletions into a smaller number of larger DeleteObjects calls
+
+Because of the cost of doing I/O for persistence, and the desire to coalesce
+generation validation requests across tenants, and coalesce deletions into
+larger DeleteObjects requests, there will be one deletion queue per pageserver
+rather than one per tenant. This has the added benefit that when deactivating
+a tenant, we do not have to drain their deletion queue: deletions can proceed
+for a tenant whose main `Tenant` object has been torn down.
+
+#### Flow of deletion
+
+The flow of a deletion is becomes:
+
+1. Need for deletion of an object (=> layer file) is identified.
+2. Unlink the object from all the places that reference it (=> `index_part.json`).
+3. Enqueue the deletion to a persistent queue.
+   Each entry is `tenant_id, attachment_generation, S3 key`.
+4. Validate & execute in batches:
+  4.1 For a batch of entries, call into control plane.
+  4.2 For the subset of entries that passed validation, execute a `DeleteObjects` S3 DELETE request for their S3 keys.
+
+As outlined in the Part 1 on correctness, it is critical that deletions are only
+executed once the key is not referenced anywhere in S3.
+This property is obviously upheld by the scheme above.
+
+#### We Accept Object Leakage In Acceptable Circumcstances
+
+If we crash in the flow above between (2) and (3), we lose track of unreferenced object.
+Further, enqueuing a single to the persistent queue may not be durable immediately to amortize cost of flush to disk.
+This is acceptable for now, it can be caught by [the scrubber](#cleaning-up-orphan-objects-scrubbing).
+
+There are various measures we can take to improve this in the future.
+1. Cap amount of time until enqueued entry becomes durable (timeout for flush-to-tisk)
+2. Proactively flush:
+    - On graceful shutdown, as we anticipate that some or
+      all of our attachments may be re-assigned while we are offline.
+    - On tenant detach.
+3. For each entry, keep track of whether it has passed (2).
+   Only admit entries to (4) one they have passed (2).
+   This requires re-writing / two queue entries (intent, commit) per deletion.
+
+The important take-away with any of the above is that it's not
+disastrous to leak objects in exceptional circumstances.
+
+#### Operations that may skip the queue
+
+Deletions of an entire timeline are [exempt](#Timeline-Deletion) from generation number validation. Once the
+control plane sends the deletion request, there is no requirement to retain the readability
+of any data within the timeline, and all objects within the timeline path may be deleted
+at any time from the control plane's deletion request onwards.
+
+Since deletions of smaller timelines won't have enough objects to compose a full sized
+DeleteObjects request, it is still useful to send these through the last part of the
+deletion pipeline to coalesce with other executing deletions: to enable this, the
+deletion queue should expose two input channels: one for deletions that must be
+processed in a generation-aware way, and a fast path for timeline deletions, where
+that fast path may skip validation and the persistent queue.
+
+### Cleaning up orphan objects (scrubbing)
+
+An orphan object is any object which is no longer referenced by a running node or by metadata.
+
+Examples of how orphan objects arise:
+
+- A node PUTs a layer object, then crashes before it writes the
+  index_part.json that references that layer.
+- A stale node carries on running for some time, and writes out an unbounded number of
+  objects while it believes itself to be the rightful writer for a tenant.
+- A pageserver crashes between un-linking an object from the index, and persisting
+  the object to its deletion queue.
+
+Orphan objects are functionally harmless, but have a small cost due to S3 capacity consumed. We
+may clean them up at some time in the future, but doing a ListObjectsv2 operation and cross
+referencing with the latest metadata to identify objects which are not referenced.
+
+Scrubbing will be done only by an attached pageserver (not some third party process), and deletions requested during scrub will go through the same
+validation as all other deletions: the attachment generation must be
+fresh. This avoids the possibility of a stale pageserver incorrectly
+thinking than an object written by a newer generation is stale, and deleting
+it.
+
+It is not strictly necessary that scrubbing be done by an attached
+pageserver: it could also be done externally. However, an external
+scrubber would still require the same validation procedure that
+a pageserver's deletion queue performs, before actually erasing
+objects.
+
+## Operational impact
+
+### Availability
+
+Coordination of generation numbers via the control plane introduce a dependency for certain
+operations:
+
+1. Starting new pageservers (or activating pageservers after a restart)
+2. Executing enqueued deletions
+3. Advertising updated `remote_consistent_lsn` to enable WAL trimming
+
+Item 1. would mean that some in-place restarts that previously would have resumed service even if the control plane were
+unavailable, will now not resume service to users until the control plane is available. We could
+avoid this by having a timeout on communication with the control plane, and after some timeout,
+resume service with the previous generation numbers (assuming this was persisted to disk). However,
+this is unlikely to be needed as the control plane is already an essential & highly available component. Also, having a node re-use an old generation number would complicate
+reasoning about the system, as it would break the invariant that a generation number uniquely identifies
+a tenant's attachment to a given pageserver _process_: it would merely identify the tenant's attachment
+to the pageserver _machine_ or its _on-disk-state_.
+
+Item 2. is a non-issue operationally: it's harmless to delay deletions, the only impact of objects pending deletion is
+the S3 capacity cost.
+
+Item 3. could be an issue if safekeepers are low on disk space and the control plane is unavailable for a long time. If this became an issue,
+we could adjust the safekeeper to delete segments from local disk sooner, as soon as they're uploaded to S3, rather than waiting for
+remote_consistent_lsn to advance.
+
+For a managed service, the general approach should be to make sure we are monitoring & respond fast enough
+that control plane outages are bounded in time.
+
+There is also the fact that control plane runs in a single region.
+The latency for distant regions is not a big concern for us because all request types added by this RFC are either infrequent or not in the way of the data path.
+However, we lose region isolation for the operations listed above.
+The ongoing work to split console and control will give us per-region control plane, and all operations in this RFC can be handled by these per-region control planes.
+With that in mind, we accept the trade-offs outlined in this paragraph.
+
+We will also implement an "escape hatch" config generation numbers, where in a major disaster outage,
+we may manually run pageservers with a hand-selected generation number, so that we can bring them online
+independently of a control plane.
+
+### Rollout
+
+Although there is coupling between components, we may deploy most of the new data plane components
+independently of the control plane: initially they can just use a static generation number.
+
+#### Phase 1
+
+The pageserver is deployed with some special config to:
+
+- Always act like everything is generation 1 and do not wait for a control plane issued generation on attach
+- Skip the places in deletion and remote_consistent_lsn updates where we would call into control plane
+
+#### Phase 2
+
+The control plane changes are deployed: control plane will now track and increment generation numbers.
+
+#### Phase 3
+
+The pageserver is deployed with its control-plane-dependent changes enabled: it will now require
+the control plane to service re-attach requests on startup, and handle generation
+validation requests.
+
+### On-disk backward compatibility
+
+Backward compatibility with existing data is straightforward:
+
+- When reading the index, we may assume that any layer whose metadata doesn't include
+  generations will have a path without generation suffix.
+- When locating the index file on attachment, we may use the "fallback" listing path
+  and if there is only an index without generation suffix, that is the one we load.
+
+It is not necessary to re-write existing layers: even new index files will be able
+to represent generation-less layers.
+
+### On-disk forward compatibility
+
+We will do a two phase rollout, probably over multiple releases because we will naturally
+have some of the read-side code ready before the overall functionality is ready:
+
+1. Deploy pageservers which understand the new index format and generation suffixes
+   in keys, but do not write objects with generation numbers in the keys.
+2. Deploy pageservers that write objects with generation numbers in the keys.
+
+Old pageservers will be oblivious to generation numbers. That means that they can't
+read objects with generation numbers in the name. This is why we must
+first step must deploy the ability to read, before the second step
+starts writing them.
+
+# Frequently Asked Questions
+
+## Why a generation _suffix_ rather than _prefix_?
+
+The choice is motivated by object listing, since one can list by prefix but not
+suffix.
+
+In [finding remote indices](#finding-the-remote-indices-for-timelines), we rely
+on being able to do a prefix listing for `<tenant>/<timeline>/index_part.json*`.
+That relies on the prefix listing.
+
+The converse case of using a generation prefix and listing by generation is
+not needed: one could imagine listing by generation while scrubbing (so that
+a particular generation's layers could be scrubbed), but this is not part
+of normal operations, and the [scrubber](#cleaning-up-orphan-objects-scrubbing) probably won't work that way anyway.
+
+## Wouldn't it be simpler to have a separate deletion queue per timeline?
+
+Functionally speaking, we could. That's how RemoteTimelineClient currently works,
+but this approach does not map well to a long-lived persistent queue with
+generation validation.
+
+Anything we do per-timeline generates tiny random I/O, on a pageserver with
+tens of thousands of timelines operating: to be ready for high scale, we should:
+
+- A) Amortize costs where we can (e.g. a shared deletion queue)
+- B) Expect to put tenants into a quiescent state while they're not
+  busy: i.e. we shouldn't keep a tenant alive to service its deletion queue.
+
+This was discussed in the [scope](#scope) part of the deletion queue section.
+
+# Appendix A: Examples of use in high availability/failover
+
+The generation numbers proposed in this RFC are adaptable to a variety of different
+failover scenarios and models. The sections below sketch how they would work in practice.
+
+### In-place restart of a pageserver
+
+"In-place" here means that the restart is done before any other element in the system
+has taken action in response to the node being down.
+
+- After restart, the node issues a re-attach request to the control plane, and
+  receives new generation numbers for all its attached tenants.
+- Tenants may be activated with the generation number in the re-attach response.
+- If any of its attachments were in fact stale (i.e. had be reassigned to another
+  node while this node was offline), then
+  - the re-attach response will inform the tenant about this by not including
+    the tenant of this by _not_ incrementing the generation for that attachment.
+  - This will implicitly block deletions in the tenant, but as an optimization
+    the pageserver should also proactively stop doing S3 uploads when it notices this stale-generation state.
+  - The control plane is expected to eventually detach this tenant from the
+    pageserver.
+
+If the control plane does not include a tenant in the re-attach response,
+but there is still local state for the tenant in the filesystem, the pageserver
+deletes the local state in response and does not load/active the tenant.
+See the [earlier section on pageserver startup](#pageserver-attachstartup-changes) for details.
+Control plane can use this mechanism to clean up a pageserver that has been
+down for so long that all its tenants were migrated away before it came back
+up again and asked for re-attach.
+
+### Failure of a pageserver
+
+In this context, read "failure" as the most ambiguous possible case, where
+a pageserver is unavailable to clients and control plane, but may still be executing and talking
+to S3.
+
+#### Case A: re-attachment to other nodes
+
+1. Let's say node 0 becomes unresponsive in a cluster of three nodes 0, 1, 2.
+2. Some external mechanism notices that the node is unavailable and initiates
+   movement of all tenants attached to that node to a different node according
+   to some distribution rule.
+   In this example, it would mean incrementing the generation
+   of all tenants that were attached to node 0, as each tenant's assigned pageserver changes.
+3. A tenant which is now attached to node 1 will _also_ still be attached to node
+   0, from the perspective of node 0. Node 0 will still be using its old generation,
+   node 1 will be using a newer generation.
+4. S3 writes will continue from nodes 0 and 1: there will be an index_part.json-00000001
+   \_and\* an index_part.json-00000002. Objects written under the old suffix
+   after the new attachment was created do not matter from the rest of the system's
+   perspective: the endpoints are reading from the new attachment location. Objects
+   written by node 0 are just garbage that can be cleaned up at leisure. Node 0 will
+   not do any deletions because it can't synchronize with control plane, or if it could,
+   its deletion queue processing would get errors for the validation requests.
+
+#### Case B: direct node replacement with same node_id and drive
+
+This is the scenario we would experience if running pageservers in some dynamic
+VM/container environment that would auto-replace a given node_id when it became
+unresponsive, with the node's storage supplied by some network block device
+that is attached to the replacement VM/container.
+
+1. Let's say node 0 fails, and there may be some other peers but they aren't relevant.
+2. Some external mechanism notices that the node is unavailable, and creates
+   a "new node 0" (Node 0b) which is a physically separate server. The original node 0
+   (Node 0a) may still be running, because we do not assume the environment fences nodes.
+3. On startup, node 0b re-attaches and gets higher generation numbers for
+   all tenants.
+4. S3 writes continue from nodes 0a and 0b, but the writes do not collide due to different
+   generation in the suffix, and the writes from node 0a are not visible to the rest
+   of the system because endpoints are reading only from node 0b.
+
+# Appendix B: interoperability with other features
+
+## Sharded Keyspace
+
+The design in this RFC maps neatly to a sharded keyspace design where subsets of the key space
+for a tenant are assigned to different pageservers:
+
+- the "unit of work" for attachments becomes something like a TenantShard rather than a Tenant
+- TenantShards get generation numbers just as Tenants do.
+- Write workload (ingest, compaction) for a tenant is spread out across pageservers via
+  TenantShards, but each TenantShard still has exactly one valid writer at a time.
+
+## Read replicas
+
+_This section is about a passive reader of S3 pageserver state, not a postgres
+read replica_
+
+For historical reads to LSNs below the remote persistent LSN, any node may act as a reader at any
+time: remote data is logically immutable data, and the use of deferred deletion in this RFC helps
+mitigate the fact that remote data is not _physically_ immutable (i.e. the actual data for a given
+page moves around as compaction happens).
+
+A read replica needs to be aware of generations in remote data in order to read the latest
+metadata (find the index_part.json with the latest suffix). It may either query this
+from the control plane, or find it with ListObjectsv2 request
+
+## Seamless migration
+
+To make tenant migration totally seamless, we will probably want to intentionally double-attach
+a tenant briefly, serving reads from the old node while waiting for the new node to be ready.
+
+This RFC enables that double-attachment: two nodes may be attached at the same time, with the migration destination
+having a higher generation number. The old node will be able to ingest and serve reads, but not
+do any deletes. The new node's attachment must also avoid deleting layers that the old node may
+still use. A new piece of state
+will be needed for this in the control plane's definition of an attachment.
+
+## Warm secondary locations
+
+To enable faster tenant movement after a pageserver is lost, we will probably want to spend some
+disk capacity on keeping standby locations populated with local disk data.
+
+There's no conflict between this RFC and that: implementing warm secondary locations on a per-tenant basis
+would be a separate change to the control plane to store standby location(s) for a tenant. Because
+the standbys do not write to S3, they do not need to be assigned generation numbers. When a tenant is
+re-attached to a standby location, that would increment the tenant attachment generation and this
+would work the same as any other attachment change, but with a warm cache.
+
+## Ephemeral node IDs
+
+This RFC intentionally avoids changing anything fundamental about how pageservers are identified
+and registered with the control plane, to avoid coupling the implementation of pageserver split
+brain protection with more fundamental changes in the management of the pageservers.
+
+Moving to ephemeral node IDs would provide an extra layer of
+resilience in the system, as it would prevent the control plane
+accidentally attaching to two physical nodes with the same
+generation, if somehow there were two physical nodes with
+the same node IDs (currently we rely on EC2 guarantees to
+eliminate this scenario). With ephemeral node IDs, there would be
+no possibility of that happening, no matter the behavior of
+underlying infrastructure.
+
+Nothing fundamental in the pageserver's handling of generations needs to change to handle ephemeral node IDs, since we hardly use the
+`node_id` anywhere. The `/re-attach` API would be extended
+to enable the pageserver to obtain its ephemeral ID, and provide
+some correlation identifier (e.g. EC instance ID), to help the
+control plane re-attach tenants to the same physical server that
+previously had them attached.
--- a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md
+++ b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md
@@ -0,0 +1,281 @@
+
+# Crash-Consistent Layer Map Updates By Leveraging `index_part.json`
+
+* Created on: Aug 23, 2023
+* Author: Christian Schwarz
+
+## Summary
+
+This RFC describes a simple scheme to make layer map updates crash consistent by leveraging the `index_part.json` in remote storage.
+Without such a mechanism, crashes can induce certain edge cases in which broadly held assumptions about system invariants don't hold.
+
+## Motivation
+
+### Background
+
+We can currently easily make complex, atomic updates to the layer map by means of an RwLock.
+If we crash or restart pageserver, we reconstruct the layer map from:
+1. local timeline directory contents
+2. remote `index_part.json` contents.
+
+The function that is responsible for this is called `Timeline::load_layer_map()`.
+The reconciliation process's behavior is the following:
+* local-only files will become part of the layer map as local-only layers and rescheduled for upload
+* For a file name that, by its name, is present locally and in the remote `index_part.json`, but where the local file has a different size (future: checksum) than the remote file, we will delete the local file and leave the remote file as a `RemoteLayer` in the layer map.
+
+### The Problem
+
+There are are cases where we need to make an atomic update to the layer map that involves **more than one layer**.
+The best example is compaction, where we need to insert the L1 layers generated from the L0 layers, and remove the L0 layers.
+As stated above, making the update to the layer map in atomic way is trivial.
+But, there is no system call API to make an atomic update to a directory that involves more than one file rename and deletion.
+Currently, we issue the system calls one by one and hope we don't crash.
+
+What happens if we crash and restart in the middle of that system call sequence?
+We will reconstruct the layer map according to the reconciliation process, taking as input whatever transitory state the timeline directory ended up in.
+
+We cannot roll back or complete the timeline directory update during which we crashed, because we keep no record of the changes we plan to make.
+
+### Problem's Implications For Compaction
+
+The implications of the above are primarily problematic for compaction.
+Specifically, the part of it that compacts L0 layers into L1 layers.
+
+Remember that compaction takes a set of L0 layers and reshuffles the delta records in them into L1 layer files.
+Once the L1 layer files are written to disk, it atomically removes the L0 layers from the layer map and adds the L1 layers to the layer map.
+It then deletes the L0 layers locally, and schedules an upload of the L1 layers and and updated index part.
+
+If we crash before deleting L0s, but after writing out L1s, the next compaction after restart will re-digest the L0s and produce new L1s.
+This means the compaction after restart will **overwrite** the previously written L1s.
+Currently we also schedule an S3 upload of the overwritten L1.
+
+If the compaction algorithm doesn't change between the two compaction runs, is deterministic, and uses the same set of L0s as input, then the second run will produce identical L1s and the overwrites will go unnoticed.
+
+*However*:
+1. the file size of the overwritten L1s may not be identical, and
+2. the bit pattern of the overwritten L1s may not be identical, and,
+3. in the future, we may want to make the compaction code non-determinstic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite
+
+The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted).
+
+For example, if an unresponsive node A becomes active again after control plane has relocated the tenant to a new node B, the node A may overwrite some L1s.
+But node B based its world view on the version of node A's `index_part.json` from _before_ the overwrite.
+That earlier `index_part.json`` contained the file size of the pre-overwrite L1.
+If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1.
+Effectively, the data in the L1 has become inaccessible to node B.
+If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same probem.
+
+If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems.
+
+In case of (1) and (2), where we know that the logical content of the layers is still the same, we can recover by manually patching the `index_part.json` of the new node to the overwritten L1's file size / checksum.
+
+But if (3) ever happens, the logical content may be different, and, we could have truly lost data.
+
+Given the above considerations, we should avoid making correctness of split-brain protection dependent on overwrites preserving _logical_ layer file contents.
+**It is a much cleaner separation of concerns to require that layer files are truly immutable in S3, i.e., PUT once and then only DELETEd, never overwritten (overPUTted).**
+
+## Design
+
+Instead of reconciling a layer map from local timeline directory contents and remote index part, this RFC proposes to view the remote index part as authoritative during timeline load.
+Local layer files will be recognized if they match what's listed in remote index part, and removed otherwise.
+
+During **timeline load**, the only thing that matters is the remote index part content.
+Essentially, timeline load becomes much like attach, except we don't need to prefix-list the remote timelines.
+The local timeline dir's `metadata` file does not matter.
+The layer files in the local timeline dir are seen as a nice-to-have cache of layer files that are in the remote index part.
+Any layer files in the local timeline dir that aren't in the remote index part are removed during startup.
+The `Timeline::load_layer_map()` no longer "merges" local timeline dir contents with the remote index part.
+Instead, it treats the remote index part as the authoritative layer map.
+If the local timeline dir contains a layer that is in the remote index part, that's nice, and we'll re-use it if file size (and in the future, check sum) match what's stated in the index part.
+If it doesn't match, we remove the file from the local timeline dir.
+
+After load, **at runtime**, nothing changes compared to what we did before this RFC.
+The procedure for single- and multi-object changes is reproduced here for reference:
+* For any new layers that the change adds:
+  * Write them to a temporary location.
+  * While holding layer map lock:
+    * Move them to the final location.
+    * Insert into layer map.
+* Make the S3 changes.
+  We won't reproduce the remote timeline client method calls here because these are subject to change.
+  Instead we reproduce the sequence of s3 changes that must result for a given single-/multi-object change:
+    * PUT layer files inserted by the change.
+    * PUT an index part that has insertions and deletions of the change.
+    * DELETE the layer files that are deleted by the change.
+
+Note that it is safe for the DELETE to be deferred arbitrarily.
+* If it never happens, we leak the object, but, that's not a correctness concern.
+* As of #4938, we don't schedule the remote timeline client operation for deletion immediately, but, only when we drop the `LayerInner`.
+* With the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919), the deletions will be written to deletion queue for processing when it's safe to do so (see the RFC for details).
+
+## How This Solves The Problem
+
+If we crash before we've finished the S3 changes, then timeline load will reset layer map to the state that's in the S3 index part.
+The S3 change sequence above is obviously crash-consistent.
+If we crash before the index part PUT, then we leak the inserted layer files to S3.
+If we crash after the index part PUT, we leak the to-be-DELETEd layer files to S3.
+Leaking is fine, it's a pre-existing condition and not addressed in this RFC.
+
+Multi-object changes that previously created and removed files in timeline dir are now atomic because the layer map updates are atomic and crash consistent:
+* atomic layer map update at runtime, currently by using an RwLock in write mode
+* atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic
+* local timeline dir state:
+  * irrelevant for layer map content => irrelevant for atomic updates / crash consistency
+  * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads neede for them
+  * if we crash before index part PUT, local layer files will be deleted
+
+## Trade-Offs
+
+### Fundamental
+
+If we crash before finishing the index part PUT, we lose all the work that hasn't reached the S3 `index_part.json`:
+* wal ingest: we lose not-yet-uploaded L0s; load on the **safekeepers** + work for pageserver
+* compaction: we lose the entire compaction iteration work; need to re-do it again
+* gc: no change to what we have today
+
+If the work is still deemed necessary after restart, the restarted restarted pageserver will re-do this work.
+The amount of work to be re-do is capped to the lag of S3 changes to the local changes.
+Assuming upload queue allows for unlimited queue depth (that's what it does today), this means:
+* on-demand downloads that were needed to do the work: are likely still present, not lost
+* wal ingest: currently unbounded
+* L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()`
+  * Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M.
+  * In practive, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`.
+* image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))`
+  * I have no intuition how expensive / long-running it is in reality.
+* gc: `update_gc_info`` work (not substantial, AFAIK)
+
+To limit the amount of lost upload work, and ingest work, we can limit the upload queue depth (see suggestions in the next sub-section).
+However, to limit the amount of lost CPU work, we would need a way to make make the compaction/image-layer-generation algorithms interruptible & resumable.
+We aren't there yet, the need for it is tracked by ([#4580](https://github.com/neondatabase/neon/issues/4580)).
+However, this RFC is not constraining the design space either.
+
+### Practical
+
+#### Pageserver Restarts
+
+Pageserver crashes are very rare ; it would likely be acceptable to re-do the lost work in that case.
+However, regular pageserver restart happen frequently, e.g., during weekly deploys.
+
+In general, pageserver restart faces the problem of tenants that "take too long" to shut down.
+They are a problem because other tenants that shut down quickly are unavailble while we wait for the slow tenants to shut down.
+We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file).
+A longer budget would expose tenants that are done early to a longer downtime.
+A short budget would risk throwing away more work that'd have to be re-done after restart.
+
+In the context of this RFC, killing the process would mean losing the work that hasn't made it to S3.
+We can mitigate this problem as follows:
+0. initially, by accepting that we need to do the work again
+1. short-term, introducing measures to cap the amount of in-flight work:
+
+   - cap upload queue length, use backpressure to slow down compaction
+   - disabling compaction/image-layer-generation X minutes before `systemctl restart pageserver`
+   - introducing a read-only shutdown state for tenants that are fast to shut down;
+     that state would be equivalent to the state of a tenant in hot standby / readonly mode.
+
+2. mid term, by not restarting pageserver in place, but using [*seamless tenant migration*](https://github.com/neondatabase/neon/pull/5029) to drain a pageserver's tenants before we restart it.
+
+#### `disk_consistent_lsn` can go backwards
+
+`disk_consistent_lsn` can go backwards across restarts if we crash before we've finished the index part PUT.
+Nobody should care about it, because the only thing that matters is `remote_consistent_lsn`.
+Compute certainly doesn't care about `disk_consistent_lsn`.
+
+
+## Side-Effects Of This Design
+
+* local `metadata` is basically reduced to a cache of which timelines exist for this tenant; i.e., we can avoid a `ListObjects` requests for a tenant's timelines during tenant load.
+
+## Limitations
+
+Multi-object changes that span multiple timelines aren't covered by this RFC.
+That's fine because we currently don't need them, as evidenced by the absence
+of a Pageserver operation that holds multiple timelines' layer map lock at a time.
+
+## Impacted components
+
+Primarily pageservers.
+
+Safekeepers will experience more load when we need to re-ingest WAL because we've thrown away work.
+No changes to safekeepers are needed.
+
+## Alternatives considered
+
+### Alternative 1: WAL
+
+We could have a local WAL for timeline dir changes, as proposed here https://github.com/neondatabase/neon/issues/4418 and partially implemented here https://github.com/neondatabase/neon/pull/4422 .
+The WAL would be used to
+1. make multi-object changes atomic
+2. replace `reconcile_with_remote()` reconciliation: scheduling of layer upload would be part of WAL replay.
+
+The WAL is appealing in a local-first world, but, it's much more complex than the design described above:
+* New on-disk state to get right.
+* Forward- and backward-compatibility development costs in the future.
+
+### Alternative 2: Flow Everything Through `index_part.json`
+
+We could have gone to the other extreme and **only** update the layer map whenever we've PUT `index_part.json`.
+I.e., layer map would always be the last-persisted S3 state.
+That's axiomatically beautiful, not least because it fully separates the layer file production and consumption path (=> [layer file spreading proposal](https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=4)).
+And it might make hot standbys / read-only pageservers less of a special case in the future.
+
+But, I have some uncertainties with regard to WAL ingestion, because it needs to be able to do some reads for the logical size feedback to safekeepers.
+
+And it's silly that we wouldn't be able to use the results of compaction or image layer generation before we're done with the upload.
+
+Lastly, a temporarily clogged-up upload queue (e.g. S3 is down) shouldn't immediately render ingestion unavailable.
+
+### Alternative 3: Sequence Numbers For Layers
+
+Instead of what's proposed in this RFC, we could use unique numbers to identify layer files:
+
+```
+# before
+tenants/$tenant/timelines/$timeline/$key_and_lsn_range
+# after
+tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range
+```
+
+To guarantee uniqueness, the unqiue number is a sequence number, stored in `index_part.json`.
+
+This alternative does not solve atomic layer map updates.
+In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers.
+In fact, this alternative makes it worse because the data is now duplicated in the not-overwritten and overwritten L1 layer files.
+We'd need to write a deduplication pass that checks if perfectly overlapping layers have identical contents.
+
+However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC.
+
+So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3).
+But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more accute.
+The proposed design in this RFC addresses both.
+
+So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top.
+That way, we avoid a phase where the crash-during-compaction problem is accute.
+
+## Related issues
+
+- https://github.com/neondatabase/neon/issues/4749
+- https://github.com/neondatabase/neon/issues/4418
+  - https://github.com/neondatabase/neon/pull/4422
+- https://github.com/neondatabase/neon/issues/5077
+- https://github.com/neondatabase/neon/issues/4088
+  - (re)resolutions:
+    - https://github.com/neondatabase/neon/pull/4696
+    - https://github.com/neondatabase/neon/pull/4094
+      - https://neondb.slack.com/archives/C033QLM5P7D/p1682519017949719
+
+Note that the test case introduced in https://github.com/neondatabase/neon/pull/4696/files#diff-13114949d1deb49ae394405d4c49558adad91150ba8a34004133653a8a5aeb76 will produce L1s with the same logical content, but, as outlined in the last paragraph of the _Problem Statement_ section above, we don't want to make that  assumption in order to fix the problem.
+
+
+## Implementation Plan
+
+1. Remove support for `remote_storage=None`, because we now rely on the existence of an index part.
+
+    - The nasty part here is to fix all the tests that fiddle with the local timeline directory.
+      Possibly they are just irrelevant with this change, but, each case will require inspection.
+
+2. Implement the design above.
+
+    - Initially, ship without the mitigations for restart and accept we will do some work twice.
+    - Measure the impact and implement one of the mitigations.
+
--- a/docs/rfcs/028-pageserver-migration.md
+++ b/docs/rfcs/028-pageserver-migration.md
@@ -0,0 +1,599 @@
+# Seamless tenant migration
+
+- Author: john@neon.tech
+- Created on 2023-08-11
+- Implemented on ..
+
+## Summary
+
+The preceding [generation numbers RFC](025-generation-numbers.md) may be thought of as "making tenant
+migration safe". Following that,
+this RFC is about how those migrations are to be done:
+
+1. Seamlessly (without interruption to client availability)
+2. Quickly (enabling faster operations)
+3. Efficiently (minimizing I/O and $ cost)
+
+These points are in priority order: if we have to sacrifice
+efficiency to make a migration seamless for clients, we will
+do so, etc.
+
+This is accomplished by introducing two high level changes:
+
+- A dual-attached state for tenants, used in a control-plane-orchestrated
+  migration procedure that preserves availability during a migration.
+- Warm secondary locations for tenants, where on-disk content is primed
+  for a fast migration of the tenant from its current attachment to this
+  secondary location.
+
+## Motivation
+
+Migrating tenants between pageservers is essential to operating a service
+at scale, in several contexts:
+
+1. Responding to a pageserver node failure by migrating tenants to other pageservers
+2. Balancing load and capacity across pageservers, for example when a user expands their
+   database and they need to migrate to a pageserver with more capacity.
+3. Restarting pageservers for upgrades and maintenance
+
+The current situation steps for migration are:
+
+- detach from old node; skip if old node is dead; (the [skip part is still WIP](https://github.com/neondatabase/cloud/issues/5426)).
+- attach to new node
+- re-configure endpoints to use the new node
+
+Once [generation numbers](025-generation-numbers.md) are implemented,
+the detach step is no longer critical for correctness. So, we can
+
+- attach to a new node,
+- re-configure endpoints to use the new node, and then
+- detach from the old node.
+
+However, this still does not meet our seamless/fast/efficient goals:
+
+- Not fast: The new node will have to download potentially large amounts
+  of data from S3, which may take many minutes.
+- Not seamless: If we attach to a new pageserver before detaching an old one,
+  the new one might delete some objects that interrupt availability of reads on the old one.
+- Not efficient: the old pageserver will continue uploading
+  S3 content during the migration that will never be read.
+
+The user expectations for availability are:
+
+- For planned maintenance, there should be zero availability
+  gap. This expectation is fulfilled by this RFC.
+- For unplanned changes (e.g. node failures), there should be
+  minimal availability gap. This RFC provides the _mechanism_
+  to fail over quickly, but does not provide the failure _detection_
+  nor failover _policy_.
+
+## Non Goals
+
+- Defining service tiers with different storage strategies: the same
+  level of HA & overhead will apply to all tenants. This doesn't rule out
+  adding such tiers in future.
+- Enabling pageserver failover in the absence of a control plane: the control
+  plane will remain the source of truth for what should be attached where.
+- Totally avoiding availability gaps on unplanned migrations during
+  a failure (we expect a small, bounded window of
+  read unavailability of very recent LSNs)
+- Workload balancing: this RFC defines the mechanism for moving tenants
+  around, not the higher level logic for deciding who goes where.
+- Defining all possible configuration flows for tenants: the migration process
+  defined in this RFC demonstrates the sufficiency of the pageserver API, but
+  is not the only kind of configuration change the control plane will ever do.
+  The APIs defined here should let the control plane move tenants around in
+  whatever way is needed while preserving data safety and read availability.
+
+## Impacted components
+
+Pageserver, control plane
+
+## Terminology
+
+- **Attachment**: a tenant is _attached_ to a pageserver if it has
+  been issued a generation number, and is running an instance of
+  the `Tenant` type, ingesting the WAL, and available to serve
+  page reads.
+- **Location**: locations are a superset of attachments. A location
+  is a combination of a tenant and a pageserver. We may _attach_ at a _location_.
+
+- **Secondary location**: a location which is not currently attached.
+- **Warm secondary location**: a location which is not currently attached, but is endeavoring to maintain a warm local cache of layers. We avoid calling this a _warm standby_ to avoid confusion with similar postgres features.
+
+## Implementation (high level)
+
+### Warm secondary locations
+
+To enable faster migrations, we will identify at least one _secondary location_
+for each tenant. This secondary location will keep a warm cache of layers
+for the tenant, so that if it is later attached, it can catch up with the
+latest LSN quickly: rather than downloading everything, it only has to replay
+the recent part of the WAL to advance from the remote_consistent_offset to the
+most recent LSN in the WAL.
+
+The control plane is responsible for selecting secondary locations, and
+calling into pageservers to configure tenants into a secondary mode at this
+new location, as well as attaching the tenant in its existing primary location.
+
+The attached pageserver for a tenant will publish a [layer heatmap](#layer-heatmap)
+to advise secondaries of which layers should be downloaded.
+
+### Location modes
+
+Currently, we consider a tenant to be in one of two states on a pageserver:
+
+- Attached: active `Tenant` object, and layers on local disk
+- Detached: no layers on local disk, no runtime state.
+
+We will extend this with finer-grained modes, whose purpose will become
+clear in later sections:
+
+- **AttachedSingle**: equivalent the existing attached state.
+- **AttachedMulti**: like AttachedSingle, holds an up to date generation, but
+  does not do deletions.
+- **AttachedStale**: like AttachedSingle, holds a stale generation,
+  do not do any remote storage operations.
+- **Secondary**: keep local state on disk, periodically update from S3.
+- **Detached**: equivalent to existing detached state.
+
+To control these finer grained states, a new pageserver API endpoint will be added.
+
+### Cutover procedure
+
+Define old location and new location as "Node A" and "Node B". Consider
+the case where both nodes are available, and Node B was previously configured
+as a secondary location for the tenant we are migrating.
+
+The cutover procedure is orchestrated by the control plane, calling into
+the pageservers' APIs:
+
+1. Call to Node A requesting it to flush to S3 and enter AttachedStale state
+2. Increment generation, and call to Node B requesting it to enter AttachedMulti
+   state with the new generation.
+3. Call to Node B, requesting it to download the latest hot layers from remote storage,
+   according to the latest heatmap flushed by Node A.
+4. Wait for Node B's WAL ingestion to catch up with node A's
+5. Update endpoints to use node B instead of node A
+6. Call to node B requesting it to enter state AttachedSingle.
+7. Call to node A requesting it to enter state Secondary
+
+The following table summarizes how the state of the system advances:
+
+|     Step      |     Node A     |     Node B     | Node used by endpoints |
+| :-----------: | :------------: | :------------: | :--------------------: |
+| 1 (_initial_) | AttachedSingle |   Secondary    |           A            |
+|       2       | AttachedStale  | AttachedMulti  |           A            |
+|       3       | AttachedStale  | AttachedMulti  |           A            |
+|       4       | AttachedStale  | AttachedMulti  |           A            |
+| 5 (_cutover_) | AttachedStale  | AttachedMulti  |           B            |
+|       6       | AttachedStale  | AttachedSingle |           B            |
+|  7 (_final_)  |   Secondary    | AttachedSingle |           B            |
+
+The procedure described for a clean handover from a live node to a secondary
+is also used for failure cases and for migrations to a location that is not
+configured as a secondary, by simply skipping irrelevant steps, as described in
+the following sections.
+
+#### Migration from an unresponsive node
+
+If node A is unavailable, then all calls into
+node A are skipped and we don't wait for B to catch up before
+switching updating the endpoints to use B.
+
+#### Migration to a location that is not a secondary
+
+If node B is initially in Detached state, the procedure is identical. Since Node B
+is coming from a Detached state rather than Secondary, the download of layers and
+catch up with WAL will take much longer.
+
+We might do this if:
+
+- Attached and secondary locations are both critically low on disk, and we need
+  to migrate to a third node with more resources available.
+- We are migrating a tenant which does not use secondary locations to save on cost.
+
+#### Permanent migration away from a node
+
+In the final step of the migration, we generally request the original node to enter a Secondary
+state. This is typical if we are doing a planned migration during maintenance, or to
+balance CPU/network load away from a node.
+
+One might also want to permanently migrate away: this can be done by simply removing the secondary
+location after the migration is complete, or as an optimization by substituting the Detached state
+for the Secondary state in the final step.
+
+#### Cutover diagram
+
+```mermaid
+sequenceDiagram
+participant CP as Control plane
+participant A as Node A
+participant B as Node B
+participant E as Endpoint
+
+CP->>A: PUT Flush & go to AttachedStale
+note right of A: A continues to ingest WAL
+CP->>B: PUT AttachedMulti
+CP->>B: PUT Download layers from latest heatmap
+note right of B: B downloads from S3
+loop Poll until download complete
+CP->>B: GET download status
+end
+activate B
+note right of B: B ingests WAL
+loop Poll until catch up
+CP->>B: GET visible WAL
+CP->>A: GET visible WAL
+end
+deactivate B
+CP->>E: Configure to use Node B
+E->>B: Connect for reads
+CP->>B: PUT AttachedSingle
+CP->>A: PUT Secondary
+```
+
+#### Cutover from an unavailable pageserver
+
+This case is far simpler: we may skip straight to our intended
+end state.
+
+```mermaid
+sequenceDiagram
+participant A as Node A
+participant CP as Control plane
+participant B as Node B
+participant E as Endpoint
+
+note right of A: Node A offline
+activate A
+CP->>B: PUT AttachedSingle
+CP->>E: Configure to use Node B
+E->>B: Connect for reads
+deactivate A
+```
+
+## Implementation (detail)
+
+### Purpose of AttachedMulti, AttachedStale
+
+#### AttachedMulti
+
+Ordinarily, an attached pageserver whose generation is the latest may delete
+layers at will (e.g. during compaction). If a previous generation pageserver
+is also still attached, and in use by endpoints, then this layer deletion could
+lead to a loss of availability for the endpoint when reading from the previous
+generation pageserver.
+
+The _AttachedMulti_ state simply disables deletions. These will be enqueued
+in `RemoteTimelineClient` until the control plane transitions the
+node into AttachedSingle, which unblocks deletions.  Other remote storage operations
+such as uploads are not blocked.
+
+AttachedMulti is not required for data safety, only to preserve availability
+on pageservers running with stale generations.
+
+A node enters AttachedMulti only when explicitly asked to by the control plane. It should
+only remain in this state for the duration of a migration.
+
+If a control plane bug leaves
+the node in AttachedMulti for a long time, then we must avoid unbounded memory use from enqueued
+deletions. This may be accomplished simply, by dropping enqueued deletions when some modest
+threshold of delayed deletions (e.g. 10k layers per tenant) is reached. As with all deletions,
+it is safe to skip them, and the leaked objects will be eventually cleaned up by scrub or
+by timeline deletion.
+
+During AttachedMulti, the Tenant is free to drop layers from local disk in response to
+disk pressure: only the deletion of remote layers is blocked.
+
+#### AttachedStale
+
+Currently, a pageserver with a stale generation number will continue to
+upload layers, but be prevented from completing deletions. This is safe, but inefficient: layers uploaded by this stale generation
+will not be read back by future generations of pageservers.
+
+The _AttachedStale_ state disables S3 uploads. The stale pageserver
+will continue to ingest the WAL and write layers to local disk, but not to
+do any uploads to S3.
+
+A node may enter AttachedStale in two ways:
+
+- Explicitly, when control plane calls into the node at the start of a migration.
+- Implicitly, when the node tries to validate some deletions and discovers
+  that its generation is stale.
+
+The AttachedStale state also disables sending consumption metrics from
+that location: it is interpreted as an indication that some other pageserver
+is already attached or is about to be attached, and that new pageserver will
+be responsible for sending consumption metrics.
+
+#### Disk Pressure & AttachedStale
+
+Over long periods of time, a tenant location in AttachedStale will accumulate data
+on local disk, as it cannot evict any layers written since it entered the
+AttachStale state. We rely on the control plane to revert the location to
+Secondary or Detached at the end of a migration.
+
+This scenario is particularly noteworthy when evacuating all tenants on a pageserver:
+since _all_ the attached tenants will go into AttachedStale, we will be doing no
+uploads at all, therefore ingested data will cause disk usage to increase continuously.
+Under nominal conditions, the available disk space on pageservers should be sufficient
+to complete the evacuation before this becomes a problem, but we must also handle
+the case where we hit a low disk situation while in this state.
+
+The concept of disk pressure already exists in the pageserver: the `disk_usage_eviction_task`
+touches each Tenant when it determines that a low-disk condition requires
+some layer eviction. Having selected layers for eviction, the eviction
+task calls `Timeline::evict_layers`.
+
+**Safety**: If evict_layers is called while in AttachedStale state, and some of the to-be-evicted
+layers are not yet uploaded to S3, then the block on uploads will be lifted. This
+will result in leaking some objects once a migration is complete, but will enable
+the node to manage its disk space properly: if a node is left with some tenants
+in AttachedStale indefinitely due to a network partition or control plane bug,
+these tenants will not cause a full disk condition.
+
+### Warm secondary updates
+
+#### Layer heatmap
+
+The secondary location's job is to serve reads **with the same quality of service as the original location
+was serving them around the time of a migration**. This does not mean the secondary
+location needs the whole set of layers: inactive layers that might soon
+be evicted on the attached pageserver need not be downloaded by the
+secondary. A totally idle tenant only needs to maintain enough on-disk
+state to enable a fast cold start (i.e. the most recent image layers are
+typically sufficient).
+
+To enable this, we introduce the concept of a _layer heatmap_, which
+acts as an advisory input to secondary locations to decide which
+layers to download from S3.
+
+#### Attached pageserver
+
+The attached pageserver, if in state AttachedSingle, periodically
+uploads a serialized heat map to S3. It may skip this if there
+is no change since the last time it uploaded (e.g. if the tenant
+is totally idle).
+
+Additionally, when the tenant is flushed to remote storage prior to a migration
+(the first step in [cutover procedure](#cutover-procedure)), 
+the heatmap is written out. This enables a future attached pageserver
+to get an up to date view when deciding which layers to download.
+
+#### Secondary location behavior
+
+Secondary warm locations run a simple loop, implemented separately from
+the main `Tenant` type, which represents attached tenants:
+
+- Download the layer heatmap
+- Select any "hot enough" layers to download, if there is sufficient
+  free disk space.
+- Download layers, if they were not previously evicted (see below)
+- Download the latest index_part.json
+- Check if any layers currently on disk are no longer referenced by
+  IndexPart & delete them
+
+Note that the heatmap is only advisory: if a secondary location has plenty
+of disk space, it may choose to retain layers that aren't referenced
+by the heatmap, as long as they are still referenced by the IndexPart. Conversely,
+if a node is very low on disk space, it might opt to raise the heat threshold required
+to both downloading a layer, until more disk space is available.
+
+#### Secondary locations & disk pressure
+
+Secondary locations are subject to eviction on disk pressure, just as
+attached locations are.  For eviction purposes, the access time of a
+layer in a secondary location will be the access time given in the heatmap,
+rather than the literal time at which the local layer file was accessed.
+
+The heatmap will indicate which layers are in local storage on the attached
+location.  The secondary will always attempt to get back to having that
+set of layers on disk, but to avoid flapping, it will remember the access
+time of the layer it was most recently asked to evict, and layers whose
+access time is below that will not be re-downloaded.
+
+The resulting behavior is that after a layer is evicted from a secondary
+location, it is only re-downloaded once the attached pageserver accesses
+the layer and uploads a heatmap reflecting that access time.  On a pageserver
+restart, the secondary location will attempt to download all layers in
+the heatmap again, if they are not on local disk.
+
+This behavior will be slightly different when secondary locations are
+used for "low energy tenants", but that is beyond the scope of this RFC.
+
+### Location configuration API
+
+Currently, the `/tenant/<tenant_id>/config` API defines various
+tunables like compaction settings, which apply to the tenant irrespective
+of which pageserver it is running on.
+
+A new "location config" structure will be introduced, which defines
+configuration which is per-tenant, but local to a particular pageserver,
+such as the attachment mode and whether it is a secondary.
+
+The pageserver will expose a new per-tenant API for setting
+the state: `/tenant/<tenant_id>/location/config`.
+
+Body content:
+
+```
+{
+  state: 'enum{Detached, Secondary, AttachedSingle, AttachedMulti, AttachedStale}',
+  generation: Option<u32>,
+  configuration: `Option<TenantConfig>`
+  flush: bool
+}
+```
+
+Existing `/attach` and `/detach` endpoint will have the same
+behavior as calling `/location/config` with `AttachedSingle` and `Detached`
+states respectively. These endpoints will be deprecated and later
+removed.
+
+The generation attribute is mandatory for entering `AttachedSingle` or
+`AttachedMulti`.
+
+The configuration attribute is mandatory when entering any state other
+than `Detached`. This configuration is the same as the body for
+the existing `/tenant/<tenant_id>/config` endpoint.
+
+The `flush` argument indicates whether the pageservers should flush
+to S3 before proceeding: this only has any effect if the node is
+currently in AttachedSingle or AttachedMulti. This is used
+during the first phase of migration, when transitioning the
+old pageserver to AttachedSingle.
+
+The `/re-attach` API response will be extended to include a `state` as
+well as a `generation`, enabling the pageserver to enter the
+correct state for each tenant on startup.
+
+### Database schema for locations
+
+A new table `ProjectLocation`:
+
+- pageserver_id: int
+- tenant_id: TenantId
+- generation: Option<int>
+- state: `enum(Secondary, AttachedSingle, AttachedMulti)`
+
+Notes:
+
+- It is legacy for a Project to have zero `ProjectLocation`s
+- The `pageserver` column in `Project` now means "to which pageserver should
+  endpoints connect", rather than simply which pageserver is attached.
+- The `generation` column in `Project` remains, and is incremented and used
+  to set the generation of `ProjectLocation` rows when they are set into
+  an attached state.
+- The `Detached` state is implicitly represented as the absence of
+  a `ProjectLocation`.
+
+### Executing migrations
+
+Migrations will be implemented as Go functions, within the
+existing `Operation` framework in the control plane. These
+operations are persistent, such that they will always keep
+trying until completion: this property is important to avoid
+leaving garbage behind on pageservers, such as AttachedStale
+locations.
+
+### Recovery from failures during migration
+
+During migration, the control plane may encounter failures of either
+the original or new pageserver, or both:
+
+- If the original fails, skip past waiting for the new pageserver
+  to catch up, and put it into AttachedSingle immediately.
+- If the new node fails, put the old pageserver into Secondary
+  and then back into AttachedSingle (this has the effect of
+  retaining on-disk state and granting it a fresh generation number).
+- If both nodes fail, keep trying until one of them is available
+  again.
+
+### Control plane -> Pageserver reconciliation
+
+A migration may be done while the old node is unavailable,
+in which case the old node may still be running in an AttachedStale
+state.
+
+In this case, it is undesirable to have the migration `Operation`
+stay alive until the old node eventually comes back online
+and can be cleaned up. To handle this, the control plane
+should run a background reconciliation process to compare
+a pageserver's attachments with the database, and clean up
+any that shouldn't be there any more.
+
+Note that there will be no work to do if the old node was really
+offline, as during startup it will call into `/re-attach` and
+be updated that way. The reconciliation will only be needed
+if the node was unavailable but still running.
+
+## Alternatives considered
+
+### Only enabling secondary locations for tenants on a higher service tier
+
+This will make sense in future, especially for tiny databases that may be
+downloaded from S3 in milliseconds when needed.
+
+However, it is not wise to do it immediately, because pageservers contain
+a mixture of higher and lower tier workloads. If we had 1 tenant with
+a secondary location and 9 without, then those other 9 tenants will do
+a lot of I/O as they try to recover from S3, which may degrade the
+service of the tenant which had a secondary location.
+
+Until we segregate tenant on different service tiers on different pageserver
+nodes, or implement & test QoS to ensure that tenants with secondaries are
+not harmed by tenants without, we should use the same failover approach
+for all the tenants.
+
+### Hot secondary locations (continuous WAL replay)
+
+Instead of secondary locations populating their caches from S3, we could
+have them consume the WAL from safekeepers. The downsides of this would be:
+
+- Double load on safekeepers, which are a less scalable service than S3
+- Secondary locations' on-disk state would end up subtly different to
+  the remote state, which would make synchronizing with S3 more complex/expensive
+  when going into attached state.
+
+The downside of only updating secondary locations from S3 is that we will
+have a delay during migration from replaying the LSN range between what's
+in S3 and what's in the pageserver. This range will be very small on
+planned migrations, as we have the old pageserver flush to S3 immediately
+before attaching the new pageserver. On unplanned migrations (old pageserver
+is unavailable), the range of LSNs to replay is bounded by the flush frequency
+on the old pageserver. However, the migration doesn't have to wait for the
+replay: it's just that not-yet-replayed LSNs will be unavailable for read
+until the new pageserver catches up.
+
+We expect that pageserver reads of the most recent LSNs will be relatively
+rare, as for an active endpoint those pages will usually still be in the postgres
+page cache: this leads us to prefer synchronizing from S3 on secondary
+locations, rather than consuming the WAL from safekeepers.
+
+### Cold secondary locations
+
+It is not functionally necessary to keep warm caches on secondary locations at all. However, if we do not, then
+we would experience a de-facto availability loss in unplanned migrations, as reads to the new node would take an extremely long time (many seconds, perhaps minutes).
+
+Warm caches on secondary locations are necessary to meet
+our availability goals.
+
+### Pageserver-granularity failover
+
+Instead of migrating tenants individually, we could have entire spare nodes,
+and on a node death, move all its work to one of these spares.
+
+This approach is avoided for several reasons:
+
+- we would still need fine-grained tenant migration for other
+  purposes such as balancing load
+- by sharing the spare capacity over many peers rather than one spare node,
+  these peers may use the capacity for other purposes, until it is needed
+  to handle migrated tenants. e.g. for keeping a deeper cache of their
+  attached tenants.
+
+### Readonly during migration
+
+We could simplify migrations by making both previous and new nodes go into a
+readonly state, then flush remote content from the previous node, then activate
+attachment on the secondary node.
+
+The downside to this approach is a potentially large gap in readability of
+recent LSNs while loading data onto the new node. To avoid this, it is worthwhile
+to incur the extra cost of double-replaying the WAL onto old and new nodes' local
+storage during a migration.
+
+### Peer-to-peer pageserver communication
+
+Rather than uploading the heatmap to S3, attached pageservers could make it
+available to peers.
+
+Currently, pageservers have no peer to peer communication, so adding this
+for heatmaps would incur significant overhead in deployment and configuration
+of the service, and ensuring that when a new pageserver is deployed, other
+pageservers are updated to be aware of it.
+
+As well as simplifying implementation, putting heatmaps in S3 will be useful
+for future analytics purposes -- gathering aggregated statistics on activity
+pattersn across many tenants may be done directly from data in S3.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -89,6 +89,8 @@ impl RemoteExtSpec {
        &self,
        ext_name: &str,
        is_library: bool,
+        build_tag: &str,
+        pg_major_version: &str,
    ) -> anyhow::Result<(String, RemotePath)> {
        let mut real_ext_name = ext_name;
        if is_library {
@@ -104,11 +106,32 @@ impl RemoteExtSpec {
                .ok_or(anyhow::anyhow!("library {} is not found", lib_raw_name))?;
        }

+        // Check if extension is present in public or custom.
+        // If not, then it is not allowed to be used by this compute.
+        if let Some(public_extensions) = &self.public_extensions {
+            if !public_extensions.contains(&real_ext_name.to_string()) {
+                if let Some(custom_extensions) = &self.custom_extensions {
+                    if !custom_extensions.contains(&real_ext_name.to_string()) {
+                        return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
+                    }
+                }
+            }
+        }
+
        match self.extension_data.get(real_ext_name) {
-            Some(ext_data) => Ok((
-                real_ext_name.to_string(),
-                RemotePath::from_string(&ext_data.archive_path)?,
-            )),
+            Some(_ext_data) => {
+                // Construct the path to the extension archive
+                // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
+                //
+                // Keep it in sync with path generation in
+                // https://github.com/neondatabase/build-custom-extensions/tree/main
+                let archive_path_str =
+                    format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
+                Ok((
+                    real_ext_name.to_string(),
+                    RemotePath::from_string(&archive_path_str)?,
+                ))
+            }
            None => Err(anyhow::anyhow!(
                "real_ext_name {} is not found",
                real_ext_name
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -3,9 +3,9 @@
 //!
 use chrono::{DateTime, Utc};
 use rand::Rng;
-use serde::Serialize;
+use serde::{Deserialize, Serialize};

-#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
    #[serde(rename = "absolute")]
@@ -27,7 +27,8 @@ impl EventType {
    }

    pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
-        // these can most likely be thought of as Range or RangeFull
+        // these can most likely be thought of as Range or RangeFull, at least pageserver creates
+        // incremental ranges where the stop and next start are equal.
        use EventType::*;
        match self {
            Incremental {
@@ -41,15 +42,25 @@ impl EventType {
    pub fn is_incremental(&self) -> bool {
        matches!(self, EventType::Incremental { .. })
    }
+
+    /// Returns the absolute time, or for incremental ranges, the stop time.
+    pub fn recorded_at(&self) -> &DateTime<Utc> {
+        use EventType::*;
+
+        match self {
+            Absolute { time } => time,
+            Incremental { stop_time, .. } => stop_time,
+        }
+    }
 }

-#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
-pub struct Event<Extra> {
+#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
+pub struct Event<Extra, Metric> {
    #[serde(flatten)]
    #[serde(rename = "type")]
    pub kind: EventType,

-    pub metric: &'static str,
+    pub metric: Metric,
    pub idempotency_key: String,
    pub value: u64,

@@ -58,19 +69,45 @@ pub struct Event<Extra> {
 }

 pub fn idempotency_key(node_id: &str) -> String {
-    format!(
-        "{}-{}-{:04}",
-        Utc::now(),
-        node_id,
-        rand::thread_rng().gen_range(0..=9999)
-    )
+    IdempotencyKey::generate(node_id).to_string()
+}
+
+/// Downstream users will use these to detect upload retries.
+pub struct IdempotencyKey<'a> {
+    now: chrono::DateTime<Utc>,
+    node_id: &'a str,
+    nonce: u16,
+}
+
+impl std::fmt::Display for IdempotencyKey<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}-{}-{:04}", self.now, self.node_id, self.nonce)
+    }
+}
+
+impl<'a> IdempotencyKey<'a> {
+    pub fn generate(node_id: &'a str) -> Self {
+        IdempotencyKey {
+            now: Utc::now(),
+            node_id,
+            nonce: rand::thread_rng().gen_range(0..=9999),
+        }
+    }
+
+    pub fn for_tests(now: DateTime<Utc>, node_id: &'a str, nonce: u16) -> Self {
+        IdempotencyKey {
+            now,
+            node_id,
+            nonce,
+        }
+    }
 }

 pub const CHUNK_SIZE: usize = 1000;

 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
-#[derive(serde::Serialize)]
+#[derive(serde::Serialize, serde::Deserialize)]
 pub struct EventChunk<'a, T: Clone> {
    pub events: std::borrow::Cow<'a, [T]>,
 }
--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
@@ -0,0 +1,52 @@
+//! Types in this file are for pageserver's upward-facing API calls to the control plane,
+//! required for acquiring and validating tenant generation numbers.
+//!
+//! See docs/rfcs/025-generation-numbers.md
+
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+use utils::id::{NodeId, TenantId};
+
+#[derive(Serialize, Deserialize)]
+pub struct ReAttachRequest {
+    pub node_id: NodeId,
+}
+
+#[serde_as]
+#[derive(Serialize, Deserialize)]
+pub struct ReAttachResponseTenant {
+    #[serde_as(as = "DisplayFromStr")]
+    pub id: TenantId,
+    pub generation: u32,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct ReAttachResponse {
+    pub tenants: Vec<ReAttachResponseTenant>,
+}
+
+#[serde_as]
+#[derive(Serialize, Deserialize)]
+pub struct ValidateRequestTenant {
+    #[serde_as(as = "DisplayFromStr")]
+    pub id: TenantId,
+    pub gen: u32,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct ValidateRequest {
+    pub tenants: Vec<ValidateRequestTenant>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct ValidateResponse {
+    pub tenants: Vec<ValidateResponseTenant>,
+}
+
+#[serde_as]
+#[derive(Serialize, Deserialize)]
+pub struct ValidateResponseTenant {
+    #[serde_as(as = "DisplayFromStr")]
+    pub id: TenantId,
+    pub valid: bool,
+}
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,6 +1,7 @@
 use const_format::formatcp;

 /// Public API types
+pub mod control_api;
 pub mod models;
 pub mod reltag;

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -194,10 +194,22 @@ pub struct TimelineCreateRequest {
 pub struct TenantCreateRequest {
    #[serde_as(as = "DisplayFromStr")]
    pub new_tenant_id: TenantId,
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub generation: Option<u32>,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

+#[serde_as]
+#[derive(Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantLoadRequest {
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub generation: Option<u32>,
+}
+
 impl std::ops::Deref for TenantCreateRequest {
    type Target = TenantConfig;

@@ -241,15 +253,6 @@ pub struct StatusResponse {
    pub id: NodeId,
 }

-impl TenantCreateRequest {
-    pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest {
-        TenantCreateRequest {
-            new_tenant_id,
-            config: TenantConfig::default(),
-        }
-    }
-}
-
 #[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
@@ -293,9 +296,11 @@ impl TenantConfigRequest {
    }
 }

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Deserialize)]
 pub struct TenantAttachRequest {
    pub config: TenantAttachConfig,
+    #[serde(default)]
+    pub generation: Option<u32>,
 }

 /// Newtype to enforce deny_unknown_fields on TenantConfig for
@@ -358,8 +363,15 @@ pub struct TimelineInfo {
    pub latest_gc_cutoff_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,
+
+    /// The LSN that we have succesfully uploaded to remote storage
    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
+
+    /// The LSN that we are advertizing to safekeepers
+    #[serde_as(as = "DisplayFromStr")]
+    pub remote_consistent_lsn_visible: Lsn,
+
    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
@@ -376,6 +388,8 @@ pub struct TimelineInfo {
    pub pg_version: u32,

    pub state: TimelineState,
+
+    pub walreceiver_status: String,
 }

 #[derive(Debug, Clone, Serialize)]
--- a/libs/postgres_ffi/README.md
+++ b/libs/postgres_ffi/README.md
@@ -10,9 +10,11 @@ should be auto-generated too, but that's a TODO.
 The PostgreSQL on-disk file format is not portable across different
 CPU architectures and operating systems. It is also subject to change
 in each major PostgreSQL version. Currently, this module supports
-PostgreSQL v14 and v15: bindings and code that depends on them are version-specific.
-This code is organized in modules: `postgres_ffi::v14` and `postgres_ffi::v15`
-Version independend code is explicitly exported into shared `postgres_ffi`.
+PostgreSQL v14, v15 and v16: bindings and code that depends on them are
+version-specific.
+This code is organized in modules `postgres_ffi::v14`, `postgres_ffi::v15` and
+`postgres_ffi::v16`. Version independent code is explicitly exported into
+shared `postgres_ffi`.


 TODO: Currently, there is also some code that deals with WAL records
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
        PathBuf::from("pg_install")
    };

-    for pg_version in &["v14", "v15"] {
+    for pg_version in &["v14", "v15", "v16"] {
        let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
        if pg_install_dir_versioned.is_relative() {
            let cwd = env::current_dir().context("Failed to get current_dir")?;
@@ -125,6 +125,7 @@ fn main() -> anyhow::Result<()> {
            .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
            .allowlist_type("PageHeaderData")
            .allowlist_type("DBState")
+            .allowlist_type("RelMapFile")
            // Because structs are used for serialization, tell bindgen to emit
            // explicit padding fields.
            .explicit_padding(true)
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -51,11 +51,59 @@ macro_rules! for_all_postgres_versions {
    ($macro:tt) => {
        $macro!(v14);
        $macro!(v15);
+        $macro!(v16);
    };
 }

 for_all_postgres_versions! { postgres_ffi }

+/// dispatch_pgversion
+///
+/// Run a code block in a context where the postgres_ffi bindings for a
+/// specific (supported) PostgreSQL version are `use`-ed in scope under the pgv
+/// identifier.
+/// If the provided pg_version is not supported, we panic!(), unless the
+/// optional third argument was provided (in which case that code will provide
+/// the default handling instead).
+///
+/// Use like
+///
+/// dispatch_pgversion!(my_pgversion, { pgv::constants::XLOG_DBASE_CREATE })
+/// dispatch_pgversion!(my_pgversion, pgv::constants::XLOG_DBASE_CREATE)
+///
+/// Other uses are for macro-internal purposes only and strictly unsupported.
+///
+#[macro_export]
+macro_rules! dispatch_pgversion {
+    ($version:expr, $code:expr) => {
+        dispatch_pgversion!($version, $code, panic!("Unknown PostgreSQL version {}", $version))
+    };
+    ($version:expr, $code:expr, $invalid_pgver_handling:expr) => {
+        dispatch_pgversion!(
+            $version => $code,
+            default = $invalid_pgver_handling,
+            pgversions = [
+                14 : v14,
+                15 : v15,
+                16 : v16,
+            ]
+        )
+    };
+    ($pgversion:expr => $code:expr,
+     default = $default:expr,
+     pgversions = [$($sv:literal : $vsv:ident),+ $(,)?]) => {
+        match ($pgversion) {
+            $($sv => {
+                use $crate::$vsv as pgv;
+                $code
+            },)+
+            _ => {
+                $default
+            }
+        }
+    };
+}
+
 pub mod pg_constants;
 pub mod relfile_utils;

@@ -90,13 +138,7 @@ pub use v14::xlog_utils::XLogFileName;
 pub use v14::bindings::DBState_DB_SHUTDOWNED;

 pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
-    match version {
-        14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0),
-        15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
-            || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0
-            || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0),
-        _ => anyhow::bail!("Unknown version {}", version),
-    }
+    dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info)))
 }

 pub fn generate_wal_segment(
@@ -107,11 +149,11 @@ pub fn generate_wal_segment(
 ) -> Result<Bytes, SerializeError> {
    assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE));

-    match pg_version {
-        14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
-        15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
-        _ => Err(SerializeError::BadInput),
-    }
+    dispatch_pgversion!(
+        pg_version,
+        pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn),
+        Err(SerializeError::BadInput)
+    )
 }

 pub fn generate_pg_control(
@@ -120,11 +162,11 @@ pub fn generate_pg_control(
    lsn: Lsn,
    pg_version: u32,
 ) -> anyhow::Result<(Bytes, u64)> {
-    match pg_version {
-        14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
-        15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
-        _ => anyhow::bail!("Unknown version {}", pg_version),
-    }
+    dispatch_pgversion!(
+        pg_version,
+        pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
+        anyhow::bail!("Unknown version {}", pg_version)
+    )
 }

 // PG timeline is always 1, changing it doesn't have any useful meaning in Neon.
@@ -196,8 +238,6 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
 }

 pub mod waldecoder {
-
-    use crate::{v14, v15};
    use bytes::{Buf, Bytes, BytesMut};
    use std::num::NonZeroU32;
    use thiserror::Error;
@@ -248,22 +288,17 @@ pub mod waldecoder {
        }

        pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
-            match self.pg_version {
-                // This is a trick to support both versions simultaneously.
-                // See WalStreamDecoderHandler comments.
-                14 => {
-                    use self::v14::waldecoder_handler::WalStreamDecoderHandler;
+            dispatch_pgversion!(
+                self.pg_version,
+                {
+                    use pgv::waldecoder_handler::WalStreamDecoderHandler;
                    self.poll_decode_internal()
-                }
-                15 => {
-                    use self::v15::waldecoder_handler::WalStreamDecoderHandler;
-                    self.poll_decode_internal()
-                }
-                _ => Err(WalDecodeError {
+                },
+                Err(WalDecodeError {
                    msg: format!("Unknown version {}", self.pg_version),
                    lsn: self.lsn,
-                }),
-            }
+                })
+            )
        }
    }
 }
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -137,9 +137,12 @@ pub const XLOG_HEAP_INSERT: u8 = 0x00;
 pub const XLOG_HEAP_DELETE: u8 = 0x10;
 pub const XLOG_HEAP_UPDATE: u8 = 0x20;
 pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
+pub const XLOG_HEAP_LOCK: u8 = 0x60;
 pub const XLOG_HEAP_INIT_PAGE: u8 = 0x80;
 pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
 pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
+pub const XLOG_HEAP2_LOCK_UPDATED: u8 = 0x60;
+pub const XLH_LOCK_ALL_FROZEN_CLEARED: u8 = 0x01;
 pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
 pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
@@ -163,6 +166,20 @@ pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
 pub const RM_LOGICALMSG_ID: u8 = 21;

+// from neon_rmgr.h
+pub const RM_NEON_ID: u8 = 134;
+
+pub const XLOG_NEON_HEAP_INIT_PAGE: u8 = 0x80;
+
+pub const XLOG_NEON_HEAP_INSERT: u8 = 0x00;
+pub const XLOG_NEON_HEAP_DELETE: u8 = 0x10;
+pub const XLOG_NEON_HEAP_UPDATE: u8 = 0x20;
+pub const XLOG_NEON_HEAP_HOT_UPDATE: u8 = 0x30;
+pub const XLOG_NEON_HEAP_LOCK: u8 = 0x40;
+pub const XLOG_NEON_HEAP_MULTI_INSERT: u8 = 0x50;
+
+pub const XLOG_NEON_HEAP_VISIBLE: u8 = 0x40;
+
 // from xlogreader.h
 pub const XLR_INFO_MASK: u8 = 0x0F;
 pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
--- a/libs/postgres_ffi/src/pg_constants_v14.rs
+++ b/libs/postgres_ffi/src/pg_constants_v14.rs
@@ -3,3 +3,8 @@ pub const XLOG_DBASE_DROP: u8 = 0x10;

 pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
 pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
+pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
+
+pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
+    (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0
+}
--- a/libs/postgres_ffi/src/pg_constants_v15.rs
+++ b/libs/postgres_ffi/src/pg_constants_v15.rs
@@ -1,10 +1,18 @@
 pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;

 pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
-pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00;
+pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
 pub const XLOG_DBASE_DROP: u8 = 0x20;

 pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
 pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
 pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
 pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
+
+pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
+
+pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
+    const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
+
+    (bimg_info & ANY_COMPRESS_FLAG) != 0
+}
--- a/libs/postgres_ffi/src/pg_constants_v16.rs
+++ b/libs/postgres_ffi/src/pg_constants_v16.rs
@@ -0,0 +1,18 @@
+pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;
+
+pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
+pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
+pub const XLOG_DBASE_DROP: u8 = 0x20;
+
+pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
+pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
+pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
+pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
+
+pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */
+
+pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
+    const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
+
+    (bimg_info & ANY_COMPRESS_FLAG) != 0
+}
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -49,9 +49,9 @@ impl Conf {
    pub fn pg_distrib_dir(&self) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

+        #[allow(clippy::manual_range_patterns)]
        match self.pg_version {
-            14 => Ok(path.join(format!("v{}", self.pg_version))),
-            15 => Ok(path.join(format!("v{}", self.pg_version))),
+            14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))),
            _ => bail!("Unsupported postgres version: {}", self.pg_version),
        }
    }
@@ -250,11 +250,18 @@ fn craft_internal<C: postgres::GenericClient>(
    let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
    let last_lsn = match last_lsn {
        None => client.pg_current_wal_insert_lsn()?,
-        Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
-            Ordering::Less => bail!("Some records were inserted after the crafted WAL"),
-            Ordering::Equal => last_lsn,
-            Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
-        },
+        Some(last_lsn) => {
+            let insert_lsn = client.pg_current_wal_insert_lsn()?;
+            match last_lsn.cmp(&insert_lsn) {
+                Ordering::Less => bail!(
+                    "Some records were inserted after the crafted WAL: {} vs {}",
+                    last_lsn,
+                    insert_lsn
+                ),
+                Ordering::Equal => last_lsn,
+                Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
+            }
+        }
    };
    if !intermediate_lsns.starts_with(&[initial_lsn]) {
        intermediate_lsns.insert(0, initial_lsn);
@@ -363,8 +370,9 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
        );
        ensure!(
            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
-            "XLOG_SWITCH message ended not on page boundary: {}",
-            after_xlog_switch
+            "XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
+            after_xlog_switch,
+            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
        );
        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
    }
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -959,7 +959,7 @@ mod tests {
        let make_params = |options| StartupMessageParams::new([("options", options)]);

        let params = StartupMessageParams::new([]);
-        assert!(matches!(params.options_escaped(), None));
+        assert!(params.options_escaped().is_none());

        let params = make_params("");
        assert!(split_options(&params).is_empty());
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -29,3 +29,4 @@ workspace_hack.workspace = true
 [dev-dependencies]
 tempfile.workspace = true
 test-context.workspace = true
+rand.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -20,6 +20,7 @@ use std::{

 use anyhow::{bail, Context};

+use serde::{Deserialize, Serialize};
 use tokio::io;
 use toml_edit::Item;
 use tracing::info;
@@ -42,6 +43,9 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;

+/// As defined in S3 docs
+pub const MAX_KEYS_PER_DELETE: usize = 1000;
+
 const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';

 /// Path on the remote storage, relative to some inner prefix.
@@ -50,6 +54,25 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct RemotePath(PathBuf);

+impl Serialize for RemotePath {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        serializer.collect_str(self)
+    }
+}
+
+impl<'de> Deserialize<'de> for RemotePath {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let str = String::deserialize(deserializer)?;
+        Ok(Self(PathBuf::from(&str)))
+    }
+}
+
 impl std::fmt::Display for RemotePath {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0.display())
@@ -88,6 +111,10 @@ impl RemotePath {
    pub fn extension(&self) -> Option<&str> {
        self.0.extension()?.to_str()
    }
+
+    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> {
+        self.0.strip_prefix(&p.0)
+    }
 }

 /// Storage (potentially remote) API to manage its state.
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -148,21 +148,55 @@ impl RemoteStorage for LocalFs {
            Some(folder) => folder.with_base(&self.storage_root),
            None => self.storage_root.clone(),
        };
-        let mut files = vec![];
-        let mut directory_queue = vec![full_path.clone()];

+        // If we were given a directory, we may use it as our starting point.
+        // Otherwise, we must go up to the parent directory.  This is because
+        // S3 object list prefixes can be arbitrary strings, but when reading
+        // the local filesystem we need a directory to start calling read_dir on.
+        let mut initial_dir = full_path.clone();
+        match fs::metadata(full_path.clone()).await {
+            Ok(meta) => {
+                if !meta.is_dir() {
+                    // It's not a directory: strip back to the parent
+                    initial_dir.pop();
+                }
+            }
+            Err(e) if e.kind() == ErrorKind::NotFound => {
+                // It's not a file that exists: strip the prefix back to the parent directory
+                initial_dir.pop();
+            }
+            Err(e) => {
+                // Unexpected I/O error
+                anyhow::bail!(e)
+            }
+        }
+
+        // Note that PathBuf starts_with only considers full path segments, but
+        // object prefixes are arbitrary strings, so we need the strings for doing
+        // starts_with later.
+        let prefix = full_path.to_string_lossy();
+
+        let mut files = vec![];
+        let mut directory_queue = vec![initial_dir.clone()];
        while let Some(cur_folder) = directory_queue.pop() {
            let mut entries = fs::read_dir(cur_folder.clone()).await?;
            while let Some(entry) = entries.next_entry().await? {
                let file_name: PathBuf = entry.file_name().into();
                let full_file_name = cur_folder.clone().join(&file_name);
-                let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
-                files.push(file_remote_path.clone());
-                if full_file_name.is_dir() {
-                    directory_queue.push(full_file_name);
+                if full_file_name
+                    .to_str()
+                    .map(|s| s.starts_with(prefix.as_ref()))
+                    .unwrap_or(false)
+                {
+                    let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
+                    files.push(file_remote_path.clone());
+                    if full_file_name.is_dir() {
+                        directory_queue.push(full_file_name);
+                    }
                }
            }
        }
+
        Ok(files)
    }

--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -33,11 +33,10 @@ use tracing::debug;

 use super::StorageMetadata;
 use crate::{
-    Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

-const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
-
 pub(super) mod metrics;

 use self::metrics::{AttemptOutcome, RequestKind};
@@ -500,7 +499,7 @@ impl RemoteStorage for S3Bucket {
            delete_objects.push(obj_id);
        }

-        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
+        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
            let started_at = start_measuring_requests(kind);

            let resp = self
@@ -573,7 +572,7 @@ mod tests {

    #[test]
    fn relative_path() {
-        let all_paths = vec!["", "some/path", "some/path/"];
+        let all_paths = ["", "some/path", "some/path/"];
        let all_paths: Vec<RemotePath> = all_paths
            .iter()
            .map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -378,21 +378,30 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
 fn create_s3_client(
    max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
+    use rand::Rng;
+
    let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
    let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
        .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
-    let random_prefix_part = std::time::SystemTime::now()
+
+    // due to how time works, we've had test runners use the same nanos as bucket prefixes.
+    // millis is just a debugging aid for easier finding the prefix later.
+    let millis = std::time::SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .context("random s3 test prefix part calculation")?
-        .as_nanos();
+        .as_millis();
+
+    // because nanos can be the same for two threads so can millis, add randomness
+    let random = rand::thread_rng().gen::<u32>();
+
    let remote_storage_config = RemoteStorageConfig {
        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
-            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
+            prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response,
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -31,6 +31,8 @@ fn lsn_invalid() -> Lsn {
 #[serde_as]
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct SkTimelineInfo {
+    /// Term.
+    pub term: Option<u64>,
    /// Term of the last entry.
    pub last_log_term: Option<u64>,
    /// LSN of the last record.
@@ -58,4 +60,6 @@ pub struct SkTimelineInfo {
    /// A connection string to use for WAL receiving.
    #[serde(default)]
    pub safekeeper_connstr: Option<String>,
+    #[serde(default)]
+    pub http_connstr: Option<String>,
 }
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -26,6 +26,7 @@ serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
@@ -37,6 +38,7 @@ url.workspace = true
 uuid.workspace = true

 pq_proto.workspace = true
+postgres_connection.workspace = true
 metrics.workspace = true
 workspace_hack.workspace = true

--- a/libs/utils/scripts/restore_from_wal.sh
+++ b/libs/utils/scripts/restore_from_wal.sh
@@ -9,11 +9,12 @@ PORT=$4
 SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
 rm -fr "$DATA_DIR"
 env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
-echo port="$PORT" >> "$DATA_DIR"/postgresql.conf
+echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
+echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
 REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
 declare -i WAL_SIZE=$REDO_POS+114
-"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start
-"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
 cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
 cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
 for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -1,18 +1,31 @@
 use std::fmt::{Debug, Display};

 use futures::Future;
+use tokio_util::sync::CancellationToken;

 pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
 pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;

-pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
+pub async fn exponential_backoff(
+    n: u32,
+    base_increment: f64,
+    max_seconds: f64,
+    cancel: &CancellationToken,
+) {
    let backoff_duration_seconds =
        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
    if backoff_duration_seconds > 0.0 {
        tracing::info!(
            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
        );
-        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
+
+        drop(
+            tokio::time::timeout(
+                std::time::Duration::from_secs_f64(backoff_duration_seconds),
+                cancel.cancelled(),
+            )
+            .await,
+        )
    }
 }

@@ -24,28 +37,57 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
    }
 }

+/// Configure cancellation for a retried operation: when to cancel (the token), and
+/// what kind of error to return on cancellation
+pub struct Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    token: CancellationToken,
+    on_cancel: CF,
+}
+
+impl<E, CF> Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
+        Self { token, on_cancel }
+    }
+}
+
 /// retries passed operation until one of the following conditions are met:
 /// Encountered error is considered as permanent (non-retryable)
 /// Retries have been exhausted.
 /// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
 /// When attempts cross `warn_threshold` function starts to emit log warnings.
 /// `description` argument is added to log messages. Its value should identify the `op` is doing
-pub async fn retry<T, O, F, E>(
+/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken
+/// to drop out promptly on shutdown.
+pub async fn retry<T, O, F, E, CF>(
    mut op: O,
    is_permanent: impl Fn(&E) -> bool,
    warn_threshold: u32,
    max_retries: u32,
    description: &str,
+    cancel: Cancel<E, CF>,
 ) -> Result<T, E>
 where
    // Not std::error::Error because anyhow::Error doesnt implement it.
    // For context see https://github.com/dtolnay/anyhow/issues/63
-    E: Display + Debug,
+    E: Display + Debug + 'static,
    O: FnMut() -> F,
    F: Future<Output = Result<T, E>>,
+    CF: Fn() -> E,
 {
    let mut attempts = 0;
    loop {
+        if cancel.token.is_cancelled() {
+            return Err((cancel.on_cancel)());
+        }
+
        let result = op().await;
        match result {
            Ok(_) => {
@@ -80,6 +122,7 @@ where
            attempts,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
+            &cancel.token,
        )
        .await;
        attempts += 1;
@@ -132,6 +175,7 @@ mod tests {
            1,
            1,
            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await;

@@ -157,6 +201,7 @@ mod tests {
            2,
            2,
            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await
        .unwrap();
@@ -179,6 +224,7 @@ mod tests {
            2,
            2,
            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await
        .unwrap_err();
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -0,0 +1,154 @@
+use std::fmt::Debug;
+
+use serde::{Deserialize, Serialize};
+
+/// Tenant generations are used to provide split-brain safety and allow
+/// multiple pageservers to attach the same tenant concurrently.
+///
+/// See docs/rfcs/025-generation-numbers.md for detail on how generation
+/// numbers are used.
+#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
+pub enum Generation {
+    // Generations with this magic value will not add a suffix to S3 keys, and will not
+    // be included in persisted index_part.json.  This value is only to be used
+    // during migration from pre-generation metadata to generation-aware metadata,
+    // and should eventually go away.
+    //
+    // A special Generation is used rather than always wrapping Generation in an Option,
+    // so that code handling generations doesn't have to be aware of the legacy
+    // case everywhere it touches a generation.
+    None,
+    // Generations with this magic value may never be used to construct S3 keys:
+    // we will panic if someone tries to.  This is for Tenants in the "Broken" state,
+    // so that we can satisfy their constructor with a Generation without risking
+    // a code bug using it in an S3 write (broken tenants should never write)
+    Broken,
+    Valid(u32),
+}
+
+/// The Generation type represents a number associated with a Tenant, which
+/// increments every time the tenant is attached to a new pageserver, or
+/// an attached pageserver restarts.
+///
+/// It is included as a suffix in S3 keys, as a protection against split-brain
+/// scenarios where pageservers might otherwise issue conflicting writes to
+/// remote storage
+impl Generation {
+    /// Create a new Generation that represents a legacy key format with
+    /// no generation suffix
+    pub fn none() -> Self {
+        Self::None
+    }
+
+    // Create a new generation that will panic if you try to use get_suffix
+    pub fn broken() -> Self {
+        Self::Broken
+    }
+
+    pub fn new(v: u32) -> Self {
+        Self::Valid(v)
+    }
+
+    pub fn is_none(&self) -> bool {
+        matches!(self, Self::None)
+    }
+
+    #[track_caller]
+    pub fn get_suffix(&self) -> String {
+        match self {
+            Self::Valid(v) => {
+                format!("-{:08x}", v)
+            }
+            Self::None => "".into(),
+            Self::Broken => {
+                panic!("Tried to use a broken generation");
+            }
+        }
+    }
+
+    /// `suffix` is the part after "-" in a key
+    ///
+    /// Returns None if parsing was unsuccessful
+    pub fn parse_suffix(suffix: &str) -> Option<Generation> {
+        u32::from_str_radix(suffix, 16).map(Generation::new).ok()
+    }
+
+    #[track_caller]
+    pub fn previous(&self) -> Generation {
+        match self {
+            Self::Valid(n) => {
+                if *n == 0 {
+                    // Since a tenant may be upgraded from a pre-generations state, interpret the "previous" generation
+                    // to 0 as being "no generation".
+                    Self::None
+                } else {
+                    Self::Valid(n - 1)
+                }
+            }
+            Self::None => Self::None,
+            Self::Broken => panic!("Attempted to use a broken generation"),
+        }
+    }
+
+    pub fn next(&self) -> Generation {
+        match self {
+            Self::Valid(n) => Self::Valid(*n + 1),
+            Self::None => Self::Valid(1),
+            Self::Broken => panic!("Attempted to use a broken generation"),
+        }
+    }
+
+    pub fn into(self) -> Option<u32> {
+        if let Self::Valid(v) = self {
+            Some(v)
+        } else {
+            None
+        }
+    }
+}
+
+impl Serialize for Generation {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if let Self::Valid(v) = self {
+            v.serialize(serializer)
+        } else {
+            // We should never be asked to serialize a None or Broken.  Structures
+            // that include an optional generation should convert None to an
+            // Option<Generation>::None
+            Err(serde::ser::Error::custom(
+                "Tried to serialize invalid generation ({self})",
+            ))
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for Generation {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        Ok(Self::Valid(u32::deserialize(deserializer)?))
+    }
+}
+
+// We intentionally do not implement Display for Generation, to reduce the
+// risk of a bug where the generation is used in a format!() string directly
+// instead of using get_suffix().
+impl Debug for Generation {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Valid(v) => {
+                write!(f, "{:08x}", v)
+            }
+            Self::None => {
+                write!(f, "<none>")
+            }
+            Self::Broken => {
+                write!(f, "<broken>")
+            }
+        }
+    }
+}
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -24,6 +24,9 @@ pub enum ApiError {
    #[error("Precondition failed: {0}")]
    PreconditionFailed(Box<str>),

+    #[error("Shutting down")]
+    ShuttingDown,
+
    #[error(transparent)]
    InternalServerError(anyhow::Error),
 }
@@ -52,6 +55,10 @@ impl ApiError {
                self.to_string(),
                StatusCode::PRECONDITION_FAILED,
            ),
+            ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
+                "Shutting down".to_string(),
+                StatusCode::SERVICE_UNAVAILABLE,
+            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -27,6 +27,9 @@ pub mod id;
 // http endpoint utils
 pub mod http;

+// definition of the Generation type for pageserver attachment APIs
+pub mod generation;
+
 // common log initialisation routine
 pub mod logging;

@@ -58,6 +61,8 @@ pub mod serde_regex;

 pub mod pageserver_feedback;

+pub mod postgres_client;
+
 pub mod tracing_span_assert;

 pub mod rate_limit;
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -216,6 +216,24 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> {
    }
 }

+/// When you will store a secret but want to make sure it won't
+/// be accidentally logged, wrap it in a SecretString, whose Debug
+/// implementation does not expose the contents.
+#[derive(Clone, Eq, PartialEq)]
+pub struct SecretString(String);
+
+impl SecretString {
+    pub fn get_contents(&self) -> &str {
+        self.0.as_str()
+    }
+}
+
+impl std::fmt::Debug for SecretString {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[SECRET]")
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use metrics::{core::Opts, IntCounterVec};
--- a/libs/utils/src/postgres_client.rs
+++ b/libs/utils/src/postgres_client.rs
@@ -0,0 +1,37 @@
+//! Postgres client connection code common to other crates (safekeeper and
+//! pageserver) which depends on tenant/timeline ids and thus not fitting into
+//! postgres_connection crate.
+
+use anyhow::Context;
+use postgres_connection::{parse_host_port, PgConnectionConfig};
+
+use crate::id::TenantTimelineId;
+
+/// Create client config for fetching WAL from safekeeper on particular timeline.
+/// listen_pg_addr_str is in form host:\[port\].
+pub fn wal_stream_connection_config(
+    TenantTimelineId {
+        tenant_id,
+        timeline_id,
+    }: TenantTimelineId,
+    listen_pg_addr_str: &str,
+    auth_token: Option<&str>,
+    availability_zone: Option<&str>,
+) -> anyhow::Result<PgConnectionConfig> {
+    let (host, port) =
+        parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
+    let port = port.unwrap_or(5432);
+    let mut connstr = PgConnectionConfig::new_host_port(host, port)
+        .extend_options([
+            "-c".to_owned(),
+            format!("timeline_id={}", timeline_id),
+            format!("tenant_id={}", tenant_id),
+        ])
+        .set_password(auth_token.map(|s| s.to_owned()));
+
+    if let Some(availability_zone) = availability_zone {
+        connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
+    }
+
+    Ok(connstr)
+}
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -0,0 +1,31 @@
+[package]
+name = "vm_monitor"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[[bin]]
+name = "vm-monitor"
+path = "./src/bin/monitor.rs"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow.workspace = true
+axum.workspace = true
+clap.workspace = true
+futures.workspace = true
+inotify.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+sysinfo.workspace = true
+tokio.workspace = true
+tokio-postgres.workspace = true
+tokio-stream.workspace = true
+tokio-util.workspace = true
+tracing.workspace = true
+tracing-subscriber.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+[target.'cfg(target_os = "linux")'.dependencies]
+cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/README.md
+++ b/libs/vm_monitor/README.md
@@ -0,0 +1,34 @@
+# `vm-monitor`
+
+The `vm-monitor` (or just monitor) is a core component of the autoscaling system,
+along with the `autoscale-scheduler` and the `autoscaler-agent`s. The monitor has
+two primary roles: 1) notifying agents when immediate upscaling is necessary due
+to memory conditions and 2) managing Postgres' file cache and a cgroup to carry
+out upscaling and downscaling decisions.
+
+## More on scaling
+
+We scale CPU and memory using NeonVM, our in-house QEMU tool for use with Kubernetes.
+To control thresholds for receiving memory usage notifications, we start Postgres
+in the `neon-postgres` cgroup and set its `memory.{max,high}`.
+
+* See also: [`neondatabase/autoscaling`](https://github.com/neondatabase/autoscaling/)
+* See also: [`neondatabase/vm-monitor`](https://github.com/neondatabase/vm-monitor/),
+where initial development of the monitor happened. The repository is no longer
+maintained but the commit history may be useful for debugging.
+
+## Structure
+
+The `vm-monitor` is loosely comprised of a few systems. These are:
+* the server: this is just a simple `axum` server that accepts requests and
+upgrades them to websocket connections. The server only allows one connection at
+a time. This means that upon receiving a new connection, the server will terminate
+and old one if it exists.
+* the filecache: a struct that allows communication with the Postgres file cache.
+On startup, we connect to the filecache and hold on to the connection for the
+entire monitor lifetime.
+* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
+listening for `memory.high` events and setting its `memory.{high,max}` values.
+* the runner: the runner marries the filecache and cgroup watcher together,
+communicating with the agent throught the `Dispatcher`, and then calling filecache
+and cgroup watcher functions as needed to upscale and downscale
--- a/libs/vm_monitor/src/bin/monitor.rs
+++ b/libs/vm_monitor/src/bin/monitor.rs
@@ -0,0 +1,33 @@
+// We expose a standalone binary _and_ start the monitor in `compute_ctl` so that
+// we can test the monitor as part of the entire autoscaling system in
+// neondatabase/autoscaling.
+//
+// The monitor was previously started by vm-builder, and for testing purposes,
+// we can mimic that setup with this binary.
+
+#[cfg(target_os = "linux")]
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    use clap::Parser;
+    use tokio_util::sync::CancellationToken;
+    use tracing_subscriber::EnvFilter;
+    use vm_monitor::Args;
+
+    let subscriber = tracing_subscriber::fmt::Subscriber::builder()
+        .json()
+        .with_file(true)
+        .with_line_number(true)
+        .with_span_list(true)
+        .with_env_filter(EnvFilter::from_default_env())
+        .finish();
+    tracing::subscriber::set_global_default(subscriber)?;
+
+    let args: &'static Args = Box::leak(Box::new(Args::parse()));
+    let token = CancellationToken::new();
+    vm_monitor::start(args, token).await
+}
+
+#[cfg(not(target_os = "linux"))]
+fn main() {
+    panic!("the monitor requires cgroups, which are only available on linux")
+}
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -0,0 +1,654 @@
+use std::{
+    fmt::{Debug, Display},
+    fs,
+    pin::pin,
+    sync::atomic::{AtomicU64, Ordering},
+};
+
+use anyhow::{anyhow, bail, Context};
+use cgroups_rs::{
+    freezer::FreezerController,
+    hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
+    memory::MemController,
+    MaxValue,
+    Subsystem::{Freezer, Mem},
+};
+use inotify::{EventStream, Inotify, WatchMask};
+use tokio::sync::mpsc::{self, error::TryRecvError};
+use tokio::time::{Duration, Instant};
+use tokio_stream::{Stream, StreamExt};
+use tracing::{info, warn};
+
+use crate::protocol::Resources;
+use crate::MiB;
+
+/// Monotonically increasing counter of the number of memory.high events
+/// the cgroup has experienced.
+///
+/// We use this to determine if a modification to the `memory.events` file actually
+/// changed the `high` field. If not, we don't care about the change. When we
+/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
+/// to see if it changed since last time.
+pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
+
+/// Monotonically increasing counter that gives each cgroup event a unique id.
+///
+/// This allows us to answer questions like "did this upscale arrive before this
+/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
+/// with a sequence number. As such, prefer to used the `Sequenced` type rather
+/// than this static directly.
+static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
+
+/// A memory event type reported in memory.events.
+#[derive(Debug, Eq, PartialEq, Copy, Clone)]
+pub enum MemoryEvent {
+    Low,
+    High,
+    Max,
+    Oom,
+    OomKill,
+    OomGroupKill,
+}
+
+impl MemoryEvent {
+    fn as_str(&self) -> &str {
+        match self {
+            MemoryEvent::Low => "low",
+            MemoryEvent::High => "high",
+            MemoryEvent::Max => "max",
+            MemoryEvent::Oom => "oom",
+            MemoryEvent::OomKill => "oom_kill",
+            MemoryEvent::OomGroupKill => "oom_group_kill",
+        }
+    }
+}
+
+impl Display for MemoryEvent {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+/// Configuration for a `CgroupWatcher`
+#[derive(Debug, Clone)]
+pub struct Config {
+    // The target difference between the total memory reserved for the cgroup
+    // and the value of the cgroup's memory.high.
+    //
+    // In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
+    // use (equal to system memory, minus whatever's taken out for the file cache).
+    oom_buffer_bytes: u64,
+
+    // The amount of memory, in bytes, below a proposed new value for
+    // memory.high that the cgroup's memory usage must be for us to downscale
+    //
+    // In other words, we can downscale only when:
+    //
+    //   memory.current + memory_high_buffer_bytes < (proposed) memory.high
+    //
+    // TODO: there's some minor issues with this approach -- in particular, that we might have
+    // memory in use by the kernel's page cache that we're actually ok with getting rid of.
+    pub(crate) memory_high_buffer_bytes: u64,
+
+    // The maximum duration, in milliseconds, that we're allowed to pause
+    // the cgroup for while waiting for the autoscaler-agent to upscale us
+    max_upscale_wait: Duration,
+
+    // The required minimum time, in milliseconds, that we must wait before re-freezing
+    // the cgroup while waiting for the autoscaler-agent to upscale us.
+    do_not_freeze_more_often_than: Duration,
+
+    // The amount of memory, in bytes, that we should periodically increase memory.high
+    // by while waiting for the autoscaler-agent to upscale us.
+    //
+    // This exists to avoid the excessive throttling that happens when a cgroup is above its
+    // memory.high for too long. See more here:
+    // https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
+    memory_high_increase_by_bytes: u64,
+
+    // The period, in milliseconds, at which we should repeatedly increase the value
+    // of the cgroup's memory.high while we're waiting on upscaling and memory.high
+    // is still being hit.
+    //
+    // Technically speaking, this actually serves as a rate limit to moderate responding to
+    // memory.high events, but these are roughly equivalent if the process is still allocating
+    // memory.
+    memory_high_increase_every: Duration,
+}
+
+impl Config {
+    /// Calculate the new value for the cgroups memory.high based on system memory
+    pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
+        total_system_mem.saturating_sub(self.oom_buffer_bytes)
+    }
+}
+
+impl Default for Config {
+    fn default() -> Self {
+        Self {
+            oom_buffer_bytes: 100 * MiB,
+            memory_high_buffer_bytes: 100 * MiB,
+            // while waiting for upscale, don't freeze for more than 20ms every 1s
+            max_upscale_wait: Duration::from_millis(20),
+            do_not_freeze_more_often_than: Duration::from_millis(1000),
+            // while waiting for upscale, increase memory.high by 10MiB every 25ms
+            memory_high_increase_by_bytes: 10 * MiB,
+            memory_high_increase_every: Duration::from_millis(25),
+        }
+    }
+}
+
+/// Used to represent data that is associated with a certain point in time, such
+/// as an upscale request or memory.high event.
+///
+/// Internally, creating a `Sequenced` uses a static atomic counter to obtain
+/// a unique sequence number. Sequence numbers are monotonically increasing,
+/// allowing us to answer questions like "did this upscale happen after this
+/// memory.high event?" by comparing the sequence numbers of the two events.
+#[derive(Debug, Clone)]
+pub struct Sequenced<T> {
+    seqnum: u64,
+    data: T,
+}
+
+impl<T> Sequenced<T> {
+    pub fn new(data: T) -> Self {
+        Self {
+            seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel),
+            data,
+        }
+    }
+}
+
+/// Responds to `MonitorEvents` to manage the cgroup: preventing it from being
+/// OOM killed or throttling.
+///
+/// The `CgroupWatcher` primarily achieves this by reading from a stream of
+/// `MonitorEvent`s. See `main_signals_loop` for details on how to keep the
+/// cgroup happy.
+#[derive(Debug)]
+pub struct CgroupWatcher {
+    pub config: Config,
+
+    /// The sequence number of the last upscale.
+    ///
+    /// If we receive a memory.high event that has a _lower_ sequence number than
+    /// `last_upscale_seqnum`, then we know it occured before the upscale, and we
+    /// can safely ignore it.
+    ///
+    /// Note: Like the `events` field, this doesn't _need_ interior mutability but we
+    /// use it anyways so that methods take `&self`, not `&mut self`.
+    last_upscale_seqnum: AtomicU64,
+
+    /// A channel on which we send messages to request upscale from the dispatcher.
+    upscale_requester: mpsc::Sender<()>,
+
+    /// The actual cgroup we are watching and managing.
+    cgroup: cgroups_rs::Cgroup,
+}
+
+/// Read memory.events for the desired event type.
+///
+/// `path` specifies the path to the desired `memory.events` file.
+/// For more info, see the `memory.events` section of the [kernel docs]
+/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
+fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
+    let contents = fs::read_to_string(path)
+        .with_context(|| format!("failed to read memory.events from {path}"))?;
+
+    // Then contents of the file look like:
+    // low 42
+    // high 101
+    // ...
+    contents
+        .lines()
+        .filter_map(|s| s.split_once(' '))
+        .find(|(e, _)| *e == event.as_str())
+        .ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}"))
+        .and_then(|(_, count)| {
+            count
+                .parse::<u64>()
+                .with_context(|| format!("failed to parse memory.{event} as u64"))
+        })
+}
+
+/// Create an event stream that produces events whenever the file at the provided
+/// path is modified.
+fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
+    info!("creating file watcher for {path}");
+    let inotify = Inotify::init().context("failed to initialize file watcher")?;
+    inotify
+        .watches()
+        .add(path, WatchMask::MODIFY)
+        .with_context(|| format!("failed to start watching {path}"))?;
+    inotify
+        // The inotify docs use [0u8; 1024] so we'll just copy them. We only need
+        // to store one event at a time - if the event gets written over, that's
+        // ok. We still see that there is an event. For more information, see:
+        // https://man7.org/linux/man-pages/man7/inotify.7.html
+        .into_event_stream([0u8; 1024])
+        .context("failed to start inotify event stream")
+}
+
+impl CgroupWatcher {
+    /// Create a new `CgroupWatcher`.
+    #[tracing::instrument(skip_all, fields(%name))]
+    pub fn new(
+        name: String,
+        // A channel on which to send upscale requests
+        upscale_requester: mpsc::Sender<()>,
+    ) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
+        // TODO: clarify exactly why we need v2
+        // Make sure cgroups v2 (aka unified) are supported
+        if !is_cgroup2_unified_mode() {
+            anyhow::bail!("cgroups v2 not supported");
+        }
+        let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);
+
+        // Start monitoring the cgroup for memory events. In general, for
+        // cgroups v2 (aka unified), metrics are reported in files like
+        // > `/sys/fs/cgroup/{name}/{metric}`
+        // We are looking for `memory.high` events, which are stored in the
+        // file `memory.events`. For more info, see the `memory.events` section
+        // of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
+        let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
+        let memory_events = create_file_watcher(&path)
+            .with_context(|| format!("failed to create event watcher for {path}"))?
+            // This would be nice with with .inspect_err followed by .ok
+            .filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
+                Ok(high) => Some(high),
+                Err(error) => {
+                    // TODO: Might want to just panic here
+                    warn!(?error, "failed to read high events count from {}", &path);
+                    None
+                }
+            })
+            // Only report the event if the memory.high count increased
+            .filter_map(|high| {
+                if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
+                    Some(high)
+                } else {
+                    None
+                }
+            })
+            .map(Sequenced::new);
+
+        let initial_count = get_event_count(
+            &format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
+            MemoryEvent::High,
+        )?;
+
+        info!(initial_count, "initial memory.high event count");
+
+        // Hard update `MEMORY_EVENT_COUNT` since there could have been processes
+        // running in the cgroup before that caused it to be non-zero.
+        MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
+
+        Ok((
+            Self {
+                cgroup,
+                upscale_requester,
+                last_upscale_seqnum: AtomicU64::new(0),
+                config: Default::default(),
+            },
+            memory_events,
+        ))
+    }
+
+    /// The entrypoint for the `CgroupWatcher`.
+    #[tracing::instrument(skip_all)]
+    pub async fn watch<E>(
+        &self,
+        // These are ~dependency injected~ (fancy, I know) because this function
+        // should never return.
+        // -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
+        // -> therefore: if we want to stick it in an Arc so many threads can access
+        //    it, methods can never take mutable access.
+        //     - note: we use the Arc strategy so that a) we can call this function
+        //             right here and b) the runner can call the set/get_memory methods
+        // -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
+        //    we just pass them in here instead of holding them in fields, as that
+        //    would require this method to take &mut self.
+        mut upscales: mpsc::Receiver<Sequenced<Resources>>,
+        events: E,
+    ) -> anyhow::Result<()>
+    where
+        E: Stream<Item = Sequenced<u64>>,
+    {
+        let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
+        let mut last_memory_high_increase_at: Option<Instant> = None;
+        let mut events = pin!(events);
+
+        // Are we waiting to be upscaled? Could be true if we request upscale due
+        // to a memory.high event and it does not arrive in time.
+        let mut waiting_on_upscale = false;
+
+        loop {
+            tokio::select! {
+                upscale = upscales.recv() => {
+                    let Sequenced { seqnum, data } = upscale
+                        .context("failed to listen on upscale notification channel")?;
+                    waiting_on_upscale = false;
+                    last_memory_high_increase_at = None;
+                    self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+                    info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
+                }
+                event = events.next() => {
+                    let Some(Sequenced { seqnum, .. }) = event else {
+                        bail!("failed to listen for memory.high events")
+                    };
+                    // The memory.high came before our last upscale, so we consider
+                    // it resolved
+                    if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
+                        info!(
+                            "received memory.high event, but it came before our last upscale -> ignoring it"
+                        );
+                        continue;
+                    }
+
+                    // The memory.high came after our latest upscale. We don't
+                    // want to do anything yet, so peek the next event in hopes
+                    // that it's an upscale.
+                    if let Some(upscale_num) = self
+                        .upscaled(&mut upscales)
+                        .context("failed to check if we were upscaled")?
+                    {
+                        if upscale_num > seqnum {
+                            info!(
+                                "received memory.high event, but it came before our last upscale -> ignoring it"
+                            );
+                            continue;
+                        }
+                    }
+
+                    // If it's been long enough since we last froze, freeze the
+                    // cgroup and request upscale
+                    if wait_to_freeze.is_elapsed() {
+                        info!("received memory.high event -> requesting upscale");
+                        waiting_on_upscale = self
+                            .handle_memory_high_event(&mut upscales)
+                            .await
+                            .context("failed to handle upscale")?;
+                        wait_to_freeze
+                            .as_mut()
+                            .reset(Instant::now() + self.config.do_not_freeze_more_often_than);
+                        continue;
+                    }
+
+                    // Ok, we can't freeze, just request upscale
+                    if !waiting_on_upscale {
+                        info!("received memory.high event, but too soon to refreeze -> requesting upscale");
+
+                        // Make check to make sure we haven't been upscaled in the
+                        // meantine (can happen if the agent independently decides
+                        // to upscale us again)
+                        if self
+                            .upscaled(&mut upscales)
+                            .context("failed to check if we were upscaled")?
+                            .is_some()
+                        {
+                            info!("no need to request upscaling because we got upscaled");
+                            continue;
+                        }
+                        self.upscale_requester
+                            .send(())
+                            .await
+                            .context("failed to request upscale")?;
+                        waiting_on_upscale = true;
+                        continue;
+                    }
+
+                    // Shoot, we can't freeze or and we're still waiting on upscale,
+                    // increase memory.high to reduce throttling
+                    let can_increase_memory_high = match last_memory_high_increase_at {
+                        None => true,
+                        Some(t) => t.elapsed() > self.config.memory_high_increase_every,
+                    };
+                    if can_increase_memory_high {
+                        info!(
+                            "received memory.high event, \
+                            but too soon to refreeze and already requested upscale \
+                            -> increasing memory.high"
+                        );
+
+                        // Make check to make sure we haven't been upscaled in the
+                        // meantine (can happen if the agent independently decides
+                        // to upscale us again)
+                        if self
+                            .upscaled(&mut upscales)
+                            .context("failed to check if we were upscaled")?
+                            .is_some()
+                        {
+                            info!("no need to increase memory.high because got upscaled");
+                            continue;
+                        }
+
+                        // Request upscale anyways (the agent will handle deduplicating
+                        // requests)
+                        self.upscale_requester
+                            .send(())
+                            .await
+                            .context("failed to request upscale")?;
+
+                        let memory_high =
+                            self.get_memory_high_bytes().context("failed to get memory.high")?;
+                        let new_high = memory_high + self.config.memory_high_increase_by_bytes;
+                        info!(
+                            current_high_bytes = memory_high,
+                            new_high_bytes = new_high,
+                            "updating memory.high"
+                        );
+                        self.set_memory_high_bytes(new_high)
+                            .context("failed to set memory.high")?;
+                        last_memory_high_increase_at = Some(Instant::now());
+                        continue;
+                    }
+
+                    info!("received memory.high event, but can't do anything");
+                }
+            };
+        }
+    }
+
+    /// Handle a `memory.high`, returning whether we are still waiting on upscale
+    /// by the time the function returns.
+    ///
+    /// The general plan for handling a `memory.high` event is as follows:
+    /// 1. Freeze the cgroup
+    /// 2. Start a timer for `self.config.max_upscale_wait`
+    /// 3. Request upscale
+    /// 4. After the timer elapses or we receive upscale, thaw the cgroup.
+    /// 5. Return whether or not we are still waiting for upscale. If we are,
+    ///    we'll increase the cgroups memory.high to avoid getting oom killed
+    #[tracing::instrument(skip_all)]
+    async fn handle_memory_high_event(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<bool> {
+        // Immediately freeze the cgroup before doing anything else.
+        info!("received memory.high event -> freezing cgroup");
+        self.freeze().context("failed to freeze cgroup")?;
+
+        // We'll use this for logging durations
+        let start_time = Instant::now();
+
+        // Await the upscale until we have to unfreeze
+        let timed =
+            tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
+
+        // Request the upscale
+        info!(
+            wait = ?self.config.max_upscale_wait,
+            "sending request for immediate upscaling",
+        );
+        self.upscale_requester
+            .send(())
+            .await
+            .context("failed to request upscale")?;
+
+        let waiting_on_upscale = match timed.await {
+            Ok(Ok(())) => {
+                info!(elapsed = ?start_time.elapsed(), "received upscale in time");
+                false
+            }
+            // **important**: unfreeze the cgroup before ?-reporting the error
+            Ok(Err(e)) => {
+                info!("error waiting for upscale -> thawing cgroup");
+                self.thaw()
+                    .context("failed to thaw cgroup after errored waiting for upscale")?;
+                Err(e.context("failed to await upscale"))?
+            }
+            Err(_) => {
+                info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
+                true
+            }
+        };
+
+        info!("thawing cgroup");
+        self.thaw().context("failed to thaw cgroup")?;
+
+        Ok(waiting_on_upscale)
+    }
+
+    /// Checks whether we were just upscaled, returning the upscale's sequence
+    /// number if so.
+    #[tracing::instrument(skip_all)]
+    fn upscaled(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<Option<u64>> {
+        let Sequenced { seqnum, data } = match upscales.try_recv() {
+            Ok(upscale) => upscale,
+            Err(TryRecvError::Empty) => return Ok(None),
+            Err(TryRecvError::Disconnected) => {
+                bail!("upscale notification channel was disconnected")
+            }
+        };
+
+        // Make sure to update the last upscale sequence number
+        self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+        info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
+        Ok(Some(seqnum))
+    }
+
+    /// Await an upscale event, discarding any `memory.high` events received in
+    /// the process.
+    ///
+    /// This is used in `handle_memory_high_event`, where we need to listen
+    /// for upscales in particular so we know if we can thaw the cgroup early.
+    #[tracing::instrument(skip_all)]
+    async fn await_upscale(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<()> {
+        let Sequenced { seqnum, .. } = upscales
+            .recv()
+            .await
+            .context("error listening for upscales")?;
+
+        self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+        Ok(())
+    }
+
+    /// Get the cgroup's name.
+    pub fn path(&self) -> &str {
+        self.cgroup.path()
+    }
+}
+
+// Methods for manipulating the actual cgroup
+impl CgroupWatcher {
+    /// Get a handle on the freezer subsystem.
+    fn freezer(&self) -> anyhow::Result<&FreezerController> {
+        if let Some(Freezer(freezer)) = self
+            .cgroup
+            .subsystems()
+            .iter()
+            .find(|sub| matches!(sub, Freezer(_)))
+        {
+            Ok(freezer)
+        } else {
+            anyhow::bail!("could not find freezer subsystem")
+        }
+    }
+
+    /// Attempt to freeze the cgroup.
+    pub fn freeze(&self) -> anyhow::Result<()> {
+        self.freezer()
+            .context("failed to get freezer subsystem")?
+            .freeze()
+            .context("failed to freeze")
+    }
+
+    /// Attempt to thaw the cgroup.
+    pub fn thaw(&self) -> anyhow::Result<()> {
+        self.freezer()
+            .context("failed to get freezer subsystem")?
+            .thaw()
+            .context("failed to thaw")
+    }
+
+    /// Get a handle on the memory subsystem.
+    ///
+    /// Note: this method does not require `self.memory_update_lock` because
+    /// getting a handle to the subsystem does not access any of the files we
+    /// care about, such as memory.high and memory.events
+    fn memory(&self) -> anyhow::Result<&MemController> {
+        if let Some(Mem(memory)) = self
+            .cgroup
+            .subsystems()
+            .iter()
+            .find(|sub| matches!(sub, Mem(_)))
+        {
+            Ok(memory)
+        } else {
+            anyhow::bail!("could not find memory subsystem")
+        }
+    }
+
+    /// Get cgroup current memory usage.
+    pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
+        Ok(self
+            .memory()
+            .context("failed to get memory subsystem")?
+            .memory_stat()
+            .usage_in_bytes)
+    }
+
+    /// Set cgroup memory.high threshold.
+    pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+        self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
+    }
+
+    /// Set the cgroup's memory.high to 'max', disabling it.
+    pub fn unset_memory_high(&self) -> anyhow::Result<()> {
+        self.set_memory_high_internal(MaxValue::Max)
+    }
+
+    fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
+        self.memory()
+            .context("failed to get memory subsystem")?
+            .set_mem(cgroups_rs::memory::SetMemory {
+                low: None,
+                high: Some(value),
+                min: None,
+                max: None,
+            })
+            .map_err(anyhow::Error::from)
+    }
+
+    /// Get memory.high threshold.
+    pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
+        let high = self
+            .memory()
+            .context("failed to get memory subsystem while getting memory statistics")?
+            .get_mem()
+            .map(|mem| mem.high)
+            .context("failed to get memory statistics from subsystem")?;
+        match high {
+            Some(MaxValue::Max) => Ok(i64::MAX as u64),
+            Some(MaxValue::Value(high)) => Ok(high as u64),
+            None => anyhow::bail!("failed to read memory.high from memory subsystem"),
+        }
+    }
+}
--- a/libs/vm_monitor/src/dispatcher.rs
+++ b/libs/vm_monitor/src/dispatcher.rs
@@ -0,0 +1,153 @@
+//! Managing the websocket connection and other signals in the monitor.
+//!
+//! Contains types that manage the interaction (not data interchange, see `protocol`)
+//! between agent and monitor, allowing us to to process and send messages in a
+//! straightforward way. The dispatcher also manages that signals that come from
+//! the cgroup (requesting upscale), and the signals that go to the cgroup
+//! (notifying it of upscale).
+
+use anyhow::{bail, Context};
+use axum::extract::ws::{Message, WebSocket};
+use futures::{
+    stream::{SplitSink, SplitStream},
+    SinkExt, StreamExt,
+};
+use tokio::sync::mpsc;
+use tracing::info;
+
+use crate::cgroup::Sequenced;
+use crate::protocol::{
+    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
+    PROTOCOL_MIN_VERSION,
+};
+
+/// The central handler for all communications in the monitor.
+///
+/// The dispatcher has two purposes:
+/// 1. Manage the connection to the agent, sending and receiving messages.
+/// 2. Communicate with the cgroup manager, notifying it when upscale is received,
+///    and sending a message to the agent when the cgroup manager requests
+///    upscale.
+#[derive(Debug)]
+pub struct Dispatcher {
+    /// We read agent messages of of `source`
+    pub(crate) source: SplitStream<WebSocket>,
+
+    /// We send messages to the agent through `sink`
+    sink: SplitSink<WebSocket, Message>,
+
+    /// Used to notify the cgroup when we are upscaled.
+    pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
+
+    /// When the cgroup requests upscale it will send on this channel. In response
+    /// we send an `UpscaleRequst` to the agent.
+    pub(crate) request_upscale_events: mpsc::Receiver<()>,
+
+    /// The protocol version we have agreed to use with the agent. This is negotiated
+    /// during the creation of the dispatcher, and should be the highest shared protocol
+    /// version.
+    ///
+    // NOTE: currently unused, but will almost certainly be used in the futures
+    // as the protocol changes
+    #[allow(unused)]
+    pub(crate) proto_version: ProtocolVersion,
+}
+
+impl Dispatcher {
+    /// Creates a new dispatcher using the passed-in connection.
+    ///
+    /// Performs a negotiation with the agent to determine the highest protocol
+    /// version that both support. This consists of two steps:
+    /// 1. Wait for the agent to sent the range of protocols it supports.
+    /// 2. Send a protocol version that works for us as well, or an error if there
+    ///    is no compatible version.
+    pub async fn new(
+        stream: WebSocket,
+        notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
+        request_upscale_events: mpsc::Receiver<()>,
+    ) -> anyhow::Result<Self> {
+        let (mut sink, mut source) = stream.split();
+
+        // Figure out the highest protocol version we both support
+        info!("waiting for agent to send protocol version range");
+        let Some(message) = source.next().await else {
+            bail!("websocket connection closed while performing protocol handshake")
+        };
+
+        let message = message.context("failed to read protocol version range off connection")?;
+
+        let Message::Text(message_text) = message else {
+            // All messages should be in text form, since we don't do any
+            // pinging/ponging. See nhooyr/websocket's implementation and the
+            // agent for more info
+            bail!("received non-text message during proocol handshake: {message:?}")
+        };
+
+        let monitor_range = ProtocolRange {
+            min: PROTOCOL_MIN_VERSION,
+            max: PROTOCOL_MAX_VERSION,
+        };
+
+        let agent_range: ProtocolRange = serde_json::from_str(&message_text)
+            .context("failed to deserialize protocol version range")?;
+
+        info!(range = ?agent_range, "received protocol version range");
+
+        let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) {
+            Ok(version) => {
+                sink.send(Message::Text(
+                    serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(),
+                ))
+                .await
+                .context("failed to notify agent of negotiated protocol version")?;
+                version
+            }
+            Err(e) => {
+                sink.send(Message::Text(
+                    serde_json::to_string(&ProtocolResponse::Error(format!(
+                        "Received protocol version range {} which does not overlap with {}",
+                        agent_range, monitor_range
+                    )))
+                    .unwrap(),
+                ))
+                .await
+                .context("failed to notify agent of no overlap between protocol version ranges")?;
+                Err(e).context("error determining suitable protocol version range")?
+            }
+        };
+
+        Ok(Self {
+            sink,
+            source,
+            notify_upscale_events,
+            request_upscale_events,
+            proto_version: highest_shared_version,
+        })
+    }
+
+    /// Notify the cgroup manager that we have received upscale and wait for
+    /// the acknowledgement.
+    #[tracing::instrument(skip_all, fields(?resources))]
+    pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
+        self.notify_upscale_events
+            .send(resources)
+            .await
+            .context("failed to send resources and oneshot sender across channel")
+    }
+
+    /// Send a message to the agent.
+    ///
+    /// Although this function is small, it has one major benefit: it is the only
+    /// way to send data accross the connection, and you can only pass in a proper
+    /// `MonitorMessage`. Without safeguards like this, it's easy to accidentally
+    /// serialize the wrong thing and send it, since `self.sink.send` will take
+    /// any string.
+    pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> {
+        info!(?message, "sending message");
+        let json = serde_json::to_string(&message).context("failed to serialize message")?;
+        self.sink
+            .send(Message::Text(json))
+            .await
+            .context("stream error sending message")
+    }
+}
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -0,0 +1,316 @@
+//! Logic for configuring and scaling the Postgres file cache.
+
+use std::num::NonZeroU64;
+
+use crate::MiB;
+use anyhow::{anyhow, Context};
+use tokio_postgres::{types::ToSql, Client, NoTls, Row};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info};
+
+/// Manages Postgres' file cache by keeping a connection open.
+#[derive(Debug)]
+pub struct FileCacheState {
+    client: Client,
+    conn_str: String,
+    pub(crate) config: FileCacheConfig,
+
+    /// A token for cancelling spawned threads during shutdown.
+    token: CancellationToken,
+}
+
+#[derive(Debug)]
+pub struct FileCacheConfig {
+    /// Whether the file cache is *actually* stored in memory (e.g. by writing to
+    /// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
+    /// memory available for the cgroup.
+    pub(crate) in_memory: bool,
+
+    /// The size of the file cache, in terms of the size of the resource it consumes
+    /// (currently: only memory)
+    ///
+    /// For example, setting `resource_multipler = 0.75` gives the cache a target size of 75% of total
+    /// resources.
+    ///
+    /// This value must be strictly between 0 and 1.
+    resource_multiplier: f64,
+
+    /// The required minimum amount of memory, in bytes, that must remain available
+    /// after subtracting the file cache.
+    ///
+    /// This value must be non-zero.
+    min_remaining_after_cache: NonZeroU64,
+
+    /// Controls the rate of increase in the file cache's size as it grows from zero
+    /// (when total resources equals min_remaining_after_cache) to the desired size based on
+    /// `resource_multiplier`.
+    ///
+    /// A `spread_factor` of zero means that all additional resources will go to the cache until it
+    /// reaches the desired size. Setting `spread_factor` to N roughly means "for every 1 byte added to
+    /// the cache's size, N bytes are reserved for the rest of the system, until the cache gets to
+    /// its desired size".
+    ///
+    /// This value must be >= 0, and must retain an increase that is more than what would be given by
+    /// `resource_multiplier`. For example, setting `resource_multiplier` = 0.75 but `spread_factor` = 1
+    /// would be invalid, because `spread_factor` would induce only 50% usage - never reaching the 75%
+    /// as desired by `resource_multiplier`.
+    ///
+    /// `spread_factor` is too large if `(spread_factor + 1) * resource_multiplier >= 1`.
+    spread_factor: f64,
+}
+
+impl FileCacheConfig {
+    pub fn default_in_memory() -> Self {
+        Self {
+            in_memory: true,
+            // 75 %
+            resource_multiplier: 0.75,
+            // 640 MiB; (512 + 128)
+            min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
+            // ensure any increase in file cache size is split 90-10 with 10% to other memory
+            spread_factor: 0.1,
+        }
+    }
+
+    pub fn default_on_disk() -> Self {
+        Self {
+            in_memory: false,
+            resource_multiplier: 0.75,
+            // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
+            // memory, the kernel will just evict from its page cache, rather than e.g. killing
+            // everything.
+            min_remaining_after_cache: NonZeroU64::new(256 * MiB).unwrap(),
+            spread_factor: 0.1,
+        }
+    }
+
+    /// Make sure fields of the config are consistent.
+    pub fn validate(&self) -> anyhow::Result<()> {
+        // Single field validity
+        anyhow::ensure!(
+            0.0 < self.resource_multiplier && self.resource_multiplier < 1.0,
+            "resource_multiplier must be between 0.0 and 1.0 exclusive, got {}",
+            self.resource_multiplier
+        );
+        anyhow::ensure!(
+            self.spread_factor >= 0.0,
+            "spread_factor must be >= 0, got {}",
+            self.spread_factor
+        );
+
+        // Check that `resource_multiplier` and `spread_factor` are valid w.r.t. each other.
+        //
+        // As shown in `calculate_cache_size`, we have two lines resulting from `resource_multiplier` and
+        // `spread_factor`, respectively. They are:
+        //
+        //                 `total`           `min_remaining_after_cache`
+        //   size = ————————————————————— - —————————————————————————————
+        //           `spread_factor` + 1         `spread_factor` + 1
+        //
+        // and
+        //
+        //   size = `resource_multiplier` × total
+        //
+        // .. where `total` is the total resources. These are isomorphic to the typical 'y = mx + b'
+        // form, with y = "size" and x = "total".
+        //
+        // These lines intersect at:
+        //
+        //               `min_remaining_after_cache`
+        //   ———————————————————————————————————————————————————
+        //    1 - `resource_multiplier` × (`spread_factor` + 1)
+        //
+        // We want to ensure that this value (a) exists, and (b) is >= `min_remaining_after_cache`. This is
+        // guaranteed when '`resource_multiplier` × (`spread_factor` + 1)' is less than 1.
+        // (We also need it to be >= 0, but that's already guaranteed.)
+
+        let intersect_factor = self.resource_multiplier * (self.spread_factor + 1.0);
+        anyhow::ensure!(
+            intersect_factor < 1.0,
+            "incompatible resource_multipler and spread_factor"
+        );
+        Ok(())
+    }
+
+    /// Calculate the desired size of the cache, given the total memory
+    pub fn calculate_cache_size(&self, total: u64) -> u64 {
+        // *Note*: all units are in bytes, until the very last line.
+        let available = total.saturating_sub(self.min_remaining_after_cache.get());
+        if available == 0 {
+            return 0;
+        }
+
+        // Conversions to ensure we don't overflow from floating-point ops
+        let size_from_spread =
+            i64::max(0, (available as f64 / (1.0 + self.spread_factor)) as i64) as u64;
+
+        let size_from_normal = (total as f64 * self.resource_multiplier) as u64;
+
+        let byte_size = u64::min(size_from_spread, size_from_normal);
+
+        // The file cache operates in units of mebibytes, so the sizes we produce should
+        // be rounded to a mebibyte. We round down to be conservative.
+        byte_size / MiB * MiB
+    }
+}
+
+impl FileCacheState {
+    /// Connect to the file cache.
+    #[tracing::instrument(skip_all, fields(%conn_str, ?config))]
+    pub async fn new(
+        conn_str: &str,
+        config: FileCacheConfig,
+        token: CancellationToken,
+    ) -> anyhow::Result<Self> {
+        config.validate().context("file cache config is invalid")?;
+
+        info!(conn_str, "connecting to Postgres file cache");
+        let client = FileCacheState::connect(conn_str, token.clone())
+            .await
+            .context("failed to connect to postgres file cache")?;
+
+        let conn_str = conn_str.to_string();
+        Ok(Self {
+            client,
+            config,
+            conn_str,
+            token,
+        })
+    }
+
+    /// Connect to Postgres.
+    ///
+    /// Aborts the spawned thread if the kill signal is received. This is not
+    /// a method as it is called in [`FileCacheState::new`].
+    #[tracing::instrument(skip_all, fields(%conn_str))]
+    async fn connect(conn_str: &str, token: CancellationToken) -> anyhow::Result<Client> {
+        let (client, conn) = tokio_postgres::connect(conn_str, NoTls)
+            .await
+            .context("failed to connect to pg client")?;
+
+        // The connection object performs the actual communication with the database,
+        // so spawn it off to run on its own. See tokio-postgres docs.
+        crate::spawn_with_cancel(
+            token,
+            |res| {
+                if let Err(error) = res {
+                    error!(%error, "postgres error")
+                }
+            },
+            conn,
+        );
+
+        Ok(client)
+    }
+
+    /// Execute a query with a retry if necessary.
+    ///
+    /// If the initial query fails, we restart the database connection and attempt
+    /// if again.
+    #[tracing::instrument(skip_all, fields(%statement))]
+    pub async fn query_with_retry(
+        &mut self,
+        statement: &str,
+        params: &[&(dyn ToSql + Sync)],
+    ) -> anyhow::Result<Vec<Row>> {
+        match self
+            .client
+            .query(statement, params)
+            .await
+            .context("failed to execute query")
+        {
+            Ok(rows) => Ok(rows),
+            Err(e) => {
+                error!(error = ?e, "postgres error: {e} -> retrying");
+
+                let client = FileCacheState::connect(&self.conn_str, self.token.clone())
+                    .await
+                    .context("failed to connect to postgres file cache")?;
+                info!("successfully reconnected to postgres client");
+
+                // Replace the old client and attempt the query with the new one
+                self.client = client;
+                self.client
+                    .query(statement, params)
+                    .await
+                    .context("failed to execute query a second time")
+            }
+        }
+    }
+
+    /// Get the current size of the file cache.
+    #[tracing::instrument(skip_all)]
+    pub async fn get_file_cache_size(&mut self) -> anyhow::Result<u64> {
+        self.query_with_retry(
+            // The file cache GUC variable is in MiB, but the conversion with
+            // pg_size_bytes means that the end result we get is in bytes.
+            "SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit'));",
+            &[],
+        )
+        .await
+        .context("failed to query pg for file cache size")?
+        .first()
+        .ok_or_else(|| anyhow!("file cache size query returned no rows"))?
+        // pg_size_bytes returns a bigint which is the same as an i64.
+        .try_get::<_, i64>(0)
+        // Since the size of the table is not negative, the cast is sound.
+        .map(|bytes| bytes as u64)
+        .context("failed to extract file cache size from query result")
+    }
+
+    /// Attempt to set the file cache size, returning the size it was actually
+    /// set to.
+    #[tracing::instrument(skip_all, fields(%num_bytes))]
+    pub async fn set_file_cache_size(&mut self, num_bytes: u64) -> anyhow::Result<u64> {
+        let max_bytes = self
+            // The file cache GUC variable is in MiB, but the conversion with pg_size_bytes
+            // means that the end result we get is in bytes.
+            .query_with_retry(
+                "SELECT pg_size_bytes(current_setting('neon.max_file_cache_size'));",
+                &[],
+            )
+            .await
+            .context("failed to query pg for max file cache size")?
+            .first()
+            .ok_or_else(|| anyhow!("max file cache size query returned no rows"))?
+            .try_get::<_, i64>(0)
+            .map(|bytes| bytes as u64)
+            .context("failed to extract max file cache size from query result")?;
+
+        let max_mb = max_bytes / MiB;
+        let num_mb = u64::min(num_bytes, max_bytes) / MiB;
+
+        let capped = if num_bytes > max_bytes {
+            " (capped by maximum size)"
+        } else {
+            ""
+        };
+
+        info!(
+            size = num_mb,
+            max = max_mb,
+            "updating file cache size {capped}",
+        );
+
+        // note: even though the normal ways to get the cache size produce values with trailing "MB"
+        // (hence why we call pg_size_bytes in `get_file_cache_size`'s query), the format
+        // it expects to set the value is "integer number of MB" without trailing units.
+        // For some reason, this *really* wasn't working with normal arguments, so that's
+        // why we're constructing the query here.
+        self.client
+            .query(
+                &format!("ALTER SYSTEM SET neon.file_cache_size_limit = {};", num_mb),
+                &[],
+            )
+            .await
+            .context("failed to change file cache size limit")?;
+
+        // must use pg_reload_conf to have the settings change take effect
+        self.client
+            .execute("SELECT pg_reload_conf();", &[])
+            .await
+            .context("failed to reload config")?;
+
+        Ok(num_mb * MiB)
+    }
+}
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -0,0 +1,218 @@
+#![cfg(target_os = "linux")]
+
+use anyhow::Context;
+use axum::{
+    extract::{ws::WebSocket, State, WebSocketUpgrade},
+    response::Response,
+};
+use axum::{routing::get, Router, Server};
+use clap::Parser;
+use futures::Future;
+use std::{fmt::Debug, time::Duration};
+use sysinfo::{RefreshKind, System, SystemExt};
+use tokio::{sync::broadcast, task::JoinHandle};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info};
+
+use runner::Runner;
+
+// Code that interfaces with agent
+pub mod dispatcher;
+pub mod protocol;
+
+pub mod cgroup;
+pub mod filecache;
+pub mod runner;
+
+/// The vm-monitor is an autoscaling component started by compute_ctl.
+///
+/// It carries out autoscaling decisions (upscaling/downscaling) and responds to
+/// memory pressure by making requests to the autoscaler-agent.
+#[derive(Debug, Parser)]
+pub struct Args {
+    /// The name of the cgroup we should monitor for memory.high events. This
+    /// is the cgroup that postgres should be running in.
+    #[arg(short, long)]
+    pub cgroup: Option<String>,
+
+    /// The connection string for the Postgres file cache we should manage.
+    #[arg(short, long)]
+    pub pgconnstr: Option<String>,
+
+    /// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
+    /// kernel's page cache), and therefore should not count against available memory.
+    //
+    // NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
+    // than a roundabout way, via whether it's on disk), but in order to be backwards compatible
+    // during the switch away from an in-memory file cache, we had to default to the previous
+    // behavior.
+    #[arg(long)]
+    pub file_cache_on_disk: bool,
+
+    /// The address we should listen on for connection requests. For the
+    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
+    #[arg(short, long)]
+    pub addr: String,
+}
+
+impl Args {
+    pub fn addr(&self) -> &str {
+        &self.addr
+    }
+}
+
+/// The number of bytes in one mebibyte.
+#[allow(non_upper_case_globals)]
+const MiB: u64 = 1 << 20;
+
+/// Convert a quantity in bytes to a quantity in mebibytes, generally for display
+/// purposes. (Most calculations in this crate use bytes directly)
+pub fn bytes_to_mebibytes(bytes: u64) -> f32 {
+    (bytes as f32) / (MiB as f32)
+}
+
+pub fn get_total_system_memory() -> u64 {
+    System::new_with_specifics(RefreshKind::new().with_memory()).total_memory()
+}
+
+/// Global app state for the Axum server
+#[derive(Debug, Clone)]
+pub struct ServerState {
+    /// Used to close old connections.
+    ///
+    /// When a new connection is made, we send a message signalling to the old
+    /// connection to close.
+    pub sender: broadcast::Sender<()>,
+
+    /// Used to cancel all spawned threads in the monitor.
+    pub token: CancellationToken,
+
+    // The CLI args
+    pub args: &'static Args,
+}
+
+/// Spawn a thread that may get cancelled by the provided [`CancellationToken`].
+///
+/// This is mainly meant to be called with futures that will be pending for a very
+/// long time, or are not mean to return. If it is not desirable for the future to
+/// ever resolve, such as in the case of [`cgroup::CgroupWatcher::watch`], the error can
+/// be logged with `f`.
+pub fn spawn_with_cancel<T, F>(
+    token: CancellationToken,
+    f: F,
+    future: T,
+) -> JoinHandle<Option<T::Output>>
+where
+    T: Future + Send + 'static,
+    T::Output: Send + 'static,
+    F: FnOnce(&T::Output) + Send + 'static,
+{
+    tokio::spawn(async move {
+        tokio::select! {
+            _ = token.cancelled() => {
+                info!("received global kill signal");
+                None
+            }
+            res = future => {
+                f(&res);
+                Some(res)
+            }
+        }
+    })
+}
+
+/// The entrypoint to the binary.
+///
+/// Set up tracing, parse arguments, and start an http server.
+pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Result<()> {
+    // This channel is used to close old connections. When a new connection is
+    // made, we send a message signalling to the old connection to close.
+    let (sender, _) = tokio::sync::broadcast::channel::<()>(1);
+
+    let app = Router::new()
+        // This route gets upgraded to a websocket connection. We only support
+        // one connection at a time, which we enforce by killing old connections
+        // when we receive a new one.
+        .route("/monitor", get(ws_handler))
+        .with_state(ServerState {
+            sender,
+            token,
+            args,
+        });
+
+    let addr = args.addr();
+    let bound = Server::try_bind(&addr.parse().expect("parsing address should not fail"))
+        .with_context(|| format!("failed to bind to {addr}"))?;
+
+    info!(addr, "server bound");
+
+    bound
+        .serve(app.into_make_service())
+        .await
+        .context("server exited")?;
+
+    Ok(())
+}
+
+/// Handles incoming websocket connections.
+///
+/// If we are already to connected to an agent, we kill that old connection
+/// and accept the new one.
+#[tracing::instrument(name = "/monitor", skip_all, fields(?args))]
+pub async fn ws_handler(
+    ws: WebSocketUpgrade,
+    State(ServerState {
+        sender,
+        token,
+        args,
+    }): State<ServerState>,
+) -> Response {
+    // Kill the old monitor
+    info!("closing old connection if there is one");
+    let _ = sender.send(());
+
+    // Start the new one. Wow, the cycle of death and rebirth
+    let closer = sender.subscribe();
+    ws.on_upgrade(|ws| start_monitor(ws, args, closer, token))
+}
+
+/// Starts the monitor. If startup fails or the monitor exits, an error will
+/// be logged and our internal state will be reset to allow for new connections.
+#[tracing::instrument(skip_all)]
+async fn start_monitor(
+    ws: WebSocket,
+    args: &Args,
+    kill: broadcast::Receiver<()>,
+    token: CancellationToken,
+) {
+    info!(
+        ?args,
+        "accepted new websocket connection -> starting monitor"
+    );
+    let timeout = Duration::from_secs(4);
+    let monitor = tokio::time::timeout(
+        timeout,
+        Runner::new(Default::default(), args, ws, kill, token),
+    )
+    .await;
+    let mut monitor = match monitor {
+        Ok(Ok(monitor)) => monitor,
+        Ok(Err(error)) => {
+            error!(?error, "failed to create monitor");
+            return;
+        }
+        Err(_) => {
+            error!(
+                ?timeout,
+                "creating monitor timed out (probably waiting to receive protocol range)"
+            );
+            return;
+        }
+    };
+    info!("connected to agent");
+
+    match monitor.run().await {
+        Ok(()) => info!("monitor was killed due to new connection"),
+        Err(e) => error!(error = ?e, "monitor terminated unexpectedly"),
+    }
+}
--- a/libs/vm_monitor/src/protocol.rs
+++ b/libs/vm_monitor/src/protocol.rs
@@ -0,0 +1,241 @@
+//! Types representing protocols and actual agent-monitor messages.
+//!
+//! The pervasive use of serde modifiers throughout this module is to ease
+//! serialization on the go side. Because go does not have enums (which model
+//! messages well), it is harder to model messages, and we accomodate that with
+//! serde.
+//!
+//! *Note*: the agent sends and receives messages in different ways.
+//!
+//! The agent serializes messages in the form and then sends them. The use
+//! of `#[serde(tag = "type", content = "content")]` allows us to use `Type`
+//! to determine how to deserialize `Content`.
+//! ```ignore
+//! struct {
+//!     Content any
+//!     Type    string
+//!     Id      uint64
+//! }
+//! ```
+//! and receives messages in the form:
+//! ```ignore
+//! struct {
+//!     {fields embedded}
+//!     Type string
+//!     Id   uint64
+//! }
+//! ```
+//! After reading the type field, the agent will decode the entire message
+//! again, this time into the correct type using the embedded fields.
+//! Because the agent cannot just extract the json contained in a certain field
+//! (it initially deserializes to `map[string]interface{}`), we keep the fields
+//! at the top level, so the entire piece of json can be deserialized into a struct,
+//! such as a `DownscaleResult`, with the `Type` and `Id` fields ignored.
+
+use core::fmt;
+use std::cmp;
+
+use serde::{de::Error, Deserialize, Serialize};
+
+/// A Message we send to the agent.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct OutboundMsg {
+    #[serde(flatten)]
+    pub(crate) inner: OutboundMsgKind,
+    pub(crate) id: usize,
+}
+
+impl OutboundMsg {
+    pub fn new(inner: OutboundMsgKind, id: usize) -> Self {
+        Self { inner, id }
+    }
+}
+
+/// The different underlying message types we can send to the agent.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(tag = "type")]
+pub enum OutboundMsgKind {
+    /// Indicates that the agent sent an invalid message, i.e, we couldn't
+    /// properly deserialize it.
+    InvalidMessage { error: String },
+    /// Indicates that we experienced an internal error while processing a message.
+    /// For example, if a cgroup operation fails while trying to handle an upscale,
+    /// we return `InternalError`.
+    InternalError { error: String },
+    /// Returned to the agent once we have finished handling an upscale. If the
+    /// handling was unsuccessful, an `InternalError` will get returned instead.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    UpscaleConfirmation {},
+    /// Indicates to the monitor that we are urgently requesting resources.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    UpscaleRequest {},
+    /// Returned to the agent once we have finished attempting to downscale. If
+    /// an error occured trying to do so, an `InternalError` will get returned instead.
+    /// However, if we are simply unsuccessful (for example, do to needing the resources),
+    /// that gets included in the `DownscaleResult`.
+    DownscaleResult {
+        // FIXME for the future (once the informant is deprecated)
+        // As of the time of writing, the agent/informant version of this struct is
+        // called api.DownscaleResult. This struct has uppercase fields which are
+        // serialized as such. Thus, we serialize using uppercase names so we don't
+        // have to make a breaking change to the agent<->informant protocol. Once
+        // the informant has been superseded by the monitor, we can add the correct
+        // struct tags to api.DownscaleResult without causing a breaking change,
+        // since we don't need to support the agent<->informant protocol anymore.
+        #[serde(rename = "Ok")]
+        ok: bool,
+        #[serde(rename = "Status")]
+        status: String,
+    },
+    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
+    /// agent.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    HealthCheck {},
+}
+
+/// A message received form the agent.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct InboundMsg {
+    #[serde(flatten)]
+    pub(crate) inner: InboundMsgKind,
+    pub(crate) id: usize,
+}
+
+/// The different underlying message types we can receive from the agent.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(tag = "type", content = "content")]
+pub enum InboundMsgKind {
+    /// Indicates that the we sent an invalid message, i.e, we couldn't
+    /// properly deserialize it.
+    InvalidMessage { error: String },
+    /// Indicates that the informan experienced an internal error while processing
+    /// a message. For example, if it failed to request upsacle from the agent, it
+    /// would return an `InternalError`.
+    InternalError { error: String },
+    /// Indicates to us that we have been granted more resources. We should respond
+    /// with an `UpscaleConfirmation` when done handling the resources (increasins
+    /// file cache size, cgorup memory limits).
+    UpscaleNotification { granted: Resources },
+    /// A request to reduce resource usage. We should response with a `DownscaleResult`,
+    /// when done.
+    DownscaleRequest { target: Resources },
+    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
+    /// agent.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    HealthCheck {},
+}
+
+/// Represents the resources granted to a VM.
+#[derive(Serialize, Deserialize, Debug, Clone, Copy)]
+// Renamed because the agent has multiple resources types:
+// `Resources` (milliCPU/memory slots)
+// `Allocation` (vCPU/bytes) <- what we correspond to
+#[serde(rename(serialize = "Allocation", deserialize = "Allocation"))]
+pub struct Resources {
+    /// Number of vCPUs
+    pub(crate) cpu: f64,
+    /// Bytes of memory
+    pub(crate) mem: u64,
+}
+
+impl Resources {
+    pub fn new(cpu: f64, mem: u64) -> Self {
+        Self { cpu, mem }
+    }
+}
+
+pub const PROTOCOL_MIN_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
+pub const PROTOCOL_MAX_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
+
+#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Ord, Eq, Serialize, Deserialize)]
+pub struct ProtocolVersion(u8);
+
+impl ProtocolVersion {
+    /// Represents v1.0 of the agent<-> monitor protocol - the initial version
+    ///
+    /// Currently the latest version.
+    const V1_0: ProtocolVersion = ProtocolVersion(1);
+}
+
+impl fmt::Display for ProtocolVersion {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match *self {
+            ProtocolVersion(0) => f.write_str("<invalid: zero>"),
+            ProtocolVersion::V1_0 => f.write_str("v1.0"),
+            other => write!(f, "<unknown: {other}>"),
+        }
+    }
+}
+
+/// A set of protocol bounds that determines what we are speaking.
+///
+/// These bounds are inclusive.
+#[derive(Debug)]
+pub struct ProtocolRange {
+    pub min: ProtocolVersion,
+    pub max: ProtocolVersion,
+}
+
+// Use a custom deserialize impl to ensure that `self.min <= self.max`
+impl<'de> Deserialize<'de> for ProtocolRange {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        #[derive(Deserialize)]
+        struct InnerProtocolRange {
+            min: ProtocolVersion,
+            max: ProtocolVersion,
+        }
+        let InnerProtocolRange { min, max } = InnerProtocolRange::deserialize(deserializer)?;
+        if min > max {
+            Err(D::Error::custom(format!(
+                "min version = {min} is greater than max version = {max}",
+            )))
+        } else {
+            Ok(ProtocolRange { min, max })
+        }
+    }
+}
+
+impl fmt::Display for ProtocolRange {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if self.min == self.max {
+            f.write_fmt(format_args!("{}", self.max))
+        } else {
+            f.write_fmt(format_args!("{} to {}", self.min, self.max))
+        }
+    }
+}
+
+impl ProtocolRange {
+    /// Find the highest shared version between two `ProtocolRange`'s
+    pub fn highest_shared_version(&self, other: &Self) -> anyhow::Result<ProtocolVersion> {
+        // We first have to make sure the ranges are overlapping. Once we know
+        // this, we can merge the ranges by taking the max of the mins and the
+        // mins of the maxes.
+        if self.min > other.max {
+            anyhow::bail!(
+                "Non-overlapping bounds: other.max = {} was less than self.min = {}",
+                other.max,
+                self.min,
+            )
+        } else if self.max < other.min {
+            anyhow::bail!(
+                "Non-overlappinng bounds: self.max = {} was less than other.min = {}",
+                self.max,
+                other.min
+            )
+        } else {
+            Ok(cmp::min(self.max, other.max))
+        }
+    }
+}
+
+/// We send this to the monitor after negotiating which protocol to use
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub enum ProtocolResponse {
+    Error(String),
+    Version(ProtocolVersion),
+}
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -0,0 +1,498 @@
+//! Exposes the `Runner`, which handles messages received from agent and
+//! sends upscale requests.
+//!
+//! This is the "Monitor" part of the monitor binary and is the main entrypoint for
+//! all functionality.
+
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+use std::{fmt::Debug, mem};
+
+use anyhow::{bail, Context};
+use axum::extract::ws::{Message, WebSocket};
+use futures::StreamExt;
+use tokio::sync::broadcast;
+use tokio::sync::mpsc;
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info, warn};
+
+use crate::cgroup::{CgroupWatcher, Sequenced};
+use crate::dispatcher::Dispatcher;
+use crate::filecache::{FileCacheConfig, FileCacheState};
+use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
+use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB};
+
+/// Central struct that interacts with agent, dispatcher, and cgroup to handle
+/// signals from the agent.
+#[derive(Debug)]
+pub struct Runner {
+    config: Config,
+    filecache: Option<FileCacheState>,
+    cgroup: Option<Arc<CgroupWatcher>>,
+    dispatcher: Dispatcher,
+
+    /// We "mint" new message ids by incrementing this counter and taking the value.
+    ///
+    /// **Note**: This counter is always odd, so that we avoid collisions between the IDs generated
+    /// by us vs the autoscaler-agent.
+    counter: usize,
+
+    last_upscale_request_at: Option<Instant>,
+
+    /// A signal to kill the main thread produced by `self.run()`. This is triggered
+    /// when the server receives a new connection. When the thread receives the
+    /// signal off this channel, it will gracefully shutdown.
+    kill: broadcast::Receiver<()>,
+}
+
+/// Configuration for a `Runner`
+#[derive(Debug)]
+pub struct Config {
+    /// `sys_buffer_bytes` gives the estimated amount of memory, in bytes, that the kernel uses before
+    /// handing out the rest to userspace. This value is the estimated difference between the
+    /// *actual* physical memory and the amount reported by `grep MemTotal /proc/meminfo`.
+    ///
+    /// For more information, refer to `man 5 proc`, which defines MemTotal as "Total usable RAM
+    /// (i.e., physical RAM minus a few reserved bits and the kernel binary code)".
+    ///
+    /// We only use `sys_buffer_bytes` when calculating the system memory from the *external* memory
+    /// size, rather than the self-reported memory size, according to the kernel.
+    ///
+    /// TODO: this field is only necessary while we still have to trust the autoscaler-agent's
+    /// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
+    /// should be removed once we have a better solution there.
+    sys_buffer_bytes: u64,
+}
+
+impl Default for Config {
+    fn default() -> Self {
+        Self {
+            sys_buffer_bytes: 100 * MiB,
+        }
+    }
+}
+
+impl Runner {
+    /// Create a new monitor.
+    #[tracing::instrument(skip_all, fields(?config, ?args))]
+    pub async fn new(
+        config: Config,
+        args: &Args,
+        ws: WebSocket,
+        kill: broadcast::Receiver<()>,
+        token: CancellationToken,
+    ) -> anyhow::Result<Runner> {
+        anyhow::ensure!(
+            config.sys_buffer_bytes != 0,
+            "invalid monitor Config: sys_buffer_bytes cannot be 0"
+        );
+
+        // *NOTE*: the dispatcher and cgroup manager talk through these channels
+        // so make sure they each get the correct half, nothing is droppped, etc.
+        let (notified_send, notified_recv) = mpsc::channel(1);
+        let (requesting_send, requesting_recv) = mpsc::channel(1);
+
+        let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
+            .await
+            .context("error creating new dispatcher")?;
+
+        let mut state = Runner {
+            config,
+            filecache: None,
+            cgroup: None,
+            dispatcher,
+            counter: 1, // NB: must be odd, see the comment about the field for more.
+            last_upscale_request_at: None,
+            kill,
+        };
+
+        // If we have both the cgroup and file cache integrations enabled, it's possible for
+        // temporary failures to result in cgroup throttling (from memory.high), that in turn makes
+        // it near-impossible to connect to the file cache (because it times out). Unfortunately,
+        // we *do* still want to determine the file cache size before setting the cgroup's
+        // memory.high, so it's not as simple as just swapping the order.
+        //
+        // Instead, the resolution here is that on vm-monitor startup (note: happens on each
+        // connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
+        // temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
+        // of a hacky solution, but helps with reliability.
+        if let Some(name) = &args.cgroup {
+            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
+            // now, and then set limits later.
+            info!("initializing cgroup");
+
+            let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
+                .context("failed to create cgroup manager")?;
+
+            info!("temporarily unsetting memory.high");
+
+            // Temporarily un-set cgroup memory.high; see above.
+            cgroup
+                .unset_memory_high()
+                .context("failed to unset memory.high")?;
+
+            let cgroup = Arc::new(cgroup);
+
+            let cgroup_clone = Arc::clone(&cgroup);
+            spawn_with_cancel(
+                token.clone(),
+                |_| error!("cgroup watcher terminated"),
+                async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
+            );
+
+            state.cgroup = Some(cgroup);
+        } else {
+            // *NOTE*: We need to forget the sender so that its drop impl does not get ran.
+            // This allows us to poll it in `Monitor::run` regardless of whether we
+            // are managing a cgroup or not. If we don't forget it, all receives will
+            // immediately return an error because the sender is droped and it will
+            // claim all select! statements, effectively turning `Monitor::run` into
+            // `loop { fail to receive }`.
+            mem::forget(requesting_send);
+        }
+
+        let mut file_cache_reserved_bytes = 0;
+        let mem = get_total_system_memory();
+
+        // We need to process file cache initialization before cgroup initialization, so that the memory
+        // allocated to the file cache is appropriately taken into account when we decide the cgroup's
+        // memory limits.
+        if let Some(connstr) = &args.pgconnstr {
+            info!("initializing file cache");
+            let config = match args.file_cache_on_disk {
+                true => FileCacheConfig::default_on_disk(),
+                false => FileCacheConfig::default_in_memory(),
+            };
+
+            let mut file_cache = FileCacheState::new(connstr, config, token)
+                .await
+                .context("failed to create file cache")?;
+
+            let size = file_cache
+                .get_file_cache_size()
+                .await
+                .context("error getting file cache size")?;
+
+            let new_size = file_cache.config.calculate_cache_size(mem);
+            info!(
+                initial = bytes_to_mebibytes(size),
+                new = bytes_to_mebibytes(new_size),
+                "setting initial file cache size",
+            );
+
+            // note: even if size == new_size, we want to explicitly set it, just
+            // to make sure that we have the permissions to do so
+            let actual_size = file_cache
+                .set_file_cache_size(new_size)
+                .await
+                .context("failed to set file cache size, possibly due to inadequate permissions")?;
+            if actual_size != new_size {
+                info!("file cache size actually got set to {actual_size}")
+            }
+            // Mark the resources given to the file cache as reserved, but only if it's in memory.
+            if !args.file_cache_on_disk {
+                file_cache_reserved_bytes = actual_size;
+            }
+
+            state.filecache = Some(file_cache);
+        }
+
+        if let Some(cgroup) = &state.cgroup {
+            let available = mem - file_cache_reserved_bytes;
+            let value = cgroup.config.calculate_memory_high_value(available);
+
+            info!(value, "setting memory.high");
+
+            cgroup
+                .set_memory_high_bytes(value)
+                .context("failed to set cgroup memory.high")?;
+        }
+
+        Ok(state)
+    }
+
+    /// Attempt to downscale filecache + cgroup
+    #[tracing::instrument(skip_all, fields(?target))]
+    pub async fn try_downscale(&mut self, target: Resources) -> anyhow::Result<(bool, String)> {
+        // Nothing to adjust
+        if self.cgroup.is_none() && self.filecache.is_none() {
+            info!("no action needed for downscale (no cgroup or file cache enabled)");
+            return Ok((
+                true,
+                "monitor is not managing cgroup or file cache".to_string(),
+            ));
+        }
+
+        let requested_mem = target.mem;
+        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
+        let expected_file_cache_mem_usage = self
+            .filecache
+            .as_ref()
+            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
+            .unwrap_or(0);
+        let mut new_cgroup_mem_high = 0;
+        if let Some(cgroup) = &self.cgroup {
+            new_cgroup_mem_high = cgroup
+                .config
+                .calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);
+
+            let current = cgroup
+                .current_memory_usage()
+                .context("failed to fetch cgroup memory")?;
+
+            if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
+                let status = format!(
+                    "{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
+                    "calculated memory.high too low",
+                    bytes_to_mebibytes(new_cgroup_mem_high),
+                    bytes_to_mebibytes(current),
+                    bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
+                );
+
+                info!(status, "discontinuing downscale");
+
+                return Ok((false, status));
+            }
+        }
+
+        // The downscaling has been approved. Downscale the file cache, then the cgroup.
+        let mut status = vec![];
+        let mut file_cache_mem_usage = 0;
+        if let Some(file_cache) = &mut self.filecache {
+            let actual_usage = file_cache
+                .set_file_cache_size(expected_file_cache_mem_usage)
+                .await
+                .context("failed to set file cache size")?;
+            if file_cache.config.in_memory {
+                file_cache_mem_usage = actual_usage;
+            }
+            let message = format!(
+                "set file cache size to {} MiB (in memory = {})",
+                bytes_to_mebibytes(actual_usage),
+                file_cache.config.in_memory,
+            );
+            info!("downscale: {message}");
+            status.push(message);
+        }
+
+        if let Some(cgroup) = &self.cgroup {
+            let available_memory = usable_system_memory - file_cache_mem_usage;
+
+            if file_cache_mem_usage != expected_file_cache_mem_usage {
+                new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
+            }
+
+            // new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here
+            // since it is properly initialized in the previous cgroup if let block
+            cgroup
+                .set_memory_high_bytes(new_cgroup_mem_high)
+                .context("failed to set cgroup memory.high")?;
+
+            let message = format!(
+                "set cgroup memory.high to {} MiB, of new max {} MiB",
+                bytes_to_mebibytes(new_cgroup_mem_high),
+                bytes_to_mebibytes(available_memory)
+            );
+            info!("downscale: {message}");
+            status.push(message);
+        }
+
+        // TODO: make this status thing less jank
+        let status = status.join("; ");
+        Ok((true, status))
+    }
+
+    /// Handle new resources
+    #[tracing::instrument(skip_all, fields(?resources))]
+    pub async fn handle_upscale(&mut self, resources: Resources) -> anyhow::Result<()> {
+        if self.filecache.is_none() && self.cgroup.is_none() {
+            info!("no action needed for upscale (no cgroup or file cache enabled)");
+            return Ok(());
+        }
+
+        let new_mem = resources.mem;
+        let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);
+
+        // Get the file cache's expected contribution to the memory usage
+        let mut file_cache_mem_usage = 0;
+        if let Some(file_cache) = &mut self.filecache {
+            let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
+            info!(
+                target = bytes_to_mebibytes(expected_usage),
+                total = bytes_to_mebibytes(new_mem),
+                "updating file cache size",
+            );
+
+            let actual_usage = file_cache
+                .set_file_cache_size(expected_usage)
+                .await
+                .context("failed to set file cache size")?;
+            if file_cache.config.in_memory {
+                file_cache_mem_usage = actual_usage;
+            }
+
+            if actual_usage != expected_usage {
+                warn!(
+                    "file cache was set to a different size that we wanted: target = {} Mib, actual= {} Mib",
+                    bytes_to_mebibytes(expected_usage),
+                    bytes_to_mebibytes(actual_usage)
+                )
+            }
+        }
+
+        if let Some(cgroup) = &self.cgroup {
+            let available_memory = usable_system_memory - file_cache_mem_usage;
+            let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
+            info!(
+                target = bytes_to_mebibytes(new_cgroup_mem_high),
+                total = bytes_to_mebibytes(new_mem),
+                name = cgroup.path(),
+                "updating cgroup memory.high",
+            );
+            cgroup
+                .set_memory_high_bytes(new_cgroup_mem_high)
+                .context("failed to set cgroup memory.high")?;
+        }
+
+        Ok(())
+    }
+
+    /// Take in a message and perform some action, such as downscaling or upscaling,
+    /// and return a message to be send back.
+    #[tracing::instrument(skip_all, fields(%id, message = ?inner))]
+    pub async fn process_message(
+        &mut self,
+        InboundMsg { inner, id }: InboundMsg,
+    ) -> anyhow::Result<Option<OutboundMsg>> {
+        match inner {
+            InboundMsgKind::UpscaleNotification { granted } => {
+                self.handle_upscale(granted)
+                    .await
+                    .context("failed to handle upscale")?;
+                self.dispatcher
+                    .notify_upscale(Sequenced::new(granted))
+                    .await
+                    .context("failed to notify notify cgroup of upscale")?;
+                Ok(Some(OutboundMsg::new(
+                    OutboundMsgKind::UpscaleConfirmation {},
+                    id,
+                )))
+            }
+            InboundMsgKind::DownscaleRequest { target } => self
+                .try_downscale(target)
+                .await
+                .context("failed to downscale")
+                .map(|(ok, status)| {
+                    Some(OutboundMsg::new(
+                        OutboundMsgKind::DownscaleResult { ok, status },
+                        id,
+                    ))
+                }),
+            InboundMsgKind::InvalidMessage { error } => {
+                warn!(
+                    %error, id, "received notification of an invalid message we sent"
+                );
+                Ok(None)
+            }
+            InboundMsgKind::InternalError { error } => {
+                warn!(error, id, "agent experienced an internal error");
+                Ok(None)
+            }
+            InboundMsgKind::HealthCheck {} => {
+                Ok(Some(OutboundMsg::new(OutboundMsgKind::HealthCheck {}, id)))
+            }
+        }
+    }
+
+    // TODO: don't propagate errors, probably just warn!?
+    #[tracing::instrument(skip_all)]
+    pub async fn run(&mut self) -> anyhow::Result<()> {
+        info!("starting dispatcher");
+        loop {
+            tokio::select! {
+                signal = self.kill.recv() => {
+                    match signal {
+                        Ok(()) => return Ok(()),
+                        Err(e) => bail!("failed to receive kill signal: {e}")
+                    }
+                }
+                // we need to propagate an upscale request
+                request = self.dispatcher.request_upscale_events.recv() => {
+                    if request.is_none() {
+                        bail!("failed to listen for upscale event from cgroup")
+                    }
+
+                    // If it's been less than 1 second since the last time we requested upscaling,
+                    // ignore the event, to avoid spamming the agent (otherwise, this can happen
+                    // ~1k times per second).
+                    if let Some(t) = self.last_upscale_request_at {
+                        let elapsed = t.elapsed();
+                        if elapsed < Duration::from_secs(1) {
+                            info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring");
+                            continue;
+                        }
+                    }
+
+                    self.last_upscale_request_at = Some(Instant::now());
+
+                    info!("cgroup asking for upscale; forwarding request");
+                    self.counter += 2; // Increment, preserving parity (i.e. keep the
+                                       // counter odd). See the field comment for more.
+                    self.dispatcher
+                        .send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
+                        .await
+                        .context("failed to send message")?;
+                }
+                // there is a message from the agent
+                msg = self.dispatcher.source.next() => {
+                    if let Some(msg) = msg {
+                        // Don't use 'message' as a key as the string also uses
+                        // that for its key
+                        info!(?msg, "received message");
+                        match msg {
+                            Ok(msg) => {
+                                let message: InboundMsg = match msg {
+                                    Message::Text(text) => {
+                                        serde_json::from_str(&text).context("failed to deserialize text message")?
+                                    }
+                                    other => {
+                                        warn!(
+                                            // Don't use 'message' as a key as the
+                                            // string also uses that for its key
+                                            msg = ?other,
+                                            "agent should only send text messages but received different type"
+                                        );
+                                        continue
+                                    },
+                                };
+
+                                let out = match self.process_message(message.clone()).await {
+                                    Ok(Some(out)) => out,
+                                    Ok(None) => continue,
+                                    Err(e) => {
+                                        let error = e.to_string();
+                                        warn!(?error, "error handling message");
+                                        OutboundMsg::new(
+                                            OutboundMsgKind::InternalError {
+                                                error
+                                            },
+                                            message.id
+                                        )
+                                    }
+                                };
+
+                                self.dispatcher
+                                    .send(out)
+                                    .await
+                                    .context("failed to send message")?;
+                            }
+                            Err(e) => warn!("{e}"),
+                        }
+                    } else {
+                        anyhow::bail!("dispatcher connection closed")
+                    }
+                }
+            }
+        }
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -51,6 +51,7 @@ serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_with.workspace = true
 signal-hook.workspace = true
+smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
 sync_wrapper.workspace = true
 tokio-tar.workspace = true
@@ -79,11 +80,11 @@ enum-map.workspace = true
 enumset.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
+tempfile.workspace = true

 [dev-dependencies]
 criterion.workspace = true
 hex-literal.workspace = true
-tempfile.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

 [[bench]]
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -3,6 +3,9 @@
 //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.

 use anyhow::Result;
+use pageserver::context::{DownloadBehavior, RequestContext};
+use pageserver::task_mgr::TaskKind;
+use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::ops::Range;
@@ -10,7 +13,7 @@ use std::{fs, path::Path, str};

 use pageserver::page_cache::PAGE_SZ;
 use pageserver::repository::{Key, KEY_SIZE};
-use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
+use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
 use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
 use pageserver::tenant::storage_layer::range_overlaps;
@@ -95,9 +98,9 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 }

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
-async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
-    let file = FileBlockReader::new(VirtualFile::open(path)?);
-    let summary_blk = file.read_blk(0)?;
+async fn get_holes(path: &Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
+    let file = FileBlockReader::new(VirtualFile::open(path).await?);
+    let summary_blk = file.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
@@ -124,6 +127,7 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
                prev_key = Some(curr.next());
                true
            },
+            ctx,
        )
        .await?;
    let mut holes = heap.into_vec();
@@ -134,6 +138,7 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
 pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let storage_path = &cmd.path;
    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
+    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
    pageserver::virtual_file::init(10);
@@ -142,12 +147,12 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let mut total_delta_layers = 0usize;
    let mut total_image_layers = 0usize;
    let mut total_excess_layers = 0usize;
-    for tenant in fs::read_dir(storage_path.join("tenants"))? {
+    for tenant in fs::read_dir(storage_path.join(TENANTS_SEGMENT_NAME))? {
        let tenant = tenant?;
        if !tenant.file_type()?.is_dir() {
            continue;
        }
-        for timeline in fs::read_dir(tenant.path().join("timelines"))? {
+        for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? {
            let timeline = timeline?;
            if !timeline.file_type()?.is_dir() {
                continue;
@@ -162,7 +167,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
                    parse_filename(&layer.file_name().into_string().unwrap())
                {
                    if layer_file.is_delta {
-                        layer_file.holes = get_holes(&layer.path(), max_holes).await?;
+                        layer_file.holes = get_holes(&layer.path(), max_holes, &ctx).await?;
                        n_deltas += 1;
                    }
                    layers.push(layer_file);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -2,9 +2,12 @@ use std::path::{Path, PathBuf};

 use anyhow::Result;
 use clap::Subcommand;
+use pageserver::context::{DownloadBehavior, RequestContext};
+use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::block_io::BlockCursor;
 use pageserver::tenant::disk_btree::DiskBtreeReader;
 use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
+use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use pageserver::{page_cache, virtual_file};
 use pageserver::{
    repository::{Key, KEY_SIZE},
@@ -43,14 +46,12 @@ pub(crate) enum LayerCmd {
    },
 }

-async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
-    use pageserver::tenant::block_io::BlockReader;
-
+async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = path.as_ref();
    virtual_file::init(10);
    page_cache::init(100);
-    let file = FileBlockReader::new(VirtualFile::open(path)?);
-    let summary_blk = file.read_blk(0)?;
+    let file = FileBlockReader::new(VirtualFile::open(path).await?);
+    let summary_blk = file.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
@@ -68,11 +69,12 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
                all.push((curr, BlobRef(value_offset)));
                true
            },
+            ctx,
        )
        .await?;
-    let cursor = BlockCursor::new(&file);
+    let cursor = BlockCursor::new_fileblockreader(&file);
    for (k, v) in all {
-        let value = cursor.read_blob(v.pos()).await?;
+        let value = cursor.read_blob(v.pos(), ctx).await?;
        println!("key:{} value_len:{}", k, value.len());
    }
    // TODO(chi): special handling for last key?
@@ -80,15 +82,16 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
 }

 pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
+    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    match cmd {
        LayerCmd::List { path } => {
-            for tenant in fs::read_dir(path.join("tenants"))? {
+            for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? {
                let tenant = tenant?;
                if !tenant.file_type()?.is_dir() {
                    continue;
                }
                println!("tenant {}", tenant.file_name().to_string_lossy());
-                for timeline in fs::read_dir(tenant.path().join("timelines"))? {
+                for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? {
                    let timeline = timeline?;
                    if !timeline.file_type()?.is_dir() {
                        continue;
@@ -103,9 +106,9 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            timeline,
        } => {
            let timeline_path = path
-                .join("tenants")
+                .join(TENANTS_SEGMENT_NAME)
                .join(tenant)
-                .join("timelines")
+                .join(TIMELINES_SEGMENT_NAME)
                .join(timeline);
            let mut idx = 0;
            for layer in fs::read_dir(timeline_path)? {
@@ -154,7 +157,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                        );

                        if layer_file.is_delta {
-                            read_delta_file(layer.path()).await?;
+                            read_delta_file(layer.path(), &ctx).await?;
                        } else {
                            anyhow::bail!("not supported yet :(");
                        }
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -25,6 +25,7 @@ use crate::context::RequestContext;
 use crate::tenant::Timeline;
 use pageserver_api::reltag::{RelTag, SlruKind};

+use postgres_ffi::dispatch_pgversion;
 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
 use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
 use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
@@ -323,14 +324,25 @@ where
                .timeline
                .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
                .await?;
-            ensure!(img.len() == 512);
+
+            ensure!(
+                img.len()
+                    == dispatch_pgversion!(
+                        self.timeline.pg_version,
+                        pgv::bindings::SIZEOF_RELMAPFILE
+                    )
+            );
+
            Some(img)
        } else {
            None
        };

        if spcnode == GLOBALTABLESPACE_OID {
-            let pg_version_str = self.timeline.pg_version.to_string();
+            let pg_version_str = match self.timeline.pg_version {
+                14 | 15 => self.timeline.pg_version.to_string(),
+                ver => format!("{ver}\x0A"),
+            };
            let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
            self.ar.append(&header, pg_version_str.as_bytes()).await?;

@@ -374,7 +386,10 @@ where
            if let Some(img) = relmap_img {
                let dst_path = format!("base/{}/PG_VERSION", dbnode);

-                let pg_version_str = self.timeline.pg_version.to_string();
+                let pg_version_str = match self.timeline.pg_version {
+                    14 | 15 => self.timeline.pg_version.to_string(),
+                    ver => format!("{ver}\x0A"),
+                };
                let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
                self.ar.append(&header, pg_version_str.as_bytes()).await?;

--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -8,6 +8,7 @@ use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};

 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
+use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
@@ -20,6 +21,7 @@ use metrics::set_build_info_metric;
 use pageserver::{
    config::{defaults::*, PageServerConf},
    context::{DownloadBehavior, RequestContext},
+    deletion_queue::DeletionQueue,
    http, page_cache, page_service, task_mgr,
    task_mgr::TaskKind,
    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
@@ -346,9 +348,22 @@ fn start_pageserver(
        }
    };

+    // Top-level cancellation token for the process
+    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
+
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;

+    // Set up deletion queue
+    let (deletion_queue, deletion_workers) = DeletionQueue::new(
+        remote_storage.clone(),
+        ControlPlaneClient::new(conf, &shutdown_pageserver),
+        conf,
+    );
+    if let Some(deletion_workers) = deletion_workers {
+        deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
+    }
+
    // Up to this point no significant I/O has been done: this should have been fast.  Record
    // duration prior to starting I/O intensive phase of startup.
    startup_checkpoint("initial", "Starting loading tenants");
@@ -379,15 +394,16 @@ fn start_pageserver(
    };

    // Scan the local 'tenants/' directory and start loading the tenants
-    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
-
+    let deletion_queue_client = deletion_queue.new_client();
    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
            remote_storage: remote_storage.clone(),
+            deletion_queue_client,
        },
        order,
+        shutdown_pageserver.clone(),
    ))?;

    BACKGROUND_RUNTIME.spawn({
@@ -476,16 +492,20 @@ fn start_pageserver(
    {
        let _rt_guard = MGMT_REQUEST_RUNTIME.enter();

-        let router = http::make_router(
-            conf,
-            launch_ts,
-            http_auth,
-            broker_client.clone(),
-            remote_storage,
-            disk_usage_eviction_state,
-        )?
-        .build()
-        .map_err(|err| anyhow!(err))?;
+        let router_state = Arc::new(
+            http::routes::State::new(
+                conf,
+                http_auth.clone(),
+                remote_storage.clone(),
+                broker_client.clone(),
+                disk_usage_eviction_state,
+                deletion_queue.new_client(),
+            )
+            .context("Failed to initialize router state")?,
+        );
+        let router = http::make_router(router_state, launch_ts, http_auth.clone())?
+            .build()
+            .map_err(|err| anyhow!(err))?;
        let service = utils::http::RouterService::new(router).unwrap();
        let server = hyper::Server::from_tcp(http_listener)?
            .serve(service)
@@ -514,6 +534,9 @@ fn start_pageserver(
            // creates a child context with the right DownloadBehavior.
            DownloadBehavior::Error,
        );
+
+        let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
+
        task_mgr::spawn(
            crate::BACKGROUND_RUNTIME.handle(),
            TaskKind::MetricsCollection,
@@ -540,6 +563,7 @@ fn start_pageserver(
                    conf.cached_metric_collection_interval,
                    conf.synthetic_size_calculation_interval,
                    conf.id,
+                    local_disk_storage,
                    metrics_ctx,
                )
                .instrument(info_span!("metrics_collection"))
@@ -603,7 +627,12 @@ fn start_pageserver(
            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
            // The plan is to change that over time.
            shutdown_pageserver.take();
-            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
+            let bg_remote_storage = remote_storage.clone();
+            let bg_deletion_queue = deletion_queue.clone();
+            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
+                bg_remote_storage.map(|_| bg_deletion_queue),
+                0,
+            ));
            unreachable!()
        }
    })
@@ -615,7 +644,7 @@ fn create_remote_storage_client(
    let config = if let Some(config) = &conf.remote_storage_config {
        config
    } else {
-        // No remote storage configured.
+        tracing::warn!("no remote storage configured, this is a deprecated configuration");
        return Ok(None);
    };

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -11,6 +11,7 @@ use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
+use utils::logging::SecretString;

 use once_cell::sync::OnceCell;
 use reqwest::Url;
@@ -32,7 +33,8 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{
-    TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
+    TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
+    TIMELINES_SEGMENT_NAME,
 };
 use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
@@ -63,7 +65,7 @@ pub mod defaults {
        super::ConfigurableSemaphore::DEFAULT_INITIAL.get();

    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
-    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
+    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
@@ -72,7 +74,7 @@ pub mod defaults {
    /// Default built-in configuration file.
    ///
    pub const DEFAULT_CONFIG_FILE: &str = formatcp!(
-        r###"
+        r#"
 # Initial configuration file created by 'pageserver --init'
 #listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
 #listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
@@ -117,7 +119,7 @@ pub mod defaults {

 [remote_storage]

-"###
+"#
    );
 }

@@ -204,6 +206,11 @@ pub struct PageServerConf {
    /// has it's initial logical size calculated. Not running background tasks for some seconds is
    /// not terrible.
    pub background_task_maximum_delay: Duration,
+
+    pub control_plane_api: Option<Url>,
+
+    /// JWT token for use with the control plane API.
+    pub control_plane_api_token: Option<SecretString>,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -278,6 +285,9 @@ struct PageServerConfigBuilder {
    ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,

    background_task_maximum_delay: BuilderValue<Duration>,
+
+    control_plane_api: BuilderValue<Option<Url>>,
+    control_plane_api_token: BuilderValue<Option<SecretString>>,
 }

 impl Default for PageServerConfigBuilder {
@@ -340,6 +350,9 @@ impl Default for PageServerConfigBuilder {
                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
            )
            .unwrap()),
+
+            control_plane_api: Set(None),
+            control_plane_api_token: Set(None),
        }
    }
 }
@@ -468,6 +481,10 @@ impl PageServerConfigBuilder {
        self.background_task_maximum_delay = BuilderValue::Set(delay);
    }

+    pub fn control_plane_api(&mut self, api: Option<Url>) {
+        self.control_plane_api = BuilderValue::Set(api)
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
@@ -553,6 +570,12 @@ impl PageServerConfigBuilder {
            background_task_maximum_delay: self
                .background_task_maximum_delay
                .ok_or(anyhow!("missing background_task_maximum_delay"))?,
+            control_plane_api: self
+                .control_plane_api
+                .ok_or(anyhow!("missing control_plane_api"))?,
+            control_plane_api_token: self
+                .control_plane_api_token
+                .ok_or(anyhow!("missing control_plane_api_token"))?,
        })
    }
 }
@@ -563,7 +586,28 @@ impl PageServerConf {
    //

    pub fn tenants_path(&self) -> PathBuf {
-        self.workdir.join("tenants")
+        self.workdir.join(TENANTS_SEGMENT_NAME)
+    }
+
+    pub fn deletion_prefix(&self) -> PathBuf {
+        self.workdir.join("deletion")
+    }
+
+    pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
+        // Encode a version in the filename, so that if we ever switch away from JSON we can
+        // increment this.
+        const VERSION: u8 = 1;
+
+        self.deletion_prefix()
+            .join(format!("{sequence:016x}-{VERSION:02x}.list"))
+    }
+
+    pub fn deletion_header_path(&self) -> PathBuf {
+        // Encode a version in the filename, so that if we ever switch away from JSON we can
+        // increment this.
+        const VERSION: u8 = 1;
+
+        self.deletion_prefix().join(format!("header-{VERSION:02x}"))
    }

    pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
@@ -643,23 +687,6 @@ impl PageServerConf {
            .join(METADATA_FILE_NAME)
    }

-    /// Files on the remote storage are stored with paths, relative to the workdir.
-    /// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path.
-    ///
-    /// Errors if the path provided does not start from pageserver's workdir.
-    pub fn remote_path(&self, local_path: &Path) -> anyhow::Result<RemotePath> {
-        local_path
-            .strip_prefix(&self.workdir)
-            .context("Failed to strip workdir prefix")
-            .and_then(RemotePath::new)
-            .with_context(|| {
-                format!(
-                    "Failed to resolve remote part of path {:?} for base {:?}",
-                    local_path, self.workdir
-                )
-            })
-    }
-
    /// Turns storage remote path of a file into its local path.
    pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf {
        remote_path.with_base(&self.workdir)
@@ -671,26 +698,18 @@ impl PageServerConf {
    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

+        #[allow(clippy::manual_range_patterns)]
        match pg_version {
-            14 => Ok(path.join(format!("v{pg_version}"))),
-            15 => Ok(path.join(format!("v{pg_version}"))),
+            14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }

    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        match pg_version {
-            14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
-            15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
-            _ => bail!("Unsupported postgres version: {}", pg_version),
-        }
+        Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
    }
    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        match pg_version {
-            14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
-            15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
-            _ => bail!("Unsupported postgres version: {}", pg_version),
-        }
+        Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
    }

    /// Parse a configuration file (pageserver.toml) into a PageServerConf struct,
@@ -758,6 +777,14 @@ impl PageServerConf {
                },
                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
                "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
+                "control_plane_api" => {
+                    let parsed = parse_toml_string(key, item)?;
+                    if parsed.is_empty() {
+                        builder.control_plane_api(None)
+                    } else {
+                        builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
+                    }
+                },
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -926,6 +953,8 @@ impl PageServerConf {
            test_remote_failures: 0,
            ondemand_download_behavior_treat_error_as_warn: false,
            background_task_maximum_delay: Duration::ZERO,
+            control_plane_api: None,
+            control_plane_api_token: None,
        }
    }
 }
@@ -1149,6 +1178,8 @@ background_task_maximum_delay = '334 s'
                background_task_maximum_delay: humantime::parse_duration(
                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
                )?,
+                control_plane_api: None,
+                control_plane_api_token: None
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1204,6 +1235,8 @@ background_task_maximum_delay = '334 s'
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
                background_task_maximum_delay: Duration::from_secs(334),
+                control_plane_api: None,
+                control_plane_api_token: None
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -1,188 +1,54 @@
-//!
 //! Periodically collect consumption metrics for all active tenants
 //! and push them to a HTTP endpoint.
-//! Cache metrics to send only the updated ones.
-//!
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
-use anyhow;
-use chrono::{DateTime, Utc};
-use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
+use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
 use reqwest::Url;
-use serde::Serialize;
-use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
+use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
 use tracing::*;
-use utils::id::{NodeId, TenantId, TimelineId};
-use utils::lsn::Lsn;
+use utils::id::NodeId;
+
+mod metrics;
+use metrics::MetricsKey;
+mod disk_cache;
+mod upload;

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

-#[serde_as]
-#[derive(Serialize, Debug, Clone, Copy)]
-struct Ids {
-    #[serde_as(as = "DisplayFromStr")]
-    tenant_id: TenantId,
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    #[serde(skip_serializing_if = "Option::is_none")]
-    timeline_id: Option<TimelineId>,
-}
+/// Basically a key-value pair, but usually in a Vec except for [`Cache`].
+///
+/// This is as opposed to `consumption_metrics::Event` which is the externally communicated form.
+/// Difference is basically the missing idempotency key, which lives only for the duration of
+/// upload attempts.
+type RawMetric = (MetricsKey, (EventType, u64));

-/// Key that uniquely identifies the object, this metric describes.
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-struct MetricsKey {
-    tenant_id: TenantId,
-    timeline_id: Option<TimelineId>,
-    metric: &'static str,
-}
-
-impl MetricsKey {
-    const fn absolute_values(self) -> AbsoluteValueFactory {
-        AbsoluteValueFactory(self)
-    }
-    const fn incremental_values(self) -> IncrementalValueFactory {
-        IncrementalValueFactory(self)
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only absolute values.
-struct AbsoluteValueFactory(MetricsKey);
-
-impl AbsoluteValueFactory {
-    fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
-        let key = self.0;
-        (key, (EventType::Absolute { time }, val))
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only incremental values.
-struct IncrementalValueFactory(MetricsKey);
-
-impl IncrementalValueFactory {
-    #[allow(clippy::wrong_self_convention)]
-    fn from_previous_up_to(
-        self,
-        prev_end: DateTime<Utc>,
-        up_to: DateTime<Utc>,
-        val: u64,
-    ) -> (MetricsKey, (EventType, u64)) {
-        let key = self.0;
-        // cannot assert prev_end < up_to because these are realtime clock based
-        (
-            key,
-            (
-                EventType::Incremental {
-                    start_time: prev_end,
-                    stop_time: up_to,
-                },
-                val,
-            ),
-        )
-    }
-
-    fn key(&self) -> &MetricsKey {
-        &self.0
-    }
-}
-
-// the static part of a MetricsKey
-impl MetricsKey {
-    /// Absolute value of [`Timeline::get_last_record_lsn`].
-    ///
-    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
-    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "written_size",
-        }
-        .absolute_values()
-    }
-
-    /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
-    /// previously sent, starting from the previously sent incremental time range ending at the
-    /// latest absolute measurement.
-    const fn written_size_delta(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> IncrementalValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            // the name here is correctly about data not size, because that is what is wanted by
-            // downstream pipeline
-            metric: "written_data_bytes_delta",
-        }
-        .incremental_values()
-    }
-
-    /// Exact [`Timeline::get_current_logical_size`].
-    ///
-    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
-    const fn timeline_logical_size(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "timeline_logical_size",
-        }
-        .absolute_values()
-    }
-
-    /// [`Tenant::remote_size`]
-    ///
-    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
-    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "remote_storage_size",
-        }
-        .absolute_values()
-    }
-
-    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
-    ///
-    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
-    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "resident_size",
-        }
-        .absolute_values()
-    }
-
-    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
-    ///
-    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
-    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "synthetic_storage_size",
-        }
-        .absolute_values()
-    }
-}
+/// Caches the [`RawMetric`]s
+///
+/// In practice, during startup, last sent values are stored here to be used in calculating new
+/// ones. After successful uploading, the cached values are updated to cache. This used to be used
+/// for deduplication, but that is no longer needed.
+type Cache = HashMap<MetricsKey, (EventType, u64)>;

 /// Main thread that serves metrics collection
 pub async fn collect_metrics(
    metric_collection_endpoint: &Url,
    metric_collection_interval: Duration,
-    cached_metric_collection_interval: Duration,
+    _cached_metric_collection_interval: Duration,
    synthetic_size_calculation_interval: Duration,
    node_id: NodeId,
+    local_disk_storage: PathBuf,
    ctx: RequestContext,
 ) -> anyhow::Result<()> {
-    let mut ticker = tokio::time::interval(metric_collection_interval);
-    info!("starting collect_metrics");
+    if _cached_metric_collection_interval != Duration::ZERO {
+        tracing::warn!(
+            "cached_metric_collection_interval is no longer used, please set it to zero."
+        )
+    }

    // spin up background worker that caclulates tenant sizes
    let worker_ctx =
@@ -202,543 +68,216 @@ pub async fn collect_metrics(
        },
    );

+    let path: Arc<PathBuf> = Arc::new(local_disk_storage);
+
+    let cancel = task_mgr::shutdown_token();
+
+    let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval);
+
+    let mut cached_metrics = tokio::select! {
+        _ = cancel.cancelled() => return Ok(()),
+        ret = restore_and_reschedule => ret,
+    };
+
    // define client here to reuse it for all requests
    let client = reqwest::ClientBuilder::new()
        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
        .build()
        .expect("Failed to create http client with timeout");
-    let mut cached_metrics = HashMap::new();
-    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
-
-    loop {
-        tokio::select! {
-            _ = task_mgr::shutdown_watcher() => {
-                info!("collect_metrics received cancellation request");
-                return Ok(());
-            },
-            tick_at = ticker.tick() => {
-
-                // send cached metrics every cached_metric_collection_interval
-                let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
-
-                if send_cached {
-                    prev_iteration_time = std::time::Instant::now();
-                }
-
-                collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
-
-                crate::tenant::tasks::warn_when_period_overrun(
-                    tick_at.elapsed(),
-                    metric_collection_interval,
-                    "consumption_metrics_collect_metrics",
-                );
-            }
-        }
-    }
-}
-
-/// One iteration of metrics collection
-///
-/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
-/// Cache metrics to avoid sending the same metrics multiple times.
-///
-/// This function handles all errors internally
-/// and doesn't break iteration if just one tenant fails.
-///
-/// TODO
-/// - refactor this function (chunking+sending part) to reuse it in proxy module;
-async fn collect_metrics_iteration(
-    client: &reqwest::Client,
-    cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
-    metric_collection_endpoint: &reqwest::Url,
-    node_id: NodeId,
-    ctx: &RequestContext,
-    send_cached: bool,
-) {
-    let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
-    trace!(
-        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
-        metric_collection_endpoint
-    );
-
-    // get list of tenants
-    let tenants = match mgr::list_tenants().await {
-        Ok(tenants) => tenants,
-        Err(err) => {
-            error!("failed to list tenants: {:?}", err);
-            return;
-        }
-    };
-
-    // iterate through list of Active tenants and collect metrics
-    for (tenant_id, tenant_state) in tenants {
-        if tenant_state != TenantState::Active {
-            continue;
-        }
-
-        let tenant = match mgr::get_tenant(tenant_id, true).await {
-            Ok(tenant) => tenant,
-            Err(err) => {
-                // It is possible that tenant was deleted between
-                // `list_tenants` and `get_tenant`, so just warn about it.
-                warn!("failed to get tenant {tenant_id:?}: {err:?}");
-                continue;
-            }
-        };
-
-        let mut tenant_resident_size = 0;
-
-        // iterate through list of timelines in tenant
-        for timeline in tenant.list_timelines() {
-            // collect per-timeline metrics only for active timelines
-
-            let timeline_id = timeline.timeline_id;
-
-            match TimelineSnapshot::collect(&timeline, ctx) {
-                Ok(Some(snap)) => {
-                    snap.to_metrics(
-                        tenant_id,
-                        timeline_id,
-                        Utc::now(),
-                        &mut current_metrics,
-                        cached_metrics,
-                    );
-                }
-                Ok(None) => {}
-                Err(e) => {
-                    error!(
-                        "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
-                        timeline.timeline_id
-                    );
-                    continue;
-                }
-            }
-
-            tenant_resident_size += timeline.resident_physical_size();
-        }
-
-        current_metrics
-            .push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));
-
-        current_metrics
-            .push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));
-
-        // Note that this metric is calculated in a separate bgworker
-        // Here we only use cached value, which may lag behind the real latest one
-        let synthetic_size = tenant.cached_synthetic_size();
-
-        if synthetic_size != 0 {
-            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics
-                .push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
-        }
-    }
-
-    // Filter metrics, unless we want to send all metrics, including cached ones.
-    // See: https://github.com/neondatabase/neon/issues/3485
-    if !send_cached {
-        current_metrics.retain(|(curr_key, (kind, curr_val))| {
-            if kind.is_incremental() {
-                // incremental values (currently only written_size_delta) should not get any cache
-                // deduplication because they will be used by upstream for "is still alive."
-                true
-            } else {
-                match cached_metrics.get(curr_key) {
-                    Some((_, val)) => val != curr_val,
-                    None => true,
-                }
-            }
-        });
-    }
-
-    if current_metrics.is_empty() {
-        trace!("no new metrics to send");
-        return;
-    }
-
-    // Send metrics.
-    // Split into chunks of 1000 metrics to avoid exceeding the max request size
-    let chunks = current_metrics.chunks(CHUNK_SIZE);
-
-    let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);

    let node_id = node_id.to_string();

-    for chunk in chunks {
-        chunk_to_send.clear();
+    // reminder: ticker is ready immediatedly
+    let mut ticker = tokio::time::interval(metric_collection_interval);

-        // enrich metrics with type,timestamp and idempotency key before sending
-        chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
-            kind: *when,
-            metric: curr_key.metric,
-            idempotency_key: idempotency_key(&node_id),
-            value: *curr_val,
-            extra: Ids {
-                tenant_id: curr_key.tenant_id,
-                timeline_id: curr_key.timeline_id,
-            },
-        }));
+    loop {
+        let tick_at = tokio::select! {
+            _ = cancel.cancelled() => return Ok(()),
+            tick_at = ticker.tick() => tick_at,
+        };

-        const MAX_RETRIES: u32 = 3;
+        // these are point in time, with variable "now"
+        let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;

-        for attempt in 0..MAX_RETRIES {
-            let res = client
-                .post(metric_collection_endpoint.clone())
-                .json(&EventChunk {
-                    events: (&chunk_to_send).into(),
-                })
-                .send()
-                .await;
+        if metrics.is_empty() {
+            continue;
+        }

-            match res {
-                Ok(res) => {
-                    if res.status().is_success() {
-                        // update cached metrics after they were sent successfully
-                        for (curr_key, curr_val) in chunk.iter() {
-                            cached_metrics.insert(curr_key.clone(), *curr_val);
-                        }
-                    } else {
-                        error!("metrics endpoint refused the sent metrics: {:?}", res);
-                        for metric in chunk_to_send
-                            .iter()
-                            .filter(|metric| metric.value > (1u64 << 40))
-                        {
-                            // Report if the metric value is suspiciously large
-                            error!("potentially abnormal metric value: {:?}", metric);
-                        }
-                    }
-                    break;
+        let metrics = Arc::new(metrics);
+
+        // why not race cancellation here? because we are one of the last tasks, and if we are
+        // already here, better to try to flush the new values.
+
+        let flush = async {
+            match disk_cache::flush_metrics_to_disk(&metrics, &path).await {
+                Ok(()) => {
+                    tracing::debug!("flushed metrics to disk");
                }
-                Err(err) if err.is_timeout() => {
-                    error!(attempt, "timeout sending metrics, retrying immediately");
-                    continue;
-                }
-                Err(err) => {
-                    error!(attempt, ?err, "failed to send metrics");
-                    break;
+                Err(e) => {
+                    // idea here is that if someone creates a directory as our path, then they
+                    // might notice it from the logs before shutdown and remove it
+                    tracing::error!("failed to persist metrics to {path:?}: {e:#}");
                }
            }
+        };
+
+        let upload = async {
+            let res = upload::upload_metrics(
+                &client,
+                metric_collection_endpoint,
+                &cancel,
+                &node_id,
+                &metrics,
+                &mut cached_metrics,
+            )
+            .await;
+            if let Err(e) = res {
+                // serialization error which should never happen
+                tracing::error!("failed to upload due to {e:#}");
+            }
+        };
+
+        // let these run concurrently
+        let (_, _) = tokio::join!(flush, upload);
+
+        crate::tenant::tasks::warn_when_period_overrun(
+            tick_at.elapsed(),
+            metric_collection_interval,
+            "consumption_metrics_collect_metrics",
+        );
+    }
+}
+
+/// Called on the first iteration in an attempt to join the metric uploading schedule from previous
+/// pageserver session. Pageserver is supposed to upload at intervals regardless of restarts.
+///
+/// Cancellation safe.
+async fn restore_and_reschedule(
+    path: &Arc<PathBuf>,
+    metric_collection_interval: Duration,
+) -> Cache {
+    let (cached, earlier_metric_at) = match disk_cache::read_metrics_from_disk(path.clone()).await {
+        Ok(found_some) => {
+            // there is no min needed because we write these sequentially in
+            // collect_all_metrics
+            let earlier_metric_at = found_some
+                .iter()
+                .map(|(_, (et, _))| et.recorded_at())
+                .copied()
+                .next();
+
+            let cached = found_some.into_iter().collect::<Cache>();
+
+            (cached, earlier_metric_at)
+        }
+        Err(e) => {
+            use std::io::{Error, ErrorKind};
+
+            let root = e.root_cause();
+            let maybe_ioerr = root.downcast_ref::<Error>();
+            let is_not_found = maybe_ioerr.is_some_and(|e| e.kind() == ErrorKind::NotFound);
+
+            if !is_not_found {
+                tracing::info!("failed to read any previous metrics from {path:?}: {e:#}");
+            }
+
+            (HashMap::new(), None)
+        }
+    };
+
+    if let Some(earlier_metric_at) = earlier_metric_at {
+        let earlier_metric_at: SystemTime = earlier_metric_at.into();
+
+        let error = reschedule(earlier_metric_at, metric_collection_interval).await;
+
+        if let Some(error) = error {
+            if error.as_secs() >= 60 {
+                tracing::info!(
+                    error_ms = error.as_millis(),
+                    "startup scheduling error due to restart"
+                )
+            }
        }
    }
+
+    cached
 }

-/// Internal type to make timeline metric production testable.
-///
-/// As this value type contains all of the information needed from a timeline to produce the
-/// metrics, it can easily be created with different values in test.
-struct TimelineSnapshot {
-    loaded_at: (Lsn, SystemTime),
-    last_record_lsn: Lsn,
-    current_exact_logical_size: Option<u64>,
-}
+async fn reschedule(
+    earlier_metric_at: SystemTime,
+    metric_collection_interval: Duration,
+) -> Option<Duration> {
+    let now = SystemTime::now();
+    match now.duration_since(earlier_metric_at) {
+        Ok(from_last_send) if from_last_send < metric_collection_interval => {
+            let sleep_for = metric_collection_interval - from_last_send;

-impl TimelineSnapshot {
-    /// Collect the metrics from an actual timeline.
-    ///
-    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
-    ///
-    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
-    fn collect(
-        t: &Arc<crate::tenant::Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Option<Self>> {
-        use anyhow::Context;
+            let deadline = std::time::Instant::now() + sleep_for;

-        if !t.is_active() {
-            // no collection for broken or stopping needed, we will still keep the cached values
-            // though at the caller.
-            Ok(None)
-        } else {
-            let loaded_at = t.loaded_at;
-            let last_record_lsn = t.get_last_record_lsn();
+            tokio::time::sleep_until(deadline.into()).await;

-            let current_exact_logical_size = {
-                let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
-                let res = span
-                    .in_scope(|| t.get_current_logical_size(ctx))
-                    .context("get_current_logical_size");
-                match res? {
-                    // Only send timeline logical size when it is fully calculated.
-                    (size, is_exact) if is_exact => Some(size),
-                    (_, _) => None,
-                }
-            };
+            let now = std::time::Instant::now();

-            Ok(Some(TimelineSnapshot {
-                loaded_at,
-                last_record_lsn,
-                current_exact_logical_size,
-            }))
-        }
-    }
-
-    /// Produce the timeline consumption metrics into the `metrics` argument.
-    fn to_metrics(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        now: DateTime<Utc>,
-        metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
-        cache: &HashMap<MetricsKey, (EventType, u64)>,
-    ) {
-        let timeline_written_size = u64::from(self.last_record_lsn);
-
-        let (key, written_size_now) =
-            MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
-
-        // last_record_lsn can only go up, right now at least, TODO: #2592 or related
-        // features might change this.
-
-        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
-
-        // use this when available, because in a stream of incremental values, it will be
-        // accurate where as when last_record_lsn stops moving, we will only cache the last
-        // one of those.
-        let last_stop_time = cache
-            .get(written_size_delta_key.key())
-            .map(|(until, _val)| {
-                until
-                    .incremental_timerange()
-                    .expect("never create EventType::Absolute for written_size_delta")
-                    .end
-            });
-
-        // by default, use the last sent written_size as the basis for
-        // calculating the delta. if we don't yet have one, use the load time value.
-        let prev = cache
-            .get(&key)
-            .map(|(prev_at, prev)| {
-                // use the prev time from our last incremental update, or default to latest
-                // absolute update on the first round.
-                let prev_at = prev_at
-                    .absolute_time()
-                    .expect("never create EventType::Incremental for written_size");
-                let prev_at = last_stop_time.unwrap_or(prev_at);
-                (*prev_at, *prev)
-            })
-            .unwrap_or_else(|| {
-                // if we don't have a previous point of comparison, compare to the load time
-                // lsn.
-                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
-                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
-            });
-
-        // written_size_bytes_delta
-        metrics.extend(
-            if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
-                let up_to = written_size_now
-                    .0
-                    .absolute_time()
-                    .expect("never create EventType::Incremental for written_size");
-                let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
-                Some(key_value)
+            // executor threads might be busy, add extra measurements
+            Some(if now < deadline {
+                deadline - now
            } else {
-                None
-            },
-        );
-
-        // written_size
-        metrics.push((key, written_size_now));
-
-        if let Some(size) = self.current_exact_logical_size {
-            metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
+                now - deadline
+            })
+        }
+        Ok(from_last_send) => Some(from_last_send.saturating_sub(metric_collection_interval)),
+        Err(_) => {
+            tracing::warn!(
+                ?now,
+                ?earlier_metric_at,
+                "oldest recorded metric is in future; first values will come out with inconsistent timestamps"
+            );
+            earlier_metric_at.duration_since(now).ok()
        }
    }
 }

 /// Caclculate synthetic size for each active tenant
-pub async fn calculate_synthetic_size_worker(
+async fn calculate_synthetic_size_worker(
    synthetic_size_calculation_interval: Duration,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");

+    // reminder: ticker is ready immediatedly
    let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
+    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

    loop {
-        tokio::select! {
-            _ = task_mgr::shutdown_watcher() => {
-                return Ok(());
-            },
-            tick_at = ticker.tick() => {
+        let tick_at = tokio::select! {
+            _ = task_mgr::shutdown_watcher() => return Ok(()),
+            tick_at = ticker.tick() => tick_at,
+        };

-                let tenants = match mgr::list_tenants().await {
-                    Ok(tenants) => tenants,
-                    Err(e) => {
-                        warn!("cannot get tenant list: {e:#}");
-                        continue;
-                    }
-                };
-                // iterate through list of Active tenants and collect metrics
-                for (tenant_id, tenant_state) in tenants {
+        let tenants = match mgr::list_tenants().await {
+            Ok(tenants) => tenants,
+            Err(e) => {
+                warn!("cannot get tenant list: {e:#}");
+                continue;
+            }
+        };

-                    if tenant_state != TenantState::Active {
-                        continue;
-                    }
-
-                    if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
-                    {
-                        if let Err(e) = tenant.calculate_synthetic_size(
-                            LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize,
-                            ctx).await {
-                            error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
-                        }
-                    }
+        for (tenant_id, tenant_state) in tenants {
+            if tenant_state != TenantState::Active {
+                continue;
+            }

+            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
+                if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
+                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
                }
-
-                crate::tenant::tasks::warn_when_period_overrun(
-                    tick_at.elapsed(),
-                    synthetic_size_calculation_interval,
-                    "consumption_metrics_synthetic_size_worker",
-                );
            }
        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashMap;
-
-    use std::time::SystemTime;
-    use utils::{
-        id::{TenantId, TimelineId},
-        lsn::Lsn,
-    };
-
-    use crate::consumption_metrics::MetricsKey;
-
-    use super::TimelineSnapshot;
-    use chrono::{DateTime, Utc};
-
-    #[test]
-    fn startup_collected_timeline_metrics_before_advancing() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::new();
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, SystemTime::now()),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        let now = DateTime::<Utc>::from(SystemTime::now());
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                    snap.loaded_at.1.into(),
-                    now,
-                    0
-                ),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    #[test]
-    fn startup_collected_timeline_metrics_second_round() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let [now, before, init] = time_backwards();
-
-        let now = DateTime::<Utc>::from(now);
-        let before = DateTime::<Utc>::from(before);
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::from([
-            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
-        ]);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, init),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id)
-                    .from_previous_up_to(before, now, 0),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    #[test]
-    fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let [now, just_before, before, init] = time_backwards();
-
-        let now = DateTime::<Utc>::from(now);
-        let just_before = DateTime::<Utc>::from(just_before);
-        let before = DateTime::<Utc>::from(before);
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::from([
-            // at t=before was the last time the last_record_lsn changed
-            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
-            // end time of this event is used for the next ones
-            MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                before,
-                just_before,
-                0,
-            ),
-        ]);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, init),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                    just_before,
-                    now,
-                    0
-                ),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
-        let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
-        times[0] = std::time::SystemTime::now();
-        for behind in 1..N {
-            times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
-        }
-
-        times
+
+        crate::tenant::tasks::warn_when_period_overrun(
+            tick_at.elapsed(),
+            synthetic_size_calculation_interval,
+            "consumption_metrics_synthetic_size_worker",
+        );
    }
 }
--- a/pageserver/src/consumption_metrics/disk_cache.rs
+++ b/pageserver/src/consumption_metrics/disk_cache.rs
@@ -0,0 +1,117 @@
+use anyhow::Context;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use super::RawMetric;
+
+pub(super) async fn read_metrics_from_disk(path: Arc<PathBuf>) -> anyhow::Result<Vec<RawMetric>> {
+    // do not add context to each error, callsite will log with full path
+    let span = tracing::Span::current();
+    tokio::task::spawn_blocking(move || {
+        let _e = span.entered();
+
+        if let Some(parent) = path.parent() {
+            if let Err(e) = scan_and_delete_with_same_prefix(&path) {
+                tracing::info!("failed to cleanup temporary files in {parent:?}: {e:#}");
+            }
+        }
+
+        let mut file = std::fs::File::open(&*path)?;
+        let reader = std::io::BufReader::new(&mut file);
+        anyhow::Ok(serde_json::from_reader::<_, Vec<RawMetric>>(reader)?)
+    })
+    .await
+    .context("read metrics join error")
+    .and_then(|x| x)
+}
+
+fn scan_and_delete_with_same_prefix(path: &std::path::Path) -> std::io::Result<()> {
+    let it = std::fs::read_dir(path.parent().expect("caller checked"))?;
+
+    let prefix = path.file_name().expect("caller checked").to_string_lossy();
+
+    for entry in it {
+        let entry = entry?;
+        if !entry.metadata()?.is_file() {
+            continue;
+        }
+        let file_name = entry.file_name();
+
+        if path.file_name().unwrap() == file_name {
+            // do not remove our actual file
+            continue;
+        }
+
+        let file_name = file_name.to_string_lossy();
+
+        if !file_name.starts_with(&*prefix) {
+            continue;
+        }
+
+        let path = entry.path();
+
+        if let Err(e) = std::fs::remove_file(&path) {
+            tracing::warn!("cleaning up old tempfile {file_name:?} failed: {e:#}");
+        } else {
+            tracing::info!("cleaned up old tempfile {file_name:?}");
+        }
+    }
+
+    Ok(())
+}
+
+pub(super) async fn flush_metrics_to_disk(
+    current_metrics: &Arc<Vec<RawMetric>>,
+    path: &Arc<PathBuf>,
+) -> anyhow::Result<()> {
+    use std::io::Write;
+
+    anyhow::ensure!(path.parent().is_some(), "path must have parent: {path:?}");
+    anyhow::ensure!(
+        path.file_name().is_some(),
+        "path must have filename: {path:?}"
+    );
+
+    let span = tracing::Span::current();
+    tokio::task::spawn_blocking({
+        let current_metrics = current_metrics.clone();
+        let path = path.clone();
+        move || {
+            let _e = span.entered();
+
+            let parent = path.parent().expect("existence checked");
+            let file_name = path.file_name().expect("existence checked");
+            let mut tempfile = tempfile::Builder::new()
+                .prefix(file_name)
+                .suffix(".tmp")
+                .tempfile_in(parent)?;
+
+            tracing::debug!("using tempfile {:?}", tempfile.path());
+
+            // write out all of the raw metrics, to be read out later on restart as cached values
+            {
+                let mut writer = std::io::BufWriter::new(&mut tempfile);
+                serde_json::to_writer(&mut writer, &*current_metrics)
+                    .context("serialize metrics")?;
+                writer
+                    .into_inner()
+                    .map_err(|_| anyhow::anyhow!("flushing metrics failed"))?;
+            }
+
+            tempfile.flush()?;
+            tempfile.as_file().sync_all()?;
+
+            fail::fail_point!("before-persist-last-metrics-collected");
+
+            drop(tempfile.persist(&*path).map_err(|e| e.error)?);
+
+            let f = std::fs::File::open(path.parent().unwrap())?;
+            f.sync_all()?;
+
+            anyhow::Ok(())
+        }
+    })
+    .await
+    .with_context(|| format!("write metrics to {path:?} join error"))
+    .and_then(|x| x.with_context(|| format!("write metrics to {path:?}")))
+}
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -0,0 +1,455 @@
+use crate::context::RequestContext;
+use anyhow::Context;
+use chrono::{DateTime, Utc};
+use consumption_metrics::EventType;
+use futures::stream::StreamExt;
+use serde_with::serde_as;
+use std::{sync::Arc, time::SystemTime};
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+use super::{Cache, RawMetric};
+
+/// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events`
+/// instead of static str.
+// Do not rename any of these without first consulting with data team and partner
+// management.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
+pub(super) enum Name {
+    /// Timeline last_record_lsn, absolute
+    #[serde(rename = "written_size")]
+    WrittenSize,
+    /// Timeline last_record_lsn, incremental
+    #[serde(rename = "written_data_bytes_delta")]
+    WrittenSizeDelta,
+    /// Timeline logical size
+    #[serde(rename = "timeline_logical_size")]
+    LogicalSize,
+    /// Tenant remote size
+    #[serde(rename = "remote_storage_size")]
+    RemoteSize,
+    /// Tenant resident size
+    #[serde(rename = "resident_size")]
+    ResidentSize,
+    /// Tenant synthetic size
+    #[serde(rename = "synthetic_storage_size")]
+    SyntheticSize,
+}
+
+/// Key that uniquely identifies the object this metric describes.
+///
+/// This is a denormalization done at the MetricsKey const methods; these should not be constructed
+/// elsewhere.
+#[serde_with::serde_as]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
+pub(crate) struct MetricsKey {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub(super) tenant_id: TenantId,
+
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub(super) timeline_id: Option<TimelineId>,
+
+    pub(super) metric: Name,
+}
+
+impl MetricsKey {
+    const fn absolute_values(self) -> AbsoluteValueFactory {
+        AbsoluteValueFactory(self)
+    }
+    const fn incremental_values(self) -> IncrementalValueFactory {
+        IncrementalValueFactory(self)
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only absolute values.
+struct AbsoluteValueFactory(MetricsKey);
+
+impl AbsoluteValueFactory {
+    const fn at(self, time: DateTime<Utc>, val: u64) -> RawMetric {
+        let key = self.0;
+        (key, (EventType::Absolute { time }, val))
+    }
+
+    fn key(&self) -> &MetricsKey {
+        &self.0
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only incremental values.
+struct IncrementalValueFactory(MetricsKey);
+
+impl IncrementalValueFactory {
+    #[allow(clippy::wrong_self_convention)]
+    const fn from_until(
+        self,
+        prev_end: DateTime<Utc>,
+        up_to: DateTime<Utc>,
+        val: u64,
+    ) -> RawMetric {
+        let key = self.0;
+        // cannot assert prev_end < up_to because these are realtime clock based
+        let when = EventType::Incremental {
+            start_time: prev_end,
+            stop_time: up_to,
+        };
+        (key, (when, val))
+    }
+
+    fn key(&self) -> &MetricsKey {
+        &self.0
+    }
+}
+
+// the static part of a MetricsKey
+impl MetricsKey {
+    /// Absolute value of [`Timeline::get_last_record_lsn`].
+    ///
+    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
+    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::WrittenSize,
+        }
+        .absolute_values()
+    }
+
+    /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
+    /// previously sent, starting from the previously sent incremental time range ending at the
+    /// latest absolute measurement.
+    const fn written_size_delta(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> IncrementalValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::WrittenSizeDelta,
+        }
+        .incremental_values()
+    }
+
+    /// Exact [`Timeline::get_current_logical_size`].
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    const fn timeline_logical_size(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::LogicalSize,
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::remote_size`]
+    ///
+    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
+    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: Name::RemoteSize,
+        }
+        .absolute_values()
+    }
+
+    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
+    ///
+    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
+    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: Name::ResidentSize,
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
+    ///
+    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
+    /// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker
+    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: Name::SyntheticSize,
+        }
+        .absolute_values()
+    }
+}
+
+pub(super) async fn collect_all_metrics(
+    cached_metrics: &Cache,
+    ctx: &RequestContext,
+) -> Vec<RawMetric> {
+    use pageserver_api::models::TenantState;
+
+    let started_at = std::time::Instant::now();
+
+    let tenants = match crate::tenant::mgr::list_tenants().await {
+        Ok(tenants) => tenants,
+        Err(err) => {
+            tracing::error!("failed to list tenants: {:?}", err);
+            return vec![];
+        }
+    };
+
+    let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
+        if state != TenantState::Active {
+            None
+        } else {
+            crate::tenant::mgr::get_tenant(id, true)
+                .await
+                .ok()
+                .map(|tenant| (id, tenant))
+        }
+    });
+
+    let res = collect(tenants, cached_metrics, ctx).await;
+
+    tracing::info!(
+        elapsed_ms = started_at.elapsed().as_millis(),
+        total = res.len(),
+        "collected metrics"
+    );
+
+    res
+}
+
+async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<RawMetric>
+where
+    S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
+{
+    let mut current_metrics: Vec<RawMetric> = Vec::new();
+
+    let mut tenants = std::pin::pin!(tenants);
+
+    while let Some((tenant_id, tenant)) = tenants.next().await {
+        let mut tenant_resident_size = 0;
+
+        for timeline in tenant.list_timelines() {
+            let timeline_id = timeline.timeline_id;
+
+            match TimelineSnapshot::collect(&timeline, ctx) {
+                Ok(Some(snap)) => {
+                    snap.to_metrics(
+                        tenant_id,
+                        timeline_id,
+                        Utc::now(),
+                        &mut current_metrics,
+                        cache,
+                    );
+                }
+                Ok(None) => {}
+                Err(e) => {
+                    tracing::error!(
+                        "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
+                        timeline.timeline_id
+                    );
+                    continue;
+                }
+            }
+
+            tenant_resident_size += timeline.resident_physical_size();
+        }
+
+        let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
+        snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics);
+    }
+
+    current_metrics
+}
+
+/// In-between abstraction to allow testing metrics without actual Tenants.
+struct TenantSnapshot {
+    resident_size: u64,
+    remote_size: u64,
+    synthetic_size: u64,
+}
+
+impl TenantSnapshot {
+    /// Collect tenant status to have metrics created out of it.
+    ///
+    /// `resident_size` is calculated of the timelines we had access to for other metrics, so we
+    /// cannot just list timelines here.
+    fn collect(t: &Arc<crate::tenant::Tenant>, resident_size: u64) -> Self {
+        TenantSnapshot {
+            resident_size,
+            remote_size: t.remote_size(),
+            // Note that this metric is calculated in a separate bgworker
+            // Here we only use cached value, which may lag behind the real latest one
+            synthetic_size: t.cached_synthetic_size(),
+        }
+    }
+
+    fn to_metrics(
+        &self,
+        tenant_id: TenantId,
+        now: DateTime<Utc>,
+        cached: &Cache,
+        metrics: &mut Vec<RawMetric>,
+    ) {
+        let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size);
+
+        let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size);
+
+        let synthetic_size = {
+            let factory = MetricsKey::synthetic_size(tenant_id);
+            let mut synthetic_size = self.synthetic_size;
+
+            if synthetic_size == 0 {
+                if let Some((_, value)) = cached.get(factory.key()) {
+                    // use the latest value from previous session
+                    synthetic_size = *value;
+                }
+            }
+
+            if synthetic_size != 0 {
+                // only send non-zeroes because otherwise these show up as errors in logs
+                Some(factory.at(now, synthetic_size))
+            } else {
+                None
+            }
+        };
+
+        metrics.extend(
+            [Some(remote_size), Some(resident_size), synthetic_size]
+                .into_iter()
+                .flatten(),
+        );
+    }
+}
+
+/// Internal type to make timeline metric production testable.
+///
+/// As this value type contains all of the information needed from a timeline to produce the
+/// metrics, it can easily be created with different values in test.
+struct TimelineSnapshot {
+    loaded_at: (Lsn, SystemTime),
+    last_record_lsn: Lsn,
+    current_exact_logical_size: Option<u64>,
+}
+
+impl TimelineSnapshot {
+    /// Collect the metrics from an actual timeline.
+    ///
+    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    fn collect(
+        t: &Arc<crate::tenant::Timeline>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Option<Self>> {
+        if !t.is_active() {
+            // no collection for broken or stopping needed, we will still keep the cached values
+            // though at the caller.
+            Ok(None)
+        } else {
+            let loaded_at = t.loaded_at;
+            let last_record_lsn = t.get_last_record_lsn();
+
+            let current_exact_logical_size = {
+                let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
+                let res = span
+                    .in_scope(|| t.get_current_logical_size(ctx))
+                    .context("get_current_logical_size");
+                match res? {
+                    // Only send timeline logical size when it is fully calculated.
+                    (size, is_exact) if is_exact => Some(size),
+                    (_, _) => None,
+                }
+            };
+
+            Ok(Some(TimelineSnapshot {
+                loaded_at,
+                last_record_lsn,
+                current_exact_logical_size,
+            }))
+        }
+    }
+
+    /// Produce the timeline consumption metrics into the `metrics` argument.
+    fn to_metrics(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        now: DateTime<Utc>,
+        metrics: &mut Vec<RawMetric>,
+        cache: &Cache,
+    ) {
+        let timeline_written_size = u64::from(self.last_record_lsn);
+
+        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
+
+        let last_stop_time = cache
+            .get(written_size_delta_key.key())
+            .map(|(until, _val)| {
+                until
+                    .incremental_timerange()
+                    .expect("never create EventType::Absolute for written_size_delta")
+                    .end
+            });
+
+        let (key, written_size_now) =
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
+
+        // by default, use the last sent written_size as the basis for
+        // calculating the delta. if we don't yet have one, use the load time value.
+        let prev = cache
+            .get(&key)
+            .map(|(prev_at, prev)| {
+                // use the prev time from our last incremental update, or default to latest
+                // absolute update on the first round.
+                let prev_at = prev_at
+                    .absolute_time()
+                    .expect("never create EventType::Incremental for written_size");
+                let prev_at = last_stop_time.unwrap_or(prev_at);
+                (*prev_at, *prev)
+            })
+            .unwrap_or_else(|| {
+                // if we don't have a previous point of comparison, compare to the load time
+                // lsn.
+                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
+                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
+            });
+
+        let up_to = now;
+
+        if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
+            let key_value = written_size_delta_key.from_until(prev.0, up_to, delta);
+            // written_size_delta
+            metrics.push(key_value);
+            // written_size
+            metrics.push((key, written_size_now));
+        } else {
+            // the cached value was ahead of us, report zero until we've caught up
+            metrics.push(written_size_delta_key.from_until(prev.0, up_to, 0));
+            // the cached value was ahead of us, report the same until we've caught up
+            metrics.push((key, (written_size_now.0, prev.1)));
+        }
+
+        {
+            let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
+            let current_or_previous = self
+                .current_exact_logical_size
+                .or_else(|| cache.get(factory.key()).map(|(_, val)| *val));
+
+            if let Some(size) = current_or_previous {
+                metrics.push(factory.at(now, size));
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests;
+
+#[cfg(test)]
+pub(crate) use tests::metric_examples;
--- a/pageserver/src/consumption_metrics/metrics/tests.rs
+++ b/pageserver/src/consumption_metrics/metrics/tests.rs
@@ -0,0 +1,297 @@
+use super::*;
+use std::collections::HashMap;
+use std::time::SystemTime;
+use utils::lsn::Lsn;
+
+#[test]
+fn startup_collected_timeline_metrics_before_advancing() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let mut metrics = Vec::new();
+    let cache = HashMap::new();
+
+    let initdb_lsn = Lsn(0x10000);
+    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (disk_consistent_lsn, SystemTime::now()),
+        last_record_lsn: disk_consistent_lsn,
+        current_exact_logical_size: Some(0x42000),
+    };
+
+    let now = DateTime::<Utc>::from(SystemTime::now());
+
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+                snap.loaded_at.1.into(),
+                now,
+                0
+            ),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+        ]
+    );
+}
+
+#[test]
+fn startup_collected_timeline_metrics_second_round() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [now, before, init] = time_backwards();
+
+    let now = DateTime::<Utc>::from(now);
+    let before = DateTime::<Utc>::from(before);
+
+    let initdb_lsn = Lsn(0x10000);
+    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+    let mut metrics = Vec::new();
+    let cache = HashMap::from([
+        MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
+    ]);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (disk_consistent_lsn, init),
+        last_record_lsn: disk_consistent_lsn,
+        current_exact_logical_size: Some(0x42000),
+    };
+
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+        ]
+    );
+}
+
+#[test]
+fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [now, just_before, before, init] = time_backwards();
+
+    let now = DateTime::<Utc>::from(now);
+    let just_before = DateTime::<Utc>::from(just_before);
+    let before = DateTime::<Utc>::from(before);
+
+    let initdb_lsn = Lsn(0x10000);
+    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+    let mut metrics = Vec::new();
+    let cache = HashMap::from([
+        // at t=before was the last time the last_record_lsn changed
+        MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
+        // end time of this event is used for the next ones
+        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, just_before, 0),
+    ]);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (disk_consistent_lsn, init),
+        last_record_lsn: disk_consistent_lsn,
+        current_exact_logical_size: Some(0x42000),
+    };
+
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+        ]
+    );
+}
+
+#[test]
+fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
+    // it can happen that we lose the inmemorylayer but have previously sent metrics and we
+    // should never go backwards
+
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [later, now, at_restart] = time_backwards();
+
+    // FIXME: tests would be so much easier if we did not need to juggle back and forth
+    // SystemTime and DateTime::<Utc> ... Could do the conversion only at upload time?
+    let now = DateTime::<Utc>::from(now);
+    let later = DateTime::<Utc>::from(later);
+    let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
+    let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
+    let before_restart = DateTime::<Utc>::from(before_restart);
+    let way_before = DateTime::<Utc>::from(way_before);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (Lsn(50), at_restart),
+        last_record_lsn: Lsn(50),
+        current_exact_logical_size: None,
+    };
+
+    let mut cache = HashMap::from([
+        MetricsKey::written_size(tenant_id, timeline_id).at(before_restart, 100),
+        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+            way_before,
+            before_restart,
+            // not taken into account, but the timestamps are important
+            999_999_999,
+        ),
+    ]);
+
+    let mut metrics = Vec::new();
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+                before_restart,
+                now,
+                0
+            ),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
+        ]
+    );
+
+    // now if we cache these metrics, and re-run while "still in recovery"
+    cache.extend(metrics.drain(..));
+
+    // "still in recovery", because our snapshot did not change
+    snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
+            MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
+        ]
+    );
+}
+
+#[test]
+fn post_restart_current_exact_logical_size_uses_cached() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [now, at_restart] = time_backwards();
+
+    let now = DateTime::<Utc>::from(now);
+    let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
+    let before_restart = DateTime::<Utc>::from(before_restart);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (Lsn(50), at_restart),
+        last_record_lsn: Lsn(50),
+        current_exact_logical_size: None,
+    };
+
+    let cache = HashMap::from([
+        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(before_restart, 100)
+    ]);
+
+    let mut metrics = Vec::new();
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    metrics.retain(|(key, _)| key.metric == Name::LogicalSize);
+
+    assert_eq!(
+        metrics,
+        &[MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 100)]
+    );
+}
+
+#[test]
+fn post_restart_synthetic_size_uses_cached_if_available() {
+    let tenant_id = TenantId::generate();
+
+    let ts = TenantSnapshot {
+        resident_size: 1000,
+        remote_size: 1000,
+        // not yet calculated
+        synthetic_size: 0,
+    };
+
+    let now = SystemTime::now();
+    let before_restart = DateTime::<Utc>::from(now - std::time::Duration::from_secs(5 * 60));
+    let now = DateTime::<Utc>::from(now);
+
+    let cached = HashMap::from([MetricsKey::synthetic_size(tenant_id).at(before_restart, 1000)]);
+
+    let mut metrics = Vec::new();
+    ts.to_metrics(tenant_id, now, &cached, &mut metrics);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
+            MetricsKey::resident_size(tenant_id).at(now, 1000),
+            MetricsKey::synthetic_size(tenant_id).at(now, 1000),
+        ]
+    );
+}
+
+#[test]
+fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
+    let tenant_id = TenantId::generate();
+
+    let ts = TenantSnapshot {
+        resident_size: 1000,
+        remote_size: 1000,
+        // not yet calculated
+        synthetic_size: 0,
+    };
+
+    let now = SystemTime::now();
+    let now = DateTime::<Utc>::from(now);
+
+    let cached = HashMap::new();
+
+    let mut metrics = Vec::new();
+    ts.to_metrics(tenant_id, now, &cached, &mut metrics);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
+            MetricsKey::resident_size(tenant_id).at(now, 1000),
+            // no synthetic size here
+        ]
+    );
+}
+
+fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
+    let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
+    times[0] = std::time::SystemTime::now();
+    for behind in 1..N {
+        times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
+    }
+
+    times
+}
+
+pub(crate) const fn metric_examples(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    now: DateTime<Utc>,
+    before: DateTime<Utc>,
+) -> [RawMetric; 6] {
+    [
+        MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
+        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
+        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
+        MetricsKey::remote_storage_size(tenant_id).at(now, 0),
+        MetricsKey::resident_size(tenant_id).at(now, 0),
+        MetricsKey::synthetic_size(tenant_id).at(now, 1),
+    ]
+}
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -0,0 +1,443 @@
+use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
+use serde_with::serde_as;
+use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
+
+use super::{metrics::Name, Cache, MetricsKey, RawMetric};
+use utils::id::{TenantId, TimelineId};
+
+/// How the metrics from pageserver are identified.
+#[serde_with::serde_as]
+#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
+struct Ids {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub(super) tenant_id: TenantId,
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub(super) timeline_id: Option<TimelineId>,
+}
+
+#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
+pub(super) async fn upload_metrics(
+    client: &reqwest::Client,
+    metric_collection_endpoint: &reqwest::Url,
+    cancel: &CancellationToken,
+    node_id: &str,
+    metrics: &[RawMetric],
+    cached_metrics: &mut Cache,
+) -> anyhow::Result<()> {
+    let mut uploaded = 0;
+    let mut failed = 0;
+
+    let started_at = std::time::Instant::now();
+
+    let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id);
+
+    while let Some(res) = iter.next() {
+        let (chunk, body) = res?;
+
+        let event_bytes = body.len();
+
+        let is_last = iter.len() == 0;
+
+        let res = upload(client, metric_collection_endpoint, body, cancel, is_last)
+            .instrument(tracing::info_span!(
+                "upload",
+                %event_bytes,
+                uploaded,
+                total = metrics.len(),
+            ))
+            .await;
+
+        match res {
+            Ok(()) => {
+                for (curr_key, curr_val) in chunk {
+                    cached_metrics.insert(*curr_key, *curr_val);
+                }
+                uploaded += chunk.len();
+            }
+            Err(_) => {
+                // failure(s) have already been logged
+                //
+                // however this is an inconsistency: if we crash here, we will start with the
+                // values as uploaded. in practice, the rejections no longer happen.
+                failed += chunk.len();
+            }
+        }
+    }
+
+    let elapsed = started_at.elapsed();
+
+    tracing::info!(
+        uploaded,
+        failed,
+        elapsed_ms = elapsed.as_millis(),
+        "done sending metrics"
+    );
+
+    Ok(())
+}
+
+// The return type is quite ugly, but we gain testability in isolation
+fn serialize_in_chunks<'a, F>(
+    chunk_size: usize,
+    input: &'a [RawMetric],
+    factory: F,
+) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
+where
+    F: KeyGen<'a> + 'a,
+{
+    use bytes::BufMut;
+
+    struct Iter<'a, F> {
+        inner: std::slice::Chunks<'a, RawMetric>,
+        chunk_size: usize,
+
+        // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
+        buffer: bytes::BytesMut,
+        // chunk amount of events are reused to produce the serialized document
+        scratch: Vec<Event<Ids, Name>>,
+        factory: F,
+    }
+
+    impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> {
+        type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
+
+        fn next(&mut self) -> Option<Self::Item> {
+            let chunk = self.inner.next()?;
+
+            if self.scratch.is_empty() {
+                // first round: create events with N strings
+                self.scratch.extend(
+                    chunk
+                        .iter()
+                        .map(|raw_metric| raw_metric.as_event(&self.factory.generate())),
+                );
+            } else {
+                // next rounds: update_in_place to reuse allocations
+                assert_eq!(self.scratch.len(), self.chunk_size);
+                self.scratch
+                    .iter_mut()
+                    .zip(chunk.iter())
+                    .for_each(|(slot, raw_metric)| {
+                        raw_metric.update_in_place(slot, &self.factory.generate())
+                    });
+            }
+
+            let res = serde_json::to_writer(
+                (&mut self.buffer).writer(),
+                &EventChunk {
+                    events: (&self.scratch[..chunk.len()]).into(),
+                },
+            );
+
+            match res {
+                Ok(()) => Some(Ok((chunk, self.buffer.split().freeze()))),
+                Err(e) => Some(Err(e)),
+            }
+        }
+
+        fn size_hint(&self) -> (usize, Option<usize>) {
+            self.inner.size_hint()
+        }
+    }
+
+    impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {}
+
+    let buffer = bytes::BytesMut::new();
+    let inner = input.chunks(chunk_size);
+    let scratch = Vec::new();
+
+    Iter {
+        inner,
+        chunk_size,
+        buffer,
+        scratch,
+        factory,
+    }
+}
+
+trait RawMetricExt {
+    fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name>;
+    fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>);
+}
+
+impl RawMetricExt for RawMetric {
+    fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name> {
+        let MetricsKey {
+            metric,
+            tenant_id,
+            timeline_id,
+        } = self.0;
+
+        let (kind, value) = self.1;
+
+        Event {
+            kind,
+            metric,
+            idempotency_key: key.to_string(),
+            value,
+            extra: Ids {
+                tenant_id,
+                timeline_id,
+            },
+        }
+    }
+
+    fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>) {
+        use std::fmt::Write;
+
+        let MetricsKey {
+            metric,
+            tenant_id,
+            timeline_id,
+        } = self.0;
+
+        let (kind, value) = self.1;
+
+        *event = Event {
+            kind,
+            metric,
+            idempotency_key: {
+                event.idempotency_key.clear();
+                write!(event.idempotency_key, "{key}").unwrap();
+                std::mem::take(&mut event.idempotency_key)
+            },
+            value,
+            extra: Ids {
+                tenant_id,
+                timeline_id,
+            },
+        };
+    }
+}
+
+trait KeyGen<'a>: Copy {
+    fn generate(&self) -> IdempotencyKey<'a>;
+}
+
+impl<'a> KeyGen<'a> for &'a str {
+    fn generate(&self) -> IdempotencyKey<'a> {
+        IdempotencyKey::generate(self)
+    }
+}
+
+enum UploadError {
+    Rejected(reqwest::StatusCode),
+    Reqwest(reqwest::Error),
+    Cancelled,
+}
+
+impl std::fmt::Debug for UploadError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // use same impl because backoff::retry will log this using both
+        std::fmt::Display::fmt(self, f)
+    }
+}
+
+impl std::fmt::Display for UploadError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        use UploadError::*;
+
+        match self {
+            Rejected(code) => write!(f, "server rejected the metrics with {code}"),
+            Reqwest(e) => write!(f, "request failed: {e}"),
+            Cancelled => write!(f, "cancelled"),
+        }
+    }
+}
+
+impl UploadError {
+    fn is_reject(&self) -> bool {
+        matches!(self, UploadError::Rejected(_))
+    }
+}
+
+// this is consumed by the test verifiers
+static LAST_IN_BATCH: reqwest::header::HeaderName =
+    reqwest::header::HeaderName::from_static("pageserver-metrics-last-upload-in-batch");
+
+async fn upload(
+    client: &reqwest::Client,
+    metric_collection_endpoint: &reqwest::Url,
+    body: bytes::Bytes,
+    cancel: &CancellationToken,
+    is_last: bool,
+) -> Result<(), UploadError> {
+    let warn_after = 3;
+    let max_attempts = 10;
+    let res = utils::backoff::retry(
+        move || {
+            let body = body.clone();
+            async move {
+                let res = client
+                    .post(metric_collection_endpoint.clone())
+                    .header(reqwest::header::CONTENT_TYPE, "application/json")
+                    .header(
+                        LAST_IN_BATCH.clone(),
+                        if is_last { "true" } else { "false" },
+                    )
+                    .body(body)
+                    .send()
+                    .await;
+
+                let res = res.and_then(|res| res.error_for_status());
+
+                // 10 redirects are normally allowed, so we don't need worry about 3xx
+                match res {
+                    Ok(_response) => Ok(()),
+                    Err(e) => {
+                        let status = e.status().filter(|s| s.is_client_error());
+                        if let Some(status) = status {
+                            // rejection used to be a thing when the server could reject a
+                            // whole batch of metrics if one metric was bad.
+                            Err(UploadError::Rejected(status))
+                        } else {
+                            Err(UploadError::Reqwest(e))
+                        }
+                    }
+                }
+            }
+        },
+        UploadError::is_reject,
+        warn_after,
+        max_attempts,
+        "upload consumption_metrics",
+        utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled),
+    )
+    .await;
+
+    match &res {
+        Ok(_) => {}
+        Err(e) if e.is_reject() => {
+            // permanent errors currently do not get logged by backoff::retry
+            // display alternate has no effect, but keeping it here for easier pattern matching.
+            tracing::error!("failed to upload metrics: {e:#}");
+        }
+        Err(_) => {
+            // these have been logged already
+        }
+    }
+
+    res
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use chrono::{DateTime, Utc};
+    use once_cell::sync::Lazy;
+
+    #[test]
+    fn chunked_serialization() {
+        let examples = metric_samples();
+        assert!(examples.len() > 1);
+
+        let factory = FixedGen::new(Utc::now(), "1", 42);
+
+        // need to use Event here because serde_json::Value uses default hashmap, not linked
+        // hashmap
+        #[derive(serde::Deserialize)]
+        struct EventChunk {
+            events: Vec<Event<Ids, Name>>,
+        }
+
+        let correct = serialize_in_chunks(examples.len(), &examples, factory)
+            .map(|res| res.unwrap().1)
+            .flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
+            .collect::<Vec<_>>();
+
+        for chunk_size in 1..examples.len() {
+            let actual = serialize_in_chunks(chunk_size, &examples, factory)
+                .map(|res| res.unwrap().1)
+                .flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
+                .collect::<Vec<_>>();
+
+            // if these are equal, it means that multi-chunking version works as well
+            assert_eq!(correct, actual);
+        }
+    }
+
+    #[derive(Clone, Copy)]
+    struct FixedGen<'a>(chrono::DateTime<chrono::Utc>, &'a str, u16);
+
+    impl<'a> FixedGen<'a> {
+        fn new(now: chrono::DateTime<chrono::Utc>, node_id: &'a str, nonce: u16) -> Self {
+            FixedGen(now, node_id, nonce)
+        }
+    }
+
+    impl<'a> KeyGen<'a> for FixedGen<'a> {
+        fn generate(&self) -> IdempotencyKey<'a> {
+            IdempotencyKey::for_tests(self.0, self.1, self.2)
+        }
+    }
+
+    static SAMPLES_NOW: Lazy<DateTime<Utc>> = Lazy::new(|| {
+        DateTime::parse_from_rfc3339("2023-09-15T00:00:00.123456789Z")
+            .unwrap()
+            .into()
+    });
+
+    #[test]
+    fn metric_image_stability() {
+        // it is important that these strings stay as they are
+
+        let examples = [
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#,
+            ),
+        ];
+
+        let idempotency_key = consumption_metrics::IdempotencyKey::for_tests(*SAMPLES_NOW, "1", 0);
+        let examples = examples.into_iter().zip(metric_samples());
+
+        for ((line, expected), (key, (kind, value))) in examples {
+            let e = consumption_metrics::Event {
+                kind,
+                metric: key.metric,
+                idempotency_key: idempotency_key.to_string(),
+                value,
+                extra: Ids {
+                    tenant_id: key.tenant_id,
+                    timeline_id: key.timeline_id,
+                },
+            };
+            let actual = serde_json::to_string(&e).unwrap();
+            assert_eq!(expected, actual, "example for {kind:?} from line {line}");
+        }
+    }
+
+    fn metric_samples() -> [RawMetric; 6] {
+        let tenant_id = TenantId::from_array([0; 16]);
+        let timeline_id = TimelineId::from_array([0xff; 16]);
+
+        let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z")
+            .unwrap()
+            .into();
+        let [now, before] = [*SAMPLES_NOW, before];
+
+        super::super::metrics::metric_examples(tenant_id, timeline_id, now, before)
+    }
+}
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -94,6 +94,18 @@ pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
    access_stats_behavior: AccessStatsBehavior,
+    page_content_kind: PageContentKind,
+}
+
+/// The kind of access to the page cache.
+#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
+pub enum PageContentKind {
+    Unknown,
+    DeltaLayerBtreeNode,
+    DeltaLayerValue,
+    ImageLayerBtreeNode,
+    ImageLayerValue,
+    InMemoryLayer,
 }

 /// Desired behavior if the operation requires an on-demand download
@@ -137,6 +149,7 @@ impl RequestContextBuilder {
                task_kind,
                download_behavior: DownloadBehavior::Download,
                access_stats_behavior: AccessStatsBehavior::Update,
+                page_content_kind: PageContentKind::Unknown,
            },
        }
    }
@@ -149,6 +162,7 @@ impl RequestContextBuilder {
                task_kind: original.task_kind,
                download_behavior: original.download_behavior,
                access_stats_behavior: original.access_stats_behavior,
+                page_content_kind: original.page_content_kind,
            },
        }
    }
@@ -167,6 +181,11 @@ impl RequestContextBuilder {
        self
    }

+    pub(crate) fn page_content_kind(mut self, k: PageContentKind) -> Self {
+        self.inner.page_content_kind = k;
+        self
+    }
+
    pub fn build(self) -> RequestContext {
        self.inner
    }
@@ -263,4 +282,8 @@ impl RequestContext {
    pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
        self.access_stats_behavior
    }
+
+    pub(crate) fn page_content_kind(&self) -> PageContentKind {
+        self.page_content_kind
+    }
 }
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -0,0 +1,179 @@
+use std::collections::HashMap;
+
+use pageserver_api::control_api::{
+    ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+};
+use serde::{de::DeserializeOwned, Serialize};
+use tokio_util::sync::CancellationToken;
+use url::Url;
+use utils::{
+    backoff,
+    generation::Generation,
+    id::{NodeId, TenantId},
+};
+
+use crate::config::PageServerConf;
+
+/// The Pageserver's client for using the control plane API: this is a small subset
+/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
+pub struct ControlPlaneClient {
+    http_client: reqwest::Client,
+    base_url: Url,
+    node_id: NodeId,
+    cancel: CancellationToken,
+}
+
+/// Represent operations which internally retry on all errors other than
+/// cancellation token firing: the only way they can fail is ShuttingDown.
+pub enum RetryForeverError {
+    ShuttingDown,
+}
+
+#[async_trait::async_trait]
+pub trait ControlPlaneGenerationsApi {
+    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
+    async fn validate(
+        &self,
+        tenants: Vec<(TenantId, Generation)>,
+    ) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
+}
+
+impl ControlPlaneClient {
+    /// A None return value indicates that the input `conf` object does not have control
+    /// plane API enabled.
+    pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
+        let mut url = match conf.control_plane_api.as_ref() {
+            Some(u) => u.clone(),
+            None => return None,
+        };
+
+        if let Ok(mut segs) = url.path_segments_mut() {
+            // This ensures that `url` ends with a slash if it doesn't already.
+            // That way, we can subsequently use join() to safely attach extra path elements.
+            segs.pop_if_empty().push("");
+        }
+
+        let mut client = reqwest::ClientBuilder::new();
+
+        if let Some(jwt) = &conf.control_plane_api_token {
+            let mut headers = hyper::HeaderMap::new();
+            headers.insert("Authorization", jwt.get_contents().parse().unwrap());
+            client = client.default_headers(headers);
+        }
+
+        Some(Self {
+            http_client: client.build().expect("Failed to construct HTTP client"),
+            base_url: url,
+            node_id: conf.id,
+            cancel: cancel.clone(),
+        })
+    }
+
+    async fn retry_http_forever<R, T>(
+        &self,
+        url: &url::Url,
+        request: R,
+    ) -> Result<T, RetryForeverError>
+    where
+        R: Serialize,
+        T: DeserializeOwned,
+    {
+        #[derive(thiserror::Error, Debug)]
+        enum RemoteAttemptError {
+            #[error("shutdown")]
+            Shutdown,
+            #[error("remote: {0}")]
+            Remote(reqwest::Error),
+        }
+
+        match backoff::retry(
+            || async {
+                let response = self
+                    .http_client
+                    .post(url.clone())
+                    .json(&request)
+                    .send()
+                    .await
+                    .map_err(RemoteAttemptError::Remote)?;
+
+                response
+                    .error_for_status_ref()
+                    .map_err(RemoteAttemptError::Remote)?;
+                response
+                    .json::<T>()
+                    .await
+                    .map_err(RemoteAttemptError::Remote)
+            },
+            |_| false,
+            3,
+            u32::MAX,
+            "calling control plane generation validation API",
+            backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
+        )
+        .await
+        {
+            Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
+            Err(RemoteAttemptError::Remote(_)) => {
+                panic!("We retry forever, this should never be reached");
+            }
+            Ok(r) => Ok(r),
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl ControlPlaneGenerationsApi for ControlPlaneClient {
+    /// Block until we get a successful response, or error out if we are shut down
+    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
+        let re_attach_path = self
+            .base_url
+            .join("re-attach")
+            .expect("Failed to build re-attach path");
+        let request = ReAttachRequest {
+            node_id: self.node_id,
+        };
+
+        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
+        tracing::info!(
+            "Received re-attach response with {} tenants",
+            response.tenants.len()
+        );
+
+        Ok(response
+            .tenants
+            .into_iter()
+            .map(|t| (t.id, Generation::new(t.generation)))
+            .collect::<HashMap<_, _>>())
+    }
+
+    /// Block until we get a successful response, or error out if we are shut down
+    async fn validate(
+        &self,
+        tenants: Vec<(TenantId, Generation)>,
+    ) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+        let re_attach_path = self
+            .base_url
+            .join("validate")
+            .expect("Failed to build validate path");
+
+        let request = ValidateRequest {
+            tenants: tenants
+                .into_iter()
+                .map(|(id, gen)| ValidateRequestTenant {
+                    id,
+                    gen: gen
+                        .into()
+                        .expect("Generation should always be valid for a Tenant doing deletions"),
+                })
+                .collect(),
+        };
+
+        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
+
+        Ok(response
+            .tenants
+            .into_iter()
+            .map(|rt| (rt.id, rt.valid))
+            .collect())
+    }
+}
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -0,0 +1,156 @@
+//! The deleter is the final stage in the deletion queue.  It accumulates remote
+//! paths to delete, and periodically executes them in batches of up to 1000
+//! using the DeleteObjects request.
+//!
+//! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller
+//! number of full-sized DeleteObjects requests, rather than a larger number of
+//! smaller requests.
+
+use remote_storage::GenericRemoteStorage;
+use remote_storage::RemotePath;
+use remote_storage::MAX_KEYS_PER_DELETE;
+use std::time::Duration;
+use tokio_util::sync::CancellationToken;
+use tracing::info;
+use tracing::warn;
+
+use crate::metrics;
+
+use super::DeletionQueueError;
+use super::FlushOp;
+
+const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
+
+pub(super) enum DeleterMessage {
+    Delete(Vec<RemotePath>),
+    Flush(FlushOp),
+}
+
+/// Non-persistent deletion queue, for coalescing multiple object deletes into
+/// larger DeleteObjects requests.
+pub(super) struct Deleter {
+    // Accumulate up to 1000 keys for the next deletion operation
+    accumulator: Vec<RemotePath>,
+
+    rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
+
+    cancel: CancellationToken,
+    remote_storage: GenericRemoteStorage,
+}
+
+impl Deleter {
+    pub(super) fn new(
+        remote_storage: GenericRemoteStorage,
+        rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            remote_storage,
+            rx,
+            cancel,
+            accumulator: Vec::new(),
+        }
+    }
+
+    /// Wrap the remote `delete_objects` with a failpoint
+    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
+        fail::fail_point!("deletion-queue-before-execute", |_| {
+            info!("Skipping execution, failpoint set");
+            metrics::DELETION_QUEUE
+                .remote_errors
+                .with_label_values(&["failpoint"])
+                .inc();
+            Err(anyhow::anyhow!("failpoint hit"))
+        });
+
+        self.remote_storage.delete_objects(&self.accumulator).await
+    }
+
+    /// Block until everything in accumulator has been executed
+    async fn flush(&mut self) -> Result<(), DeletionQueueError> {
+        while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
+            match self.remote_delete().await {
+                Ok(()) => {
+                    // Note: we assume that the remote storage layer returns Ok(()) if some
+                    // or all of the deleted objects were already gone.
+                    metrics::DELETION_QUEUE
+                        .keys_executed
+                        .inc_by(self.accumulator.len() as u64);
+                    info!(
+                        "Executed deletion batch {}..{}",
+                        self.accumulator
+                            .first()
+                            .expect("accumulator should be non-empty"),
+                        self.accumulator
+                            .last()
+                            .expect("accumulator should be non-empty"),
+                    );
+                    self.accumulator.clear();
+                }
+                Err(e) => {
+                    warn!("DeleteObjects request failed: {e:#}, will retry");
+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["execute"])
+                        .inc();
+                }
+            };
+        }
+        if self.cancel.is_cancelled() {
+            // Expose an error because we may not have actually flushed everything
+            Err(DeletionQueueError::ShuttingDown)
+        } else {
+            Ok(())
+        }
+    }
+
+    pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
+        self.accumulator.reserve(MAX_KEYS_PER_DELETE);
+
+        loop {
+            if self.cancel.is_cancelled() {
+                return Err(DeletionQueueError::ShuttingDown);
+            }
+
+            let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
+                Ok(Some(m)) => m,
+                Ok(None) => {
+                    // All queue senders closed
+                    info!("Shutting down");
+                    return Err(DeletionQueueError::ShuttingDown);
+                }
+                Err(_) => {
+                    // Timeout, we hit deadline to execute whatever we have in hand.  These functions will
+                    // return immediately if no work is pending
+                    self.flush().await?;
+
+                    continue;
+                }
+            };
+
+            match msg {
+                DeleterMessage::Delete(mut list) => {
+                    while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
+                        if self.accumulator.len() == MAX_KEYS_PER_DELETE {
+                            self.flush().await?;
+                            // If we have received this number of keys, proceed with attempting to execute
+                            assert_eq!(self.accumulator.len(), 0);
+                        }
+
+                        let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
+                        let take_count = std::cmp::min(available_slots, list.len());
+                        for path in list.drain(list.len() - take_count..) {
+                            self.accumulator.push(path);
+                        }
+                    }
+                }
+                DeleterMessage::Flush(flush_op) => {
+                    // If flush() errors, we drop the flush_op and the caller will get
+                    // an error recv()'ing their oneshot channel.
+                    self.flush().await?;
+                    flush_op.notify();
+                }
+            }
+        }
+    }
+}
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -0,0 +1,487 @@
+//! The list writer is the first stage in the deletion queue.  It accumulates
+//! layers to delete, and periodically writes out these layers into a persistent
+//! DeletionList.
+//!
+//! The purpose of writing DeletionLists is to decouple the decision to
+//! delete an object from the validation required to execute it: even if
+//! validation is not possible, e.g. due to a control plane outage, we can
+//! still persist our intent to delete an object, in a way that would
+//! survive a restart.
+//!
+//! DeletionLists are passed onwards to the Validator.
+
+use super::DeletionHeader;
+use super::DeletionList;
+use super::FlushOp;
+use super::ValidatorQueueMessage;
+
+use std::collections::HashMap;
+use std::fs::create_dir_all;
+use std::time::Duration;
+
+use regex::Regex;
+use remote_storage::RemotePath;
+use tokio_util::sync::CancellationToken;
+use tracing::debug;
+use tracing::info;
+use tracing::warn;
+use utils::generation::Generation;
+use utils::id::TenantId;
+use utils::id::TimelineId;
+
+use crate::config::PageServerConf;
+use crate::deletion_queue::TEMP_SUFFIX;
+use crate::metrics;
+use crate::tenant::remote_timeline_client::remote_layer_path;
+use crate::tenant::storage_layer::LayerFileName;
+
+// The number of keys in a DeletionList before we will proactively persist it
+// (without reaching a flush deadline).  This aims to deliver objects of the order
+// of magnitude 1MB when we are under heavy delete load.
+const DELETION_LIST_TARGET_SIZE: usize = 16384;
+
+// Ordinarily, we only flush to DeletionList periodically, to bound the window during
+// which we might leak objects from not flushing a DeletionList after
+// the objects are already unlinked from timeline metadata.
+const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
+
+// If someone is waiting for a flush to DeletionList, only delay a little to accumulate
+// more objects before doing the flush.
+const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
+
+#[derive(Debug)]
+pub(super) struct DeletionOp {
+    pub(super) tenant_id: TenantId,
+    pub(super) timeline_id: TimelineId,
+    // `layers` and `objects` are both just lists of objects.  `layers` is used if you do not
+    // have a config object handy to project it to a remote key, and need the consuming worker
+    // to do it for you.
+    pub(super) layers: Vec<(LayerFileName, Generation)>,
+    pub(super) objects: Vec<RemotePath>,
+
+    /// The _current_ generation of the Tenant attachment in which we are enqueuing
+    /// this deletion.
+    pub(super) generation: Generation,
+}
+
+#[derive(Debug)]
+pub(super) struct RecoverOp {
+    pub(super) attached_tenants: HashMap<TenantId, Generation>,
+}
+
+#[derive(Debug)]
+pub(super) enum ListWriterQueueMessage {
+    Delete(DeletionOp),
+    // Wait until all prior deletions make it into a persistent DeletionList
+    Flush(FlushOp),
+    // Wait until all prior deletions have been executed (i.e. objects are actually deleted)
+    FlushExecute(FlushOp),
+    // Call once after re-attaching to control plane, to notify the deletion queue about
+    // latest attached generations & load any saved deletion lists from disk.
+    Recover(RecoverOp),
+}
+
+pub(super) struct ListWriter {
+    conf: &'static PageServerConf,
+
+    // Incoming frontend requests to delete some keys
+    rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+
+    // Outbound requests to the backend to execute deletion lists we have composed.
+    tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
+
+    // The list we are currently building, contains a buffer of keys to delete
+    // and our next sequence number
+    pending: DeletionList,
+
+    // These FlushOps should notify the next time we flush
+    pending_flushes: Vec<FlushOp>,
+
+    // Worker loop is torn down when this fires.
+    cancel: CancellationToken,
+
+    // Safety guard to do recovery exactly once
+    recovered: bool,
+}
+
+impl ListWriter {
+    // Initially DeletionHeader.validated_sequence is zero.  The place we start our
+    // sequence numbers must be higher than that.
+    const BASE_SEQUENCE: u64 = 1;
+
+    pub(super) fn new(
+        conf: &'static PageServerConf,
+        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            pending: DeletionList::new(Self::BASE_SEQUENCE),
+            conf,
+            rx,
+            tx,
+            pending_flushes: Vec::new(),
+            cancel,
+            recovered: false,
+        }
+    }
+
+    /// Try to flush `list` to persistent storage
+    ///
+    /// This does not return errors, because on failure to flush we do not lose
+    /// any state: flushing will be retried implicitly on the next deadline
+    async fn flush(&mut self) {
+        if self.pending.is_empty() {
+            for f in self.pending_flushes.drain(..) {
+                f.notify();
+            }
+            return;
+        }
+
+        match self.pending.save(self.conf).await {
+            Ok(_) => {
+                info!(sequence = self.pending.sequence, "Stored deletion list");
+
+                for f in self.pending_flushes.drain(..) {
+                    f.notify();
+                }
+
+                // Take the list we've accumulated, replace it with a fresh list for the next sequence
+                let next_list = DeletionList::new(self.pending.sequence + 1);
+                let list = std::mem::replace(&mut self.pending, next_list);
+
+                if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(list)).await {
+                    // This is allowed to fail: it will only happen if the backend worker is shut down,
+                    // so we can just drop this on the floor.
+                    info!("Deletion list dropped, this is normal during shutdown ({e:#})");
+                }
+            }
+            Err(e) => {
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                warn!(
+                    sequence = self.pending.sequence,
+                    "Failed to write deletion list, will retry later ({e:#})"
+                );
+            }
+        }
+    }
+
+    /// Load the header, to learn the sequence number up to which deletions
+    /// have been validated.  We will apply validated=true to DeletionLists
+    /// <= this sequence when loading them.
+    ///
+    /// It is not an error for the header to not exist: we return None, and
+    /// the caller should act as if validated_sequence is 0
+    async fn load_validated_sequence(&self) -> Result<Option<u64>, anyhow::Error> {
+        let header_path = self.conf.deletion_header_path();
+        match tokio::fs::read(&header_path).await {
+            Ok(header_bytes) => {
+                match serde_json::from_slice::<DeletionHeader>(&header_bytes) {
+                    Ok(h) => Ok(Some(h.validated_sequence)),
+                    Err(e) => {
+                        warn!(
+                            "Failed to deserialize deletion header, ignoring {}: {e:#}",
+                            header_path.display()
+                        );
+                        // This should never happen unless we make a mistake with our serialization.
+                        // Ignoring a deletion header is not consequential for correctnes because all deletions
+                        // are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        Ok(None)
+                    }
+                }
+            }
+            Err(e) => {
+                if e.kind() == std::io::ErrorKind::NotFound {
+                    debug!(
+                        "Deletion header {} not found, first start?",
+                        header_path.display()
+                    );
+                    Ok(None)
+                } else {
+                    Err(anyhow::anyhow!(e))
+                }
+            }
+        }
+    }
+
+    async fn recover(
+        &mut self,
+        attached_tenants: HashMap<TenantId, Generation>,
+    ) -> Result<(), anyhow::Error> {
+        debug!(
+            "recovering with {} attached tenants",
+            attached_tenants.len()
+        );
+
+        // Load the header
+        let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0);
+
+        self.pending.sequence = validated_sequence + 1;
+
+        let deletion_directory = self.conf.deletion_prefix();
+        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
+            Ok(d) => d,
+            Err(e) => {
+                warn!(
+                    "Failed to open deletion list directory {}: {e:#}",
+                    deletion_directory.display(),
+                );
+
+                // Give up: if we can't read the deletion list directory, we probably can't
+                // write lists into it later, so the queue won't work.
+                return Err(e.into());
+            }
+        };
+
+        let list_name_pattern =
+            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
+
+        let header_path = self.conf.deletion_header_path();
+        let mut seqs: Vec<u64> = Vec::new();
+        while let Some(dentry) = dir.next_entry().await? {
+            let file_name = dentry.file_name();
+            let dentry_str = file_name.to_string_lossy();
+
+            if Some(file_name.as_os_str()) == header_path.file_name() {
+                // Don't try and parse the header's name like a list
+                continue;
+            }
+
+            if dentry_str.ends_with(TEMP_SUFFIX) {
+                info!("Cleaning up temporary file {dentry_str}");
+                let absolute_path = deletion_directory.join(dentry.file_name());
+                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
+                    // Non-fatal error: we will just leave the file behind but not
+                    // try and load it.
+                    warn!(
+                        "Failed to clean up temporary file {}: {e:#}",
+                        absolute_path.display()
+                    );
+                }
+
+                continue;
+            }
+
+            let file_name = dentry.file_name().to_owned();
+            let basename = file_name.to_string_lossy();
+            let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
+                m.name("sequence")
+                    .expect("Non optional group should be present")
+                    .as_str()
+            } else {
+                warn!("Unexpected key in deletion queue: {basename}");
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                continue;
+            };
+
+            let seq: u64 = match u64::from_str_radix(seq_part, 16) {
+                Ok(s) => s,
+                Err(e) => {
+                    warn!("Malformed key '{basename}': {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                    continue;
+                }
+            };
+            seqs.push(seq);
+        }
+        seqs.sort();
+
+        // Start our next deletion list from after the last location validated by
+        // previous process lifetime, or after the last location found (it is updated
+        // below after enumerating the deletion lists)
+        self.pending.sequence = validated_sequence + 1;
+        if let Some(max_list_seq) = seqs.last() {
+            self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
+        }
+
+        for s in seqs {
+            let list_path = self.conf.deletion_list_path(s);
+
+            let list_bytes = tokio::fs::read(&list_path).await?;
+
+            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
+                Ok(l) => l,
+                Err(e) => {
+                    // Drop the list on the floor: any objects it referenced will be left behind
+                    // for scrubbing to clean up.  This should never happen unless we have a serialization bug.
+                    warn!(sequence = s, "Failed to deserialize deletion list: {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                    continue;
+                }
+            };
+
+            if deletion_list.sequence <= validated_sequence {
+                // If the deletion list falls below valid_seq, we may assume that it was
+                // already validated the last time this pageserver ran.  Otherwise, we still
+                // load it, as it may still contain content valid in this generation.
+                deletion_list.validated = true;
+            } else {
+                // Special case optimization: if a tenant is still attached, and no other
+                // generation was issued to another node in the interval while we restarted,
+                // then we may treat deletion lists from the previous generation as if they
+                // belong to our currently attached generation, and proceed to validate & execute.
+                for (tenant_id, tenant_list) in &mut deletion_list.tenants {
+                    if let Some(attached_gen) = attached_tenants.get(tenant_id) {
+                        if attached_gen.previous() == tenant_list.generation {
+                            tenant_list.generation = *attached_gen;
+                        }
+                    }
+                }
+            }
+
+            info!(
+                validated = deletion_list.validated,
+                sequence = deletion_list.sequence,
+                "Recovered deletion list"
+            );
+
+            // We will drop out of recovery if this fails: it indicates that we are shutting down
+            // or the backend has panicked
+            metrics::DELETION_QUEUE
+                .keys_submitted
+                .inc_by(deletion_list.len() as u64);
+            self.tx
+                .send(ValidatorQueueMessage::Delete(deletion_list))
+                .await?;
+        }
+
+        info!(next_sequence = self.pending.sequence, "Replay complete");
+
+        Ok(())
+    }
+
+    /// This is the front-end ingest, where we bundle up deletion requests into DeletionList
+    /// and write them out, for later validation by the backend and execution by the executor.
+    pub(super) async fn background(&mut self) {
+        info!("Started deletion frontend worker");
+
+        // Synchronous, but we only do it once per process lifetime so it's tolerable
+        if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
+            tracing::error!(
+                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
+                self.conf.deletion_prefix().display()
+            );
+            metrics::DELETION_QUEUE.unexpected_errors.inc();
+            return;
+        }
+
+        while !self.cancel.is_cancelled() {
+            let timeout = if self.pending_flushes.is_empty() {
+                FRONTEND_DEFAULT_TIMEOUT
+            } else {
+                FRONTEND_FLUSHING_TIMEOUT
+            };
+
+            let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
+                Ok(Some(msg)) => msg,
+                Ok(None) => {
+                    // Queue sender destroyed, shutting down
+                    break;
+                }
+                Err(_) => {
+                    // Hit deadline, flush.
+                    self.flush().await;
+                    continue;
+                }
+            };
+
+            match msg {
+                ListWriterQueueMessage::Delete(op) => {
+                    assert!(
+                        self.recovered,
+                        "Cannot process deletions before recovery.  This is a bug."
+                    );
+
+                    debug!(
+                        "Delete: ingesting {} layers, {} other objects",
+                        op.layers.len(),
+                        op.objects.len()
+                    );
+
+                    let mut layer_paths = Vec::new();
+                    for (layer, generation) in op.layers {
+                        layer_paths.push(remote_layer_path(
+                            &op.tenant_id,
+                            &op.timeline_id,
+                            &layer,
+                            generation,
+                        ));
+                    }
+                    layer_paths.extend(op.objects);
+
+                    if !self.pending.push(
+                        &op.tenant_id,
+                        &op.timeline_id,
+                        op.generation,
+                        &mut layer_paths,
+                    ) {
+                        self.flush().await;
+                        let retry_succeeded = self.pending.push(
+                            &op.tenant_id,
+                            &op.timeline_id,
+                            op.generation,
+                            &mut layer_paths,
+                        );
+                        if !retry_succeeded {
+                            // Unexpected: after we flush, we should have
+                            // drained self.pending, so a conflict on
+                            // generation numbers should be impossible.
+                            tracing::error!(
+                                "Failed to enqueue deletions, leaking objects.  This is a bug."
+                            );
+                            metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        }
+                    }
+                }
+                ListWriterQueueMessage::Flush(op) => {
+                    if self.pending.is_empty() {
+                        // Execute immediately
+                        debug!("Flush: No pending objects, flushing immediately");
+                        op.notify()
+                    } else {
+                        // Execute next time we flush
+                        debug!("Flush: adding to pending flush list for next deadline flush");
+                        self.pending_flushes.push(op);
+                    }
+                }
+                ListWriterQueueMessage::FlushExecute(op) => {
+                    debug!("FlushExecute: passing through to backend");
+                    // We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
+                    if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await {
+                        info!("Can't flush, shutting down ({e})");
+                        // Caller will get error when their oneshot sender was dropped.
+                    }
+                }
+                ListWriterQueueMessage::Recover(op) => {
+                    if self.recovered {
+                        tracing::error!(
+                            "Deletion queue recovery called more than once.  This is a bug."
+                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        // Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
+                        continue;
+                    }
+
+                    if let Err(e) = self.recover(op.attached_tenants).await {
+                        // This should only happen in truly unrecoverable cases, like the recovery finding that the backend
+                        // queue receiver has been dropped, or something is critically broken with
+                        // the local filesystem holding deletion lists.
+                        info!(
+                            "Deletion queue recover aborted, deletion queue will not proceed ({e})"
+                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        return;
+                    } else {
+                        self.recovered = true;
+                    }
+                }
+            }
+
+            if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
+                self.flush().await;
+            }
+        }
+        info!("Deletion queue shut down.");
+    }
+}
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -0,0 +1,414 @@
+//! The validator is responsible for validating DeletionLists for execution,
+//! based on whethe the generation in the DeletionList is still the latest
+//! generation for a tenant.
+//!
+//! The purpose of validation is to ensure split-brain safety in the cluster
+//! of pageservers: a deletion may only be executed if the tenant generation
+//! that originated it is still current.  See docs/rfcs/025-generation-numbers.md
+//! The purpose of accumulating lists before validating them is to reduce load
+//! on the control plane API by issuing fewer, larger requests.
+//!
+//! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn
+//! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn
+//! to decide when old
+//!
+//! Deletions are passed onward to the Deleter.
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use tokio_util::sync::CancellationToken;
+use tracing::debug;
+use tracing::info;
+use tracing::warn;
+
+use crate::config::PageServerConf;
+use crate::control_plane_client::ControlPlaneGenerationsApi;
+use crate::control_plane_client::RetryForeverError;
+use crate::metrics;
+
+use super::deleter::DeleterMessage;
+use super::DeletionHeader;
+use super::DeletionList;
+use super::DeletionQueueError;
+use super::FlushOp;
+use super::VisibleLsnUpdates;
+
+// After this length of time, do any validation work that is pending,
+// even if we haven't accumulated many keys to delete.
+//
+// This also causes updates to remote_consistent_lsn to be validated, even
+// if there were no deletions enqueued.
+const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
+
+// If we have received this number of keys, proceed with attempting to execute
+const AUTOFLUSH_KEY_COUNT: usize = 16384;
+
+#[derive(Debug)]
+pub(super) enum ValidatorQueueMessage {
+    Delete(DeletionList),
+    Flush(FlushOp),
+}
+pub(super) struct Validator<C>
+where
+    C: ControlPlaneGenerationsApi,
+{
+    conf: &'static PageServerConf,
+    rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
+    tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+
+    // Client for calling into control plane API for validation of deletes
+    control_plane_client: Option<C>,
+
+    // DeletionLists which are waiting generation validation.  Not safe to
+    // execute until [`validate`] has processed them.
+    pending_lists: Vec<DeletionList>,
+
+    // DeletionLists which have passed validation and are ready to execute.
+    validated_lists: Vec<DeletionList>,
+
+    // Sum of all the lengths of lists in pending_lists
+    pending_key_count: usize,
+
+    // Lsn validation state: we read projected LSNs and write back visible LSNs
+    // after validation.  This is the LSN equivalent of `pending_validation_lists`:
+    // it is drained in [`validate`]
+    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
+
+    // If we failed to rewrite a deletion list due to local filesystem I/O failure,
+    // we must remember that and refuse to advance our persistent validated sequence
+    // number past the failure.
+    list_write_failed: Option<u64>,
+
+    cancel: CancellationToken,
+}
+
+impl<C> Validator<C>
+where
+    C: ControlPlaneGenerationsApi,
+{
+    pub(super) fn new(
+        conf: &'static PageServerConf,
+        rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
+        tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+        control_plane_client: Option<C>,
+        lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            conf,
+            rx,
+            tx,
+            control_plane_client,
+            lsn_table,
+            pending_lists: Vec::new(),
+            validated_lists: Vec::new(),
+            pending_key_count: 0,
+            list_write_failed: None,
+            cancel,
+        }
+    }
+    /// Process any outstanding validations of generations of pending LSN updates or pending
+    /// DeletionLists.
+    ///
+    /// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists
+    /// go into the queue of ready-to-execute lists.
+    async fn validate(&mut self) -> Result<(), DeletionQueueError> {
+        let mut tenant_generations = HashMap::new();
+        for list in &self.pending_lists {
+            for (tenant_id, tenant_list) in &list.tenants {
+                // Note: DeletionLists are in logical time order, so generation always
+                // goes up.  By doing a simple insert() we will always end up with
+                // the latest generation seen for a tenant.
+                tenant_generations.insert(*tenant_id, tenant_list.generation);
+            }
+        }
+
+        let pending_lsn_updates = {
+            let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned");
+            std::mem::take(&mut *lsn_table)
+        };
+        for (tenant_id, update) in &pending_lsn_updates.tenants {
+            let entry = tenant_generations
+                .entry(*tenant_id)
+                .or_insert(update.generation);
+            if update.generation > *entry {
+                *entry = update.generation;
+            }
+        }
+
+        if tenant_generations.is_empty() {
+            // No work to do
+            return Ok(());
+        }
+
+        let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
+            match control_plane_client
+                .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
+                .await
+            {
+                Ok(tenants) => tenants,
+                Err(RetryForeverError::ShuttingDown) => {
+                    // The only way a validation call returns an error is when the cancellation token fires
+                    return Err(DeletionQueueError::ShuttingDown);
+                }
+            }
+        } else {
+            // Control plane API disabled.  In legacy mode we consider everything valid.
+            tenant_generations.keys().map(|k| (*k, true)).collect()
+        };
+
+        let mut validated_sequence: Option<u64> = None;
+
+        // Apply the validation results to the pending LSN updates
+        for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants {
+            let validated_generation = tenant_generations
+                .get(&tenant_id)
+                .expect("Map was built from the same keys we're reading");
+
+            let valid = tenants_valid
+                .get(&tenant_id)
+                .copied()
+                // If the tenant was missing from the validation response, it has been deleted.
+                // The Timeline that requested the LSN update is probably already torn down,
+                // or will be torn down soon.  In this case, drop the update by setting valid=false.
+                .unwrap_or(false);
+
+            if valid && *validated_generation == tenant_lsn_state.generation {
+                for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
+                    pending_lsn.result_slot.store(pending_lsn.projected);
+                }
+            } else {
+                // If we failed validation, then do not apply any of the projected updates
+                warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
+                metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
+            }
+        }
+
+        // Apply the validation results to the pending deletion lists
+        for list in &mut self.pending_lists {
+            // Filter the list based on whether the server responded valid: true.
+            // If a tenant is omitted in the response, it has been deleted, and we should
+            // proceed with deletion.
+            let mut mutated = false;
+            list.tenants.retain(|tenant_id, tenant| {
+                let validated_generation = tenant_generations
+                    .get(tenant_id)
+                    .expect("Map was built from the same keys we're reading");
+
+                // If the tenant was missing from the validation response, it has been deleted.
+                // This means that a deletion is valid, but also redundant since the tenant's
+                // objects should have already been deleted.  Treat it as invalid to drop the
+                // redundant deletion.
+                let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false);
+
+                // A list is valid if it comes from the current _or previous_ generation.
+                // - The previous generation case is permitted due to how we store deletion lists locally:
+                // if we see the immediately previous generation in a locally stored deletion list,
+                // it proves that this node's disk was used for both current & previous generations,
+                // and therefore no other node was involved in between: the two generations may be
+                // logically treated as the same.
+                // - In that previous generation case, we rewrote it to the current generation
+                // in recover(), so the comparison here is simply an equality.
+
+                let this_list_valid = valid
+                    && (tenant.generation == *validated_generation);
+
+                if !this_list_valid {
+                    warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
+                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
+                    mutated = true;
+                }
+                this_list_valid
+            });
+            list.validated = true;
+
+            if mutated {
+                // Save the deletion list if we had to make changes due to stale generations.  The
+                // saved list is valid for execution.
+                if let Err(e) = list.save(self.conf).await {
+                    // Highly unexpected.  Could happen if e.g. disk full.
+                    // If we didn't save the trimmed list, it is _not_ valid to execute.
+                    warn!("Failed to save modified deletion list {list}: {e:#}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+
+                    // Rather than have a complex retry process, just drop it and leak the objects,
+                    // scrubber will clean up eventually.
+                    list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution.
+
+                    // We must remember this failure, to prevent later writing out a header that
+                    // would imply the unwritable list was valid on disk.
+                    if self.list_write_failed.is_none() {
+                        self.list_write_failed = Some(list.sequence);
+                    }
+                }
+            }
+
+            validated_sequence = Some(list.sequence);
+        }
+
+        if let Some(validated_sequence) = validated_sequence {
+            if let Some(list_write_failed) = self.list_write_failed {
+                // Rare error case: we failed to write out a deletion list to excise invalid
+                // entries, so we cannot advance the header's valid sequence number past that point.
+                //
+                // In this state we will continue to validate, execute and delete deletion lists,
+                // we just cannot update the header.  It should be noticed and fixed by a human due to
+                // the nonzero value of our unexpected_errors metric.
+                warn!(
+                    sequence_number = list_write_failed,
+                    "Cannot write header because writing a deletion list failed earlier",
+                );
+            } else {
+                // Write the queue header to record how far validation progressed.  This avoids having
+                // to rewrite each DeletionList to set validated=true in it.
+                let header = DeletionHeader::new(validated_sequence);
+
+                // Drop result because the validated_sequence is an optimization.  If we fail to save it,
+                // then restart, we will drop some deletion lists, creating work for scrubber.
+                // The save() function logs a warning on error.
+                if let Err(e) = header.save(self.conf).await {
+                    warn!("Failed to write deletion queue header: {e:#}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                }
+            }
+        }
+
+        // Transfer the validated lists to the validated queue, for eventual execution
+        self.validated_lists.append(&mut self.pending_lists);
+
+        Ok(())
+    }
+
+    async fn cleanup_lists(&mut self, list_paths: Vec<PathBuf>) {
+        for list_path in list_paths {
+            debug!("Removing deletion list {}", list_path.display());
+
+            if let Err(e) = tokio::fs::remove_file(&list_path).await {
+                // Unexpected: we should have permissions and nothing else should
+                // be touching these files.  We will leave the file behind.  Subsequent
+                // pageservers will try and load it again: hopefully whatever storage
+                // issue (probably permissions) has been fixed by then.
+                tracing::error!("Failed to delete {}: {e:#}", list_path.display());
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                break;
+            }
+        }
+    }
+
+    async fn flush(&mut self) -> Result<(), DeletionQueueError> {
+        tracing::debug!("Flushing with {} pending lists", self.pending_lists.len());
+
+        // Issue any required generation validation calls to the control plane
+        self.validate().await?;
+
+        // After successful validation, nothing is pending: any lists that
+        // made it through validation will be in validated_lists.
+        assert!(self.pending_lists.is_empty());
+        self.pending_key_count = 0;
+
+        tracing::debug!(
+            "Validation complete, have {} validated lists",
+            self.validated_lists.len()
+        );
+
+        // Return quickly if we have no validated lists to execute.  This avoids flushing the
+        // executor when an idle backend hits its autoflush interval
+        if self.validated_lists.is_empty() {
+            return Ok(());
+        }
+
+        // Drain `validated_lists` into the executor
+        let mut executing_lists = Vec::new();
+        for list in self.validated_lists.drain(..) {
+            let list_path = self.conf.deletion_list_path(list.sequence);
+            let objects = list.into_remote_paths();
+            self.tx
+                .send(DeleterMessage::Delete(objects))
+                .await
+                .map_err(|_| DeletionQueueError::ShuttingDown)?;
+            executing_lists.push(list_path);
+        }
+
+        self.flush_executor().await?;
+
+        // Erase the deletion lists whose keys have all be deleted from remote storage
+        self.cleanup_lists(executing_lists).await;
+
+        Ok(())
+    }
+
+    async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> {
+        // Flush the executor, so that all the keys referenced by these deletion lists
+        // are actually removed from remote storage.  This is a precondition to deleting
+        // the deletion lists themselves.
+        let (flush_op, rx) = FlushOp::new();
+        self.tx
+            .send(DeleterMessage::Flush(flush_op))
+            .await
+            .map_err(|_| DeletionQueueError::ShuttingDown)?;
+
+        rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
+    }
+
+    pub(super) async fn background(&mut self) {
+        tracing::info!("Started deletion backend worker");
+
+        while !self.cancel.is_cancelled() {
+            let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
+                Ok(Some(m)) => m,
+                Ok(None) => {
+                    // All queue senders closed
+                    info!("Shutting down");
+                    break;
+                }
+                Err(_) => {
+                    // Timeout, we hit deadline to execute whatever we have in hand.  These functions will
+                    // return immediately if no work is pending.
+                    match self.flush().await {
+                        Ok(()) => {}
+                        Err(DeletionQueueError::ShuttingDown) => {
+                            // If we are shutting down, then auto-flush can safely be skipped
+                        }
+                    }
+
+                    continue;
+                }
+            };
+
+            match msg {
+                ValidatorQueueMessage::Delete(list) => {
+                    if list.validated {
+                        // A pre-validated list may only be seen during recovery, if we are recovering
+                        // a DeletionList whose on-disk state has validated=true
+                        self.validated_lists.push(list)
+                    } else {
+                        self.pending_key_count += list.len();
+                        self.pending_lists.push(list);
+                    }
+
+                    if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
+                        match self.flush().await {
+                            Ok(()) => {}
+                            Err(DeletionQueueError::ShuttingDown) => {
+                                // If we are shutting down, then auto-flush can safely be skipped
+                            }
+                        }
+                    }
+                }
+                ValidatorQueueMessage::Flush(op) => {
+                    match self.flush().await {
+                        Ok(()) => {
+                            op.notify();
+                        }
+                        Err(DeletionQueueError::ShuttingDown) => {
+                            // If we fail due to shutting down, we will just drop `op` to propagate that status.
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -570,7 +570,7 @@ async fn collect_eviction_candidates(
        tenant_candidates
            .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
        let mut cumsum: i128 = 0;
-        for (timeline, layer_info) in tenant_candidates {
+        for (timeline, layer_info) in tenant_candidates.into_iter() {
            let file_size = layer_info.file_size();
            let candidate = EvictionCandidate {
                timeline,
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -383,7 +383,6 @@ paths:
        schema:
          type: string
          format: hex
-
    post:
      description: |
        Schedules attach operation to happen in the background for the given tenant.
@@ -1020,6 +1019,9 @@ components:
      properties:
        config:
          $ref: '#/components/schemas/TenantConfig'
+        generation:
+          type: integer
+          description: Attachment generation number.
    TenantConfigRequest:
      allOf:
        - $ref: '#/components/schemas/TenantConfig'
@@ -1091,6 +1093,9 @@ components:
        remote_consistent_lsn:
          type: string
          format: hex
+        remote_consistent_lsn_visible:
+          type: string
+          format: hex
        ancestor_timeline_id:
          type: string
          format: hex
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -5,12 +5,14 @@ use std::collections::HashMap;
 use std::sync::Arc;

 use anyhow::{anyhow, Context, Result};
+use futures::TryFutureExt;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
-use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest};
+use pageserver_api::models::{
+    DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest,
+};
 use remote_storage::GenericRemoteStorage;
-use storage_broker::BrokerClientChannel;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -23,6 +25,7 @@ use super::models::{
    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
 };
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
@@ -32,11 +35,13 @@ use crate::tenant::mgr::{
 };
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
-use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
+use crate::tenant::timeline::Timeline;
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
    auth::JwtAuth,
+    generation::Generation,
    http::{
        endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
        error::{ApiError, HttpErrorBody},
@@ -51,22 +56,24 @@ use utils::{
 // Imports only used for testing APIs
 use super::models::ConfigureFailpointsRequest;

-struct State {
+pub struct State {
    conf: &'static PageServerConf,
    auth: Option<Arc<JwtAuth>>,
    allowlist_routes: Vec<Uri>,
    remote_storage: Option<GenericRemoteStorage>,
    broker_client: storage_broker::BrokerClientChannel,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
+    deletion_queue_client: DeletionQueueClient,
 }

 impl State {
-    fn new(
+    pub fn new(
        conf: &'static PageServerConf,
        auth: Option<Arc<JwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
+        deletion_queue_client: DeletionQueueClient,
    ) -> anyhow::Result<Self> {
        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
            .iter()
@@ -79,8 +86,17 @@ impl State {
            remote_storage,
            broker_client,
            disk_usage_eviction_state,
+            deletion_queue_client,
        })
    }
+
+    fn tenant_resources(&self) -> TenantSharedResources {
+        TenantSharedResources {
+            broker_client: self.broker_client.clone(),
+            remote_storage: self.remote_storage.clone(),
+            deletion_queue_client: self.deletion_queue_client.clone(),
+        }
+    }
 }

 #[inline(always)]
@@ -280,7 +296,14 @@ async fn build_timeline_info_common(
    };
    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
-    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
+    let remote_consistent_lsn_projected = timeline
+        .get_remote_consistent_lsn_projected()
+        .unwrap_or(Lsn(0));
+    let remote_consistent_lsn_visible = timeline
+        .get_remote_consistent_lsn_visible()
+        .unwrap_or(Lsn(0));
+
+    let walreceiver_status = timeline.walreceiver_status();

    let info = TimelineInfo {
        tenant_id: timeline.tenant_id,
@@ -288,7 +311,8 @@ async fn build_timeline_info_common(
        ancestor_timeline_id,
        ancestor_lsn,
        disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
-        remote_consistent_lsn,
+        remote_consistent_lsn: remote_consistent_lsn_projected,
+        remote_consistent_lsn_visible,
        last_record_lsn,
        prev_record_lsn: Some(timeline.get_prev_record_lsn()),
        latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -302,6 +326,8 @@ async fn build_timeline_info_common(
        pg_version: timeline.pg_version,

        state,
+
+        walreceiver_status,
    };
    Ok(info)
 }
@@ -472,7 +498,7 @@ async fn tenant_attach_handler(
    check_permission(&request, Some(tenant_id))?;

    let maybe_body: Option<TenantAttachRequest> = json_request_or_empty_body(&mut request).await?;
-    let tenant_conf = match maybe_body {
+    let tenant_conf = match &maybe_body {
        Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?,
        None => TenantConfOpt::default(),
    };
@@ -483,23 +509,25 @@ async fn tenant_attach_handler(

    let state = get_state(&request);

-    if let Some(remote_storage) = &state.remote_storage {
-        mgr::attach_tenant(
-            state.conf,
-            tenant_id,
-            tenant_conf,
-            state.broker_client.clone(),
-            remote_storage.clone(),
-            &ctx,
-        )
-        .instrument(info_span!("tenant_attach", %tenant_id))
-        .await?;
-    } else {
+    let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
+
+    if state.remote_storage.is_none() {
        return Err(ApiError::BadRequest(anyhow!(
            "attach_tenant is not possible because pageserver was configured without remote storage"
        )));
    }

+    mgr::attach_tenant(
+        state.conf,
+        tenant_id,
+        generation,
+        tenant_conf,
+        state.tenant_resources(),
+        &ctx,
+    )
+    .instrument(info_span!("tenant_attach", %tenant_id))
+    .await?;
+
    json_response(StatusCode::ACCEPTED, ())
 }

@@ -538,7 +566,7 @@ async fn tenant_detach_handler(
 }

 async fn tenant_load_handler(
-    request: Request<Body>,
+    mut request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
@@ -546,12 +574,21 @@ async fn tenant_load_handler(

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

+    let maybe_body: Option<TenantLoadRequest> = json_request_or_empty_body(&mut request).await?;
+
    let state = get_state(&request);
+
+    // The /load request is only usable when control_plane_api is not set.  Once it is set, callers
+    // should always use /attach instead.
+    let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
+
    mgr::load_tenant(
        state.conf,
        tenant_id,
+        generation,
        state.broker_client.clone(),
        state.remote_storage.clone(),
+        state.deletion_queue_client.clone(),
        &ctx,
    )
    .instrument(info_span!("load", %tenant_id))
@@ -851,6 +888,21 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
    Ok(response)
 }

+/// Helper for requests that may take a generation, which is mandatory
+/// when control_plane_api is set, but otherwise defaults to Generation::none()
+fn get_request_generation(state: &State, req_gen: Option<u32>) -> Result<Generation, ApiError> {
+    if state.conf.control_plane_api.is_some() {
+        req_gen
+            .map(Generation::new)
+            .ok_or(ApiError::BadRequest(anyhow!(
+                "generation attribute missing"
+            )))
+    } else {
+        // Legacy mode: all tenants operate with no generation
+        Ok(Generation::none())
+    }
+}
+
 async fn tenant_create_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -867,16 +919,18 @@ async fn tenant_create_handler(
    let tenant_conf =
        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
    let state = get_state(&request);

+    let generation = get_request_generation(state, request_data.generation)?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
    let new_tenant = mgr::create_tenant(
        state.conf,
        tenant_conf,
        target_tenant_id,
-        state.broker_client.clone(),
-        state.remote_storage.clone(),
+        generation,
+        state.tenant_resources(),
        &ctx,
    )
    .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
@@ -1093,6 +1147,39 @@ async fn timeline_download_remote_layers_handler_get(
    json_response(StatusCode::OK, info)
 }

+async fn deletion_queue_flush(
+    r: Request<Body>,
+    cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&r);
+
+    if state.remote_storage.is_none() {
+        // Nothing to do if remote storage is disabled.
+        return json_response(StatusCode::OK, ());
+    }
+
+    let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
+
+    let flush = async {
+        if execute {
+            state.deletion_queue_client.flush_execute().await
+        } else {
+            state.deletion_queue_client.flush().await
+        }
+    }
+    // DeletionQueueError's only case is shutting down.
+    .map_err(|_| ApiError::ShuttingDown);
+
+    tokio::select! {
+        res = flush => {
+            res.map(|()| json_response(StatusCode::OK, ()))?
+        }
+        _ = cancel.cancelled() => {
+            Err(ApiError::ShuttingDown)
+        }
+    }
+}
+
 async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
@@ -1321,12 +1408,9 @@ where
 }

 pub fn make_router(
-    conf: &'static PageServerConf,
+    state: Arc<State>,
    launch_ts: &'static LaunchTimestamp,
    auth: Option<Arc<JwtAuth>>,
-    broker_client: BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
-    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
    let spec = include_bytes!("openapi_spec.yml");
    let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1350,16 +1434,7 @@ pub fn make_router(
    );

    Ok(router
-        .data(Arc::new(
-            State::new(
-                conf,
-                auth,
-                remote_storage,
-                broker_client,
-                disk_usage_eviction_state,
-            )
-            .context("Failed to initialize router state")?,
-        ))
+        .data(state)
        .get("/v1/status", |r| api_handler(r, status_handler))
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
@@ -1439,6 +1514,9 @@ pub fn make_router(
        .put("/v1/disk_usage_eviction/run", |r| {
            api_handler(r, disk_usage_eviction_run)
        })
+        .put("/v1/deletion_queue/flush", |r| {
+            api_handler(r, deletion_queue_flush)
+        })
        .put("/v1/tenant/:tenant_id/break", |r| {
            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
        })
--- a/Show More
+++ b/Show More